What were the top 10 trending titles for this time period? For each year?

1. Load data

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
import calmap
import os
import csv

In [4]:
def read_files_by_country(dir_name, country_code, encoding=None):
    """Load the category JSON and the trending-video CSV for one country.

    Args:
        dir_name: Directory that contains the per-country data files.
        country_code: File-name prefix of the country, e.g. ``'RU'``.
        encoding: Optional text encoding passed to ``pd.read_csv``. ``None``
            (the default) keeps pandas' default behaviour; pass e.g.
            ``'latin'`` for CSV files that are not valid UTF-8.

    Returns:
        A ``(json_content, videos)`` tuple: the parsed content of
        ``{country_code}_category_id.json`` and a DataFrame read from
        ``{country_code}_youtube_trending_data.csv``.

    Raises:
        FileNotFoundError: If either of the two files does not exist.
    """
    json_file = os.path.join(dir_name, f"{country_code}_category_id.json")
    csv_file = os.path.join(dir_name, f"{country_code}_youtube_trending_data.csv")

    try:
        with open(json_file, 'r', encoding='utf-8') as json_data:
            json_content = json.load(json_data)
        videos = pd.read_csv(csv_file, encoding=encoding)
    except FileNotFoundError:
        # Keep the friendly message, but re-raise: falling through to the
        # return statement would fail with an UnboundLocalError that hides
        # the real cause.
        print(f"Files for {country_code} not found.")
        raise

    return json_content, videos

# Country to analyse; the code is also the file-name prefix of its data files.
country_code = 'RU'
json_content, videos = read_files_by_country(
    dir_name='/Users/{UserName}/Desktop/Data/archive1718',
    country_code=country_code,
)

In [5]:
# Replace missing descriptions with empty strings.
videos['description'] = videos['description'].fillna('')

# Split the raw 'publish_time' string into a date (characters 0-9) and a time
# of day (characters 11-18), and parse 'trending_date' (yy.dd.mm).
# NOTE: 'publish_time' is overwritten with datetimes here, so this cell cannot
# be re-run without reloading `videos` (the .str slices need the raw strings).
videos['publish_date'] = pd.to_datetime(videos['publish_time'].str[0:10])
videos['publish_time'] = pd.to_datetime(videos['publish_time'].str[11:19], format='%H:%M:%S')
videos['trending_date'] = pd.to_datetime(videos['trending_date'], format='%y.%d.%m')

# Build a category lookup table (id -> title) from the category JSON.
json_df = pd.DataFrame(json_content['items'])
# Cast the ids to int64 so they can be joined against 'category_id'
# (assumes 'category_id' was read as an integer column; confirm).
json_df['id'] = json_df['id'].astype('int64')
json_df['category_title'] = json_df['snippet'].map(lambda snippet: snippet['title'])

# Attach the category title to every video row and drop the helper 'id' column.
# NOTE: this is an inner join, so rows whose category_id has no entry in the
# JSON are removed from `videos`.
videos = (
    videos
    .merge(json_df[['id', 'category_title']], left_on='category_id', right_on='id')
    .drop(columns=['id'])
)

# Number of rows per (video, category) pair, i.e. how often each video appears.
trending_repeat = (
    videos.groupby(['video_id'])['category_title']
    .value_counts()
    .reset_index(name='Trending Days')
)

# For every 'Trending Days' value, count the videos in each category ...
repeat_all = trending_repeat.pivot_table(
    index='Trending Days',
    columns='category_title',
    values='video_id',
    aggfunc='count',
).fillna(0)

# ... and express each row as percentages that sum to 100.
repeat_all_percentage = repeat_all.divide(repeat_all.sum(axis=1), axis=0) * 100
# repeat_all_percentage.head()

Sorting by number of trending days:

In [6]:
# Count how many rows (trending-day records) each video has.
sort_trending = videos.groupby('video_id').size().reset_index(name='days_trending')

# Keep one row per video (its first occurrence) and attach the count.
# NOTE: per-video fields such as 'views' and 'trending_date' therefore come
# from that first row, not from the video's last or best trending day.
video_noduplicates = videos.drop_duplicates(subset='video_id', keep='first')
merge_videos = pd.merge(video_noduplicates, sort_trending, on='video_id')

# Year of the retained trending date, plus the country being analysed.
merge_videos['trending_year'] = merge_videos['trending_date'].dt.year
merge_videos['country'] = country_code

# The 10 videos with the highest 'days_trending' count.
top_trending = merge_videos.sort_values(by='days_trending', ascending=False).head(10)
top_trending.columns

Index(['video_id', 'trending_date', 'title', 'channel_title', 'category_id',
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'video_error_or_removed', 'description', 'publish_date',
       'category_title', 'days_trending', 'trending_year', 'country'],
      dtype='object')

In [7]:
# shift the columns
new_order = ['country', 'trending_year', 'days_trending', 'title', 'category_title', \
        'video_id', 'publish_date', 'trending_date', 'channel_title', 'category_id', \
       'publish_time', 'tags', 'views', 'likes', 'dislikes', 'comment_count',\
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',\
       'video_error_or_removed', 'description']
top_trending = top_trending[new_order]

top_trending_short = top_trending[['country', 'trending_year', 'title', 'channel_title', 'days_trending']]
top_trending_short
# write top_trending to a csv file in append format
top_trending.to_csv('top_trending.csv', mode='a', index=False, header=True)

Conclusion: When sorted by number of trending days, the top 10 trending videos are mostly from the 'Music' and 'Entertainment' categories.

Sorting by views:

In [8]:
# The 10 videos with the highest view count.
top_views = merge_videos.sort_values(by='views', ascending=False).head(10)

# Export column order: country and views first.
new_order_views = [
    'country', 'views', 'trending_year', 'title', 'category_title',
    'video_id', 'publish_date', 'trending_date', 'days_trending', 'channel_title', 'category_id',
    'publish_time', 'tags', 'likes', 'dislikes', 'comment_count',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'video_error_or_removed', 'description',
]
top_views = top_views[new_order_views]

# Append to the CSV. Write the header row only when the file is new or empty;
# otherwise every append would insert another header line into the data.
top_views_path = 'top_views.csv'
top_views_needs_header = (
    not os.path.isfile(top_views_path) or os.path.getsize(top_views_path) == 0
)
top_views.to_csv(top_views_path, mode='a', index=False, header=top_views_needs_header)

Conclusion: When sorted by number of views, the top 10 trending videos are also mostly from the 'Music' and 'Entertainment' categories.

Analyse the trending videos by year:

In [9]:
# Export column order: country and views first.
new_order_views = [
    'country', 'views', 'trending_year', 'title', 'category_title',
    'video_id', 'publish_date', 'trending_date', 'days_trending', 'channel_title', 'category_id',
    'publish_time', 'tags', 'likes', 'dislikes', 'comment_count',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'video_error_or_removed', 'description',
]

# Top 10 most-viewed videos for each trending year, in export column order.
top_by_year = {
    year: (
        merge_videos[merge_videos['trending_year'] == year]
        .sort_values(by='views', ascending=False)
        .head(10)[new_order_views]
    )
    for year in (2017, 2018)
}
top2017 = top_by_year[2017]
top2018 = top_by_year[2018]

# Append each year's table to its own CSV. Write the header row only when the
# file is new or empty; otherwise every append would insert another header line.
for year, top_for_year in top_by_year.items():
    year_path = f'top{year}.csv'
    year_needs_header = not os.path.isfile(year_path) or os.path.getsize(year_path) == 0
    top_for_year.to_csv(year_path, mode='a', index=False, header=year_needs_header)

What were the top 10 trending channels? (The more trending videos a channel has, the more popular it is.)

In [10]:
# calculate video numbers of each channel
top_category = merge_videos.groupby(['channel_title']).size().reset_index(name='number of videos').sort_values(by=['number of videos'], ascending=False)
top_category

Unnamed: 0,channel_title,number of videos
436,DHCãã¬ã,82
4486,é£ãããã§ãããã,79
1060,MAXé´æ¨ TV,64
4487,é£ãããé£¯,59
1676,TV Derana,59
...,...,...
1927,[Namikaze Rei]æ³¢é¢¨ é¶,1
1929,[ã¨ã³ã¼ãº]Echoes,1
1930,[ã´ãªã©ã¤ã] è¿«çã´ãªã©,1
1931,_ìë¼ì ìë ì¤í¸ë¦¬ë° ê³ì ,1


In [11]:
# Collect the distinct categories each channel appears in, as one
# comma-separated string per channel.
channel_category = merge_videos.groupby('channel_title')['category_title'].unique().reset_index()
channel_category['category_title'] = channel_category['category_title'].str.join(', ')

# Top 10 channels with their categories and the country being analysed.
# .assign() builds a new frame instead of writing into the .head(10) slice,
# which avoids pandas' SettingWithCopyWarning.
channel_order = ['country', 'channel_title', 'number of videos', 'category_title']
result = (
    top_category
    .merge(channel_category, how='left', on='channel_title')
    .head(10)
    .assign(country=country_code)[channel_order]
)

# Append to the CSV. Write the header row only when the file is new or empty;
# otherwise every append would insert another header line into the data.
top_channel_path = 'top_channel.csv'
top_channel_needs_header = (
    not os.path.isfile(top_channel_path) or os.path.getsize(top_channel_path) == 0
)
result.to_csv(top_channel_path, mode='a', index=False, header=top_channel_needs_header)