What were the top 10 trending titles for this time period? For each year?

1. Load data

In [272]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import matplotlib.pyplot as plt
from tabulate import tabulate
import seaborn as sns
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
import calmap
import os
import csv

In [273]:
def read_files_by_country(dir_name, country_code):
    """Load the category JSON and the trending-videos CSV for one country.

    The two files are expected inside ``dir_name`` under the names
    ``{country_code}_category_id.json`` and
    ``{country_code}_youtube_trending_data.csv``.

    Parameters
    ----------
    dir_name : str
        Directory that contains both files.
    country_code : str
        Prefix used in the file names (for example ``'RU'``).

    Returns
    -------
    tuple
        ``(json_content, videos)``: the parsed JSON document and the CSV
        loaded as a ``pandas.DataFrame``.

    Raises
    ------
    FileNotFoundError
        If either file is missing. The short message is still printed first,
        then the original exception is re-raised. Previously execution fell
        through to the ``return`` with unassigned locals and failed with an
        unrelated ``UnboundLocalError``.
    """
    json_file = os.path.join(dir_name, f"{country_code}_category_id.json")
    csv_file = os.path.join(dir_name, f"{country_code}_youtube_trending_data.csv")

    try:
        with open(json_file, 'r', encoding='utf-8') as json_data:
            json_content = json.load(json_data)

        # Note kept from the original author: the Japanese CSV was read with
        # encoding="latin" instead of the default.
        videos = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Files for {country_code} not found.")
        raise

    return json_content, videos

# Country whose files are loaded; the same code is written into the
# 'country' column of the exported tables further down.
country_code = 'RU'
data_dir = '/Users/{UserName}/Desktop/Data/archive2023'
json_content, videos = read_files_by_country(data_dir, country_code)

In [274]:
# Missing descriptions become empty strings so later text handling never sees NaN.
videos['description'] = videos['description'].map(
    lambda text: text if not pd.isnull(text) else ''
)

# Parse both timestamp columns into pandas datetimes.
for date_column in ('publishedAt', 'trending_date'):
    videos[date_column] = pd.to_datetime(videos[date_column])

# Discard rows in which either timestamp is missing.
videos.dropna(subset=['publishedAt', 'trending_date'], inplace=True)
# videos.head()

In [275]:

# Split the publish timestamp into separate calendar-date and clock-time columns.
published = videos['publishedAt'].dt
videos['publish_date'] = published.date
videos['publish_time'] = published.time

# Reduce trending_date to a plain date as well. This overwrites the datetime
# column, so the cell cannot be re-run without reloading `videos`.
videos['trending_date'] = videos['trending_date'].dt.date


In [276]:
# Build a lookup table (category id -> category title) from the category JSON.
json_df = pd.DataFrame(json_content['items'])
json_df['id'] = json_df['id'].astype('int64')  # ids are cast to int64 to match videos['categoryId']
json_df['category_title'] = [snippet['title'] for snippet in json_df['snippet']]

# Attach the category title to every video row, then drop the helper 'id' key.
# The merge uses pandas' default inner join, so rows whose categoryId is not
# present in the JSON are dropped.
category_lookup = json_df[['id', 'category_title']]
videos = (
    videos
    .merge(category_lookup, left_on='categoryId', right_on='id')
    .drop(columns=['id'])
)

# Number of rows per (video_id, category_title) pair, labelled 'Trending Days'.
trending_repeat = (
    videos.groupby(['video_id'])['category_title']
    .value_counts()
    .reset_index(name='Trending Days')
)

# Rows: trending-day count; columns: category; cells: number of videos.
repeat_all = trending_repeat.pivot_table(
    index='Trending Days',
    columns='category_title',
    values='video_id',
    aggfunc='count',
).fillna(0)

# Same table with every row expressed as a percentage of its row total.
row_totals = repeat_all.sum(axis=1)
repeat_all_percentage = repeat_all.divide(row_totals, axis=0) * 100
# repeat_all_percentage.head()

In [277]:
# Quick look at the first ten trending dates, converted back to datetimes.
pd.to_datetime(videos['trending_date'].head(10))

0   2020-08-12
1   2020-08-12
2   2020-08-12
3   2020-08-12
4   2020-08-12
5   2020-08-12
6   2020-08-12
7   2020-08-12
8   2020-08-12
9   2020-08-12
Name: trending_date, dtype: datetime64[ns]

Sorting by number of trending days:

In [278]:
# Count how many rows each video has in `videos`; stored as 'days_trending'.
sort_trending = (
    videos.groupby('video_id')
    .size()
    .reset_index(name='days_trending')
)

# Keep a single row per video (its first occurrence in `videos`) for the metadata.
video_noduplicates = videos.drop_duplicates(subset='video_id', keep='first')

# Attach the per-video count to that metadata row.
merge_videos = pd.merge(video_noduplicates, sort_trending, on='video_id')
# merge_videos.head()
# specify trending year



In [279]:

# Add the year of the retained trending_date and the country label.
merge_videos = merge_videos.assign(
    trending_year=pd.to_datetime(merge_videos['trending_date']).dt.year,
    country=country_code,
)

# Ten videos with the highest days_trending value.
ranked_by_days = merge_videos.sort_values(by='days_trending', ascending=False)
top_trending = ranked_by_days.head(10)
top_trending.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,...,thumbnail_link,comments_disabled,ratings_disabled,description,publish_date,publish_time,category_title,days_trending,trending_year,country
23637,b-lvHKc8_9k,ТАТУ: 20 лет спустя! Главная российская группа...,2021-06-08 07:00:08+00:00,UCvQXaJTjA3jRucTKN4CGiwg,Осторожно: Собчак,22,2021-06-08,собчак|ксения собчак|осторожно|интервью|осторо...,240037,15200,...,https://i.ytimg.com/vi/b-lvHKc8_9k/default.jpg,False,False,Каждый слушает музыку по-своему. А как слушаеш...,2021-06-08,07:00:08,People & Blogs,9,2021,RU
65285,lF3l475PchA,Михаил Галустян х Григорий Лепс | ЧТО БЫЛО ДАЛ...,2021-06-03 10:12:19+00:00,UCNqktdxgAADBj36dC7VGOgg,LABELCOM,23,2021-06-04,что было дальше|чбд|нурлан сабуров|сабуров|щер...,5804538,297179,...,https://i.ytimg.com/vi/lF3l475PchA/default.jpg,False,False,Откройте бесплатный счёт для бизнеса в Альфа-Б...,2021-06-03,10:12:19,Comedy,9,2021,RU
23491,K_6CTnzaQYo,СКАЖИ В ЛИЦО - MORGENSHTERN / ХЕЙТЕР ПЕРЕОБУЛСЯ,2021-05-31 11:00:04+00:00,UCe9pVmC9VWLON835WSevAiQ,ДЖАВИД,22,2021-06-01,эмоции|реакция|хейтер|пранк|шоу|морген|моргенш...,1524044,166800,...,https://i.ytimg.com/vi/K_6CTnzaQYo/default.jpg,False,False,Качественные отношения только с качественным а...,2021-05-31,11:00:04,People & Blogs,9,2021,RU
65327,nIkH6C3_CX8,[BadComedian] - ЗОЯ (Спасение рядового ИИСУСА),2021-06-08 13:45:20+00:00,UC6cqazSR6CnVMClY0bJI0Lg,BadComedian,23,2021-06-09,BadComedian|Евген|Bad|Comedian|Фильмы|Кино|Трэ...,1947534,316504,...,https://i.ytimg.com/vi/nIkH6C3_CX8/default.jpg,False,False,"#BadComedian обзор фильма ЗОЯ, который рассказ...",2021-06-08,13:45:20,Comedy,9,2021,RU
80356,HfGSrCanHCs,[Эксклюзив] Моргенштерн на сцене Премии МУЗ-ТВ...,2021-06-05 16:14:33+00:00,UC2eH4vw5JuVpzLDrCkxW2pQ,МУЗ-ТВ,10,2021-06-06,музтв|муз тв|премия муз тв 2021|муз-тв|премия ...,479897,13804,...,https://i.ytimg.com/vi/HfGSrCanHCs/default.jpg,False,False,Моргенштерн и Ксения Собчак обменялись любезно...,2021-06-05,16:14:33,Music,8,2021,RU


In [280]:
# Column layout of the exported table: country, year and day count first,
# followed by the remaining video metadata.
new_order = [
    'country', 'trending_year', 'days_trending', 'title', 'category_title',
    'video_id', 'publish_date', 'trending_date', 'channelTitle', 'categoryId',
    'publish_time', 'tags', 'view_count', 'likes', 'dislikes', 'comment_count',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'description',
]
top_trending = top_trending[new_order]

# Append this country's rows to the shared CSV. The header is written only
# when the file is new or empty; with header=True on every append, each run
# inserted another header line in the middle of the data.
top_trending_path = 'csv2023/2023top_trending.csv'
os.makedirs(os.path.dirname(top_trending_path), exist_ok=True)
write_header = (
    not os.path.isfile(top_trending_path)
    or os.path.getsize(top_trending_path) == 0
)
top_trending.to_csv(top_trending_path, mode='a', index=False, header=write_header)

# Compact view shown as the cell output (it must be the last expression to render).
top_trending_short = top_trending[['country', 'trending_year', 'title', 'channelTitle', 'days_trending']]
top_trending_short

Conclusion: When sorted by number of trending days, the top 10 trending videos are mostly from the 'Music' and 'Entertainment' categories.

Sorting by views:

In [281]:
# Ten videos with the highest view_count.
top_views = merge_videos.sort_values(by='view_count', ascending=False).head(10)

# Column layout of the exported table, with view_count moved next to country.
new_order_views = [
    'country', 'view_count', 'trending_year', 'title', 'category_title',
    'video_id', 'publish_date', 'trending_date', 'days_trending', 'channelTitle', 'categoryId',
    'publish_time', 'tags', 'likes', 'dislikes', 'comment_count',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'description',
]
top_views = top_views[new_order_views]

# Append to the shared CSV. The header is written only when the file is new
# or empty; with header=True on every append, each run inserted another
# header line in the middle of the data.
top_views_path = 'csv2023/2023top_views.csv'
os.makedirs(os.path.dirname(top_views_path), exist_ok=True)
write_header = (
    not os.path.isfile(top_views_path)
    or os.path.getsize(top_views_path) == 0
)
top_views.to_csv(top_views_path, mode='a', index=False, header=write_header)

Conclusion: When sorted by number of views, the top 10 trending videos are again mostly from the 'Music' and 'Entertainment' categories.

Analyse the trending videos by year:

In [282]:
# Column layout of the exported per-year tables.
new_order_views = [
    'country', 'view_count', 'trending_year', 'title', 'category_title',
    'video_id', 'publish_date', 'trending_date', 'days_trending', 'channelTitle', 'categoryId',
    'publish_time', 'tags', 'likes', 'dislikes', 'comment_count',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'description',
]

# For each year, take the ten videos with the highest view_count among those
# whose trending_year equals that year. One loop replaces four copy-pasted
# pipelines.
trending_years = (2020, 2021, 2022, 2023)
top_by_year = {
    year: (
        merge_videos[merge_videos['trending_year'] == year]
        .sort_values(by='view_count', ascending=False)
        .head(10)[new_order_views]
    )
    for year in trending_years
}

# Keep the original per-year variable names available.
top2020, top2021, top2022, top2023 = (top_by_year[year] for year in trending_years)

# Append each table to its CSV. The header is written only when the file is
# new or empty; with header=True on every append, each run inserted another
# header line in the middle of the data.
os.makedirs('csv2023', exist_ok=True)
for year, top_for_year in top_by_year.items():
    year_path = f'csv2023/top{year}.csv'
    write_header = not os.path.isfile(year_path) or os.path.getsize(year_path) == 0
    top_for_year.to_csv(year_path, mode='a', index=False, header=write_header)

What were the top 10 trending channels? (The more trending videos a channel has, the more popular it is.)

In [283]:
# Number of distinct trending videos per channel title, largest first.
videos_per_channel = merge_videos.groupby(['channelTitle']).size()
top_category = (
    videos_per_channel
    .reset_index(name='number of videos')
    .sort_values(by=['number of videos'], ascending=False)
)
top_category

Unnamed: 0,channelTitle,number of videos
7389,Телеканал Дождь,1163
6515,Матч ТВ,994
7041,Россия 24,879
5381,Анатолий Шарий,806
1346,EdisonPts,624
...,...,...
3956,Silence of Music,1
3958,Silva Hakobyan,1
3964,SiriusXM,1
3966,Sisters Family TV,1


In [284]:
# Collect, per channel, the distinct categories of its videos as one
# comma-separated string.
channel_category = merge_videos.groupby('channelTitle')['category_title'].unique().reset_index()
channel_category['category_title'] = channel_category['category_title'].str.join(', ')

# Top ten channels by video count, annotated with their categories and the
# country label. assign() builds a new frame rather than writing a column
# into the head(10) slice.
channel_order = ['country', 'channelTitle', 'number of videos', 'category_title']
result = (
    top_category
    .merge(channel_category, how='left', on='channelTitle')
    .head(10)
    .assign(country=country_code)
)[channel_order]

# Append to the shared CSV. The header is written only when the file is new
# or empty; with header=True on every append, each run inserted another
# header line in the middle of the data.
top_channel_path = 'csv2023/2023top_channel.csv'
os.makedirs(os.path.dirname(top_channel_path), exist_ok=True)
write_header = (
    not os.path.isfile(top_channel_path)
    or os.path.getsize(top_channel_path) == 0
)
result.to_csv(top_channel_path, mode='a', index=False, header=write_header)