In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pprint
import itertools
import nltk
import chardet
import string
import matplotlib.pyplot as plt
from PIL import Image
from collections import Counter
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud, STOPWORDS

In [3]:
# import file
US = pd.read_csv('Dataset/US_youtube_trending_data.csv') 
GB = pd.read_csv('Dataset/GB_youtube_trending_data.csv')
CA = pd.read_csv('Dataset/CA_youtube_trending_data.csv')

US['country'] = 'US'
GB['country'] = 'GB'
CA['country'] = 'CA'
frames = [US, GB, CA]

#merge
df = pd.concat(frames).drop_duplicates()

In [4]:
# Drop unnecessary rows 
df.drop(['channelId', 'thumbnail_link', 'comments_disabled', 'ratings_disabled'], inplace=True, axis=1)

df.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,description,country
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...,US
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,"While running her own modding shop, Ramya Pare...",US
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,I left youtube for a month and this is what ha...,US
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11T16:38:55Z,XXL,10,2020-08-12T00:00:00Z,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,Subscribe to XXL → http://bit.ly/subscribe-xxl...,US
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11T15:10:05Z,Mr. Kate,26,2020-08-12T00:00:00Z,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45802,964,2196,Transforming The LaBrant Family's empty white ...,US


## Simple Data Cleaning

In [5]:
# Lowercase title and tags columns 
df['title'] = df['title'].str.lower()
df['tags'] = df['tags'].str.lower()
df['description'] = df['description'].str.lower()

In [6]:
# Splitting tag and title contents for easier parsing
df['title content'] = df['title'].str.split()
df['tag content'] = df['tags'].str.split("|")
df['description content'] = df['description'].str.split()

In [7]:
# Getting the total word count of video title (title length)
df['total count title'] = df['title'].str.split().str.len()

# Getting the total tag count of video tags (tag length)
df['total count tag'] = df['tags'].str.split("|").str.len()
df.head()

Unnamed: 0,video_id,title,publishedAt,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,description,country,title content,tag content,description content,total count title,total count tag
0,3C66w5Z0ixs,i asked her to be my girlfriend...,2020-08-11T19:20:14Z,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,subscribe to brawadis ▶ http://bit.ly/subscrib...,US,"[i, asked, her, to, be, my, girlfriend...]","[brawadis, prank, basketball, skits, ghost, fu...","[subscribe, to, brawadis, ▶, http://bit.ly/sub...",7,15
1,M9Pmf9AB4Mo,apex legends | stories from the outlands – “th...,2020-08-11T17:00:10Z,Apex Legends,20,2020-08-12T00:00:00Z,apex legends|apex legends characters|new apex ...,2381688,146739,2794,16549,"while running her own modding shop, ramya pare...",US,"[apex, legends, |, stories, from, the, outland...","[apex legends, apex legends characters, new ap...","[while, running, her, own, modding, shop,, ram...",10,25
2,J78aPJ3VyNs,i left youtube for a month and this is what ha...,2020-08-11T16:34:06Z,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,i left youtube for a month and this is what ha...,US,"[i, left, youtube, for, a, month, and, this, i...","[jacksepticeye, funny, funny meme, memes, jack...","[i, left, youtube, for, a, month, and, this, i...",11,30
3,kXLn3HkpjaA,xxl 2020 freshman class revealed - official an...,2020-08-11T16:38:55Z,XXL,10,2020-08-12T00:00:00Z,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,subscribe to xxl → http://bit.ly/subscribe-xxl...,US,"[xxl, 2020, freshman, class, revealed, -, offi...","[xxl freshman, xxl freshmen, 2020 xxl freshman...","[subscribe, to, xxl, →, http://bit.ly/subscrib...",8,23
4,VIUo6yapDbc,ultimate diy home movie theater for the labran...,2020-08-11T15:10:05Z,Mr. Kate,26,2020-08-12T00:00:00Z,the labrant family|diy|interior design|makeove...,1123889,45802,964,2196,transforming the labrant family's empty white ...,US,"[ultimate, diy, home, movie, theater, for, the...","[the labrant family, diy, interior design, mak...","[transforming, the, labrant, family's, empty, ...",9,33


#### Divide into 3 countries

In [8]:
df_us = df[df['country'] == 'US']
df_gb = df[df['country'] == 'GB']
df_ca = df[df['country'] == 'CA']

##### Published Times Analysis 

In [9]:
# Published Times Analysis 

# Remove the dates, mins, and seconds in 'publshedAt' column 
df_us['publishedAt'] = df_us['publishedAt'].str[10:]
df_us['publishedAt'] = df_us['publishedAt'].str[:3]

df_gb['publishedAt'] = df_gb['publishedAt'].str[10:]
df_gb['publishedAt'] = df_gb['publishedAt'].str[:3]

df_ca['publishedAt'] = df_ca['publishedAt'].str[10:]
df_ca['publishedAt'] = df_ca['publishedAt'].str[:3]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_us['publishedAt'] = df_us['publishedAt'].str[10:]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_us['publishedAt'] = df_us['publishedAt'].str[:3]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gb['publishedAt'] = df_gb['publishedAt'].str[10:]
A value is trying to be set on a copy of a slic

In [12]:
# Create a dataframe with published time counts
time_counts_us = df_us['publishedAt'].value_counts().to_dict()
time_counts_gb = df_gb['publishedAt'].value_counts().to_dict()
time_counts_ca = df_ca['publishedAt'].value_counts().to_dict()

## i.Do title, tags and description word count affect viewership count ?

 #### title  and tag count

In [13]:
# Getting the total word count of video title (title length)
df['total count title'] = df['title'].str.split().str.len()

# Getting the total tag count of video tags (tag length)
df['total count tag'] = df['tags'].str.split("|").str.len()

 #### tag word count

In [14]:
#Create total frequency count of individual tags 
df_us['tag content'].to_list()
us_tag_counts = dict(Counter(itertools.chain.from_iterable(df_us['tag content'].to_list())))

df_gb['tag content'].to_list()
gb_tag_counts = dict(Counter(itertools.chain.from_iterable(df_gb['tag content'].to_list())))

df_ca['tag content'].to_list()
ca_tag_counts = dict(Counter(itertools.chain.from_iterable(df_ca['tag content'].to_list())))

#Convert to dataframe and sort
us_tags = pd.DataFrame(list(us_tag_counts.items()),columns = ['tag','count']) 
us_tags = us_tags.sort_values(by='count', ascending=False)

gb_tags = pd.DataFrame(list(gb_tag_counts.items()),columns = ['tag','count']) 
gb_tags = gb_tags.sort_values(by='count', ascending=False)

ca_tags = pd.DataFrame(list(ca_tag_counts.items()),columns = ['tag','count']) 
ca_tags = ca_tags.sort_values(by='count', ascending=False)

## ii Do title, tags content affect viewership count?

### 1) What are the top N hot topics for each category of videos?

#### Top categories for each country

In [16]:
#df_gb category video count vs top views : 10: music, 24:Entertainment, 20: gaming
df_us_topcategory = pd.DataFrame(df_us.groupby('categoryId')['view_count'].sum()).sort_values(by = 'view_count',ascending=False).reset_index()
video_count = pd.DataFrame(df_us['categoryId'].value_counts()).reset_index().rename(columns={'index' : 'categoryId',
                                                                                             'categoryId':'video_count'})
df_us_topcategory = df_us_topcategory.merge(video_count, how = 'inner', on = 'categoryId')

In [17]:
#Create total frequency count of individual words in title 
us_title_list = df_us['title content'].to_list()
us_all_title_counts = dict(Counter(itertools.chain.from_iterable(df_us['title content'].to_list())))

gb_title_list = df_gb['title content'].to_list()
gb_all_title_counts = dict(Counter(itertools.chain.from_iterable(df_gb['title content'].to_list())))

ca_title_list = df_ca['title content'].to_list()
ca_all_title_counts = dict(Counter(itertools.chain.from_iterable(df_ca['title content'].to_list())))

#Convert to dataframe and sort
df_title_us = pd.DataFrame(list(us_all_title_counts.items()),columns = ['word','count']) 
df_title_us.sort_values(by='count', ascending=False)

df_title_gb = pd.DataFrame(list(gb_all_title_counts.items()),columns = ['word','count']) 
df_title_gb.sort_values(by='count', ascending=False)

df_title_ca = pd.DataFrame(list(ca_all_title_counts.items()),columns = ['word','count']) 
df_title_ca.sort_values(by='count', ascending=False)

Unnamed: 0,word,count
6,|,15140
78,-,12748
23,the,11946
30,to,5198
12,a,4999
...,...,...
13762,heo,1
13758,(sgp),1
13757,yew,1
13756,kean,1


### Extracting hot topics with NLTK
####  (https://www.analyticsvidhya.com/blog/2019/08/how-to-remove-stopwords-text-normalization-nltk-spacy-gensim-python/)

#### split categories

In [None]:
#split categories
df_us_10 = df_us[df_us['categoryId'] == 10]
df_us_24 = df_us[df_us['categoryId'] == 24]
df_us_20 =  df_us[df_us['categoryId'] == 20]
df_us_25 =  df_us[df_us['categoryId'] == 25]

#### Textual Analysis

In [None]:
#Extracting hot topics with NLTK
text = df_us_25['tags'].str.lower().replace('|', ' ').str.cat(sep=' ')


stop_words = set(stopwords.words('english')) 
  
word_tokens = word_tokenize(text) 
    
filtered_sentence = [] 
  
for w in word_tokens: 
    if w not in stop_words: 
        filtered_sentence.append(w) 
        
# Stemming with NLTK
Stem_words = []
ps =PorterStemmer()
for w in filtered_sentence:
    rootWord=ps.stem(w)
    Stem_words.append(rootWord)
    
# Lemmatization with NLTK
filtered_sentence = list(filter(lambda token: token not in string.punctuation, filtered_sentence))
filtered_sentence

# remove unnecessay words
stopwords = ["'s", "’", "..." , "ft." , "2" ,"x" , "1", "n't", "–", "3", "5", "4",
             "2021","2020","trailer", "de", "official", "season", "video", "official", "season", 
             "episode","la", "le", "je", "part", "je", "des","world","day", "10","e", "avec", "‘",
             "à", "music", "none", "new","lil", "like", "songs", "song","thee","love","bad","g", 
             "mix", "100","6", "news", "watch", "man", "live", "interview","first", "today","minutes"]

for word in list(filtered_sentence):  # iterating on a copy since removing will mess things up
    if word in stopwords:
        filtered_sentence.remove(word)


#### Word Cloud

In [None]:
#wordcloud
word_could_dict= Counter(filtered_sentence)

wordcloud = WordCloud(width = 1000, height = 500, background_color ='black',
                      stopwords = stopwords,
                      min_font_size = 10).generate_from_frequencies(word_could_dict)


plt.figure(figsize=(8,8))
plt.imshow(wordcloud)
plt.axis("off")
# plt.show()
plt.savefig('us25_tags_wordcloud.png', bbox_inches='tight')
plt.close()


#### Hot Topic words count

In [None]:
filtered_sentence = pd.DataFrame(filtered_sentence)
filtered_sentence_unique = pd.DataFrame(filtered_sentence.value_counts())
filtered_sentence_unique = filtered_sentence_unique.rename(columns={'0':'count'})
filtered_sentence_unique.head(50)

In [None]:
# Convert list to string
# using list comprehension
listToStr = ' '.join([str(elem) for elem in filtered_sentence_unique])
  
listToStr 

## iii. Does published time affect viewership count?

#### Published Times Analysis 

In [None]:
# Published Times Analysis 

# Remove the dates, mins, and seconds in 'publshedAt' column 
df_us['publishedAt'] = df_us['publishedAt'].str[10:]
df_us['publishedAt'] = df_us['publishedAt'].str[:4]

df_gb['publishedAt'] = df_gb['publishedAt'].str[10:]
df_gb['publishedAt'] = df_gb['publishedAt'].str[:4]

df_ca['publishedAt'] = df_ca['publishedAt'].str[10:]
df_ca['publishedAt'] = df_ca['publishedAt'].str[:4]

In [None]:
# Create a dataframe with published time counts
time_counts_us = df_us['publishedAt'].value_counts().to_dict()
time_counts_gb = df_gb['publishedAt'].value_counts().to_dict()
time_counts_ca = df_ca['publishedAt'].value_counts().to_dict()
time_counts_ca

## iv.  Does the genre of the video affect viewership count?

In [None]:
# Create a dataframe with categoryId counts
category_counts_us = df_us['categoryId'].value_counts().to_dict()
category_counts_gb = df_gb['categoryId'].value_counts().to_dict()
category_counts_ca = df_ca['categoryId'].value_counts().to_dict()
category_counts_ca