### Analysis of YouTube Archive (in Vietnamese)

This notebook runs basic analyses of the watch and search history data files extracted via the `01-youtube-api-scraper` notebook. 

Among the analyses it performs are:

#### Overall findings
- Totals
- Ads vs watched things
- searched terms vs watched vs other
- searched terms (list)
    
#### Specific to Google Ads
- repeat videos
- NLP, such as words most often used
- Time-related trends

#### Specific to non-ad views
- repeat videos
- NLP, such as words most often used
- Time-related trends


In [None]:
# —————— libraries built into Python ———————
import re
import datetime
import time

# —————— libraries that need to be installed, which you can do via pip ———————
import pandas as pd
from pyvi import ViTokenizer, ViPosTagger
import matplotlib.pyplot as plt
from deep_translator import (GoogleTranslator,
                             PonsTranslator,
                             LingueeTranslator,
                             MyMemoryTranslator,
                             YandexTranslator,
                             DeeplTranslator,
                             QcriTranslator,
                             single_detection,
                             batch_detection)

In [None]:
# Do not truncate cell contents when displaying DataFrames, so long video
# titles and links are shown in full.
pd.set_option('display.max_colwidth', None)

Load all data files:

In [None]:
# Read the watch-history export; "date_recorded" is parsed into datetimes on load.
watch_history = pd.read_csv("../output/watch_history.csv", parse_dates=["date_recorded"])
watch_history.head()

In [None]:
# Read the search-history export; "date_recorded" is parsed into datetimes on load.
search_history = pd.read_csv("../output/search_history.csv", parse_dates=["date_recorded"])
search_history.head()

In [None]:
watch_history.columns

## Analyses


#### First prepare the data:
- keep only records dated 2020 or later
- separate ads from views

#### Then produce analyses:

Overall findings:
- Totals
- Ads vs watched things
- searched terms vs watched vs other
- searched terms (list)
    
Specific to Google Ads:
- repeat videos
- NLP, such as words most often used
- Time-related trends

Specific to non-ad views:
- repeat videos
- NLP, such as words most often used
- Time-related trends




In [None]:
# Step 1: keep only the records whose "date_recorded" falls in 2020 or later.
watch_recorded_from_2020 = watch_history["date_recorded"].apply(lambda ts: ts.year) >= 2020
search_recorded_from_2020 = search_history["date_recorded"].apply(lambda ts: ts.year) >= 2020
watch_history_sm = watch_history[watch_recorded_from_2020]
search_history_sm = search_history[search_recorded_from_2020]

# Step 2: split each history into ad rows and everything else.
# The label begins with an EM SPACE (U+2003), so it is written once here
# rather than retyped for every comparison.
ads_label = "\u2003From Google Ads"
watch_is_ad = watch_history_sm["meta_data_details"] == ads_label
watch_is_not_ad = watch_history_sm["meta_data_details"] != ads_label
search_is_ad = search_history_sm["meta_data_details"] == ads_label
search_is_not_ad = search_history_sm["meta_data_details"] != ads_label

watch_history_sm_no_ads = watch_history_sm[watch_is_not_ad]
watch_history_ads_only = watch_history_sm[watch_is_ad]
search_history_ads_only = search_history_sm[search_is_ad]
search_history_sm_no_ads = search_history_sm[search_is_not_ad]

In [None]:
search_history_sm.head(1)

In [None]:
len(watch_history_ads_only)

#### Ads vs watched things as a percentage

In [None]:
# Percentage of all watch-history rows that carry each "meta_data_details" value.
# Dividing the value_counts() Series directly keeps every label next to its
# percentage and does not rely on the "count" column name that reset_index()
# only produces on pandas >= 2.0. The denominator is still the full row count,
# so rows with a missing value count towards the total.
watch_history["meta_data_details"].value_counts() / len(watch_history) * 100

#### Searched terms vs watched vs other

In [None]:
search_history_sm["content_type"].value_counts()

#### Searched terms (list)

In [None]:
# Inspect the raw "content_type" strings. Showing them as a Python list
# exposes invisible characters (such as a trailing non-breaking space, "\xa0")
# that must be matched exactly when filtering. Listing the distinct values
# avoids depending on one hard-coded row label, which raises KeyError
# whenever that row is not present in the filtered data.
search_history_sm["content_type"].unique().tolist()

In [None]:
# Keep only the rows that record a search. The label ends with a
# non-breaking space ("\xa0"), which has to be matched exactly.
# .copy() makes search_terms an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
search_terms = search_history_sm[search_history_sm["content_type"] == "Searched for\xa0"].copy()

In [None]:
search_terms.head(2)

In [None]:
# Translate every search term to English with Google Translate (network calls).
# Each distinct term is sent only once and the result is reused for repeated
# searches, which reduces the number of requests and the risk of rate limiting.
search_translator = GoogleTranslator(source='auto', target='en')
search_term_translations = {
    term: search_translator.translate(text=term)
    for term in dict.fromkeys(str(title) for title in search_terms["video_title"])
}

# assign() returns a new DataFrame instead of writing into a filtered slice,
# so the cell is safe to re-run and raises no SettingWithCopyWarning.
search_terms = search_terms.assign(
    translated_search=[
        search_term_translations[str(title)] for title in search_terms["video_title"]
    ]
)

search_terms.head()

In [None]:
search_terms.to_csv("../output/search_terms_only.csv", index=False)

#### Analyses of words in search terms and video titles 

In [None]:
# Uses pyvi to turn a piece of Vietnamese text into a list of words.
def tokenize(string):
    """Return the list of word tokens pyvi finds in `string`.

    Non-string values (for example a missing title, which pandas represents
    as NaN) and blank strings yield an empty list instead of being passed to
    pyvi, so one bad row cannot abort a loop over many titles.
    """
    if not isinstance(string, str) or not string.strip():
        return []
    # postagging() returns a pair; only its first element is kept here
    # (presumably the words, with the part-of-speech tags discarded — confirm
    # against the pyvi documentation).
    return ViPosTagger.postagging(ViTokenizer.tokenize(string))[0]

In [None]:
# Flatten the tokens of every search term into a single list of words.
words = [
    token
    for title in search_terms["video_title"]
    for token in tokenize(title)
]

In [None]:
search_terms_tokens = pd.DataFrame(words)
search_terms_tokens.head()

In [None]:
# Count how often each token occurs, most frequent first.
# rename_axis()/reset_index(name=...) name both columns explicitly ("word" and
# "count"). Renaming column 0 after a plain reset_index() only gives this
# layout on pandas >= 2.0; on older versions it would label the counts "word".
search_terms_analysis = (
    search_terms_tokens[0]
    .value_counts()
    .rename_axis("word")
    .reset_index(name="count")
)
search_terms_analysis.head()

In [None]:
# Make the tokens readable (and translatable) by swapping underscores for spaces.
search_terms_analysis["cleaned_word"] = [
    token.replace("_", " ") for token in search_terms_analysis["word"]
]

In [None]:
search_terms_analysis.head()

#### The following cells use `deep-translator`'s Google Translator functionality to translate each word

In [None]:
def translateString(string):
    """Translate `string` to English if it contains at least one letter.

    Values without any letters (numbers, punctuation, empty strings) and
    non-string values (such as missing cells) are returned unchanged, so no
    translation request is made for them.
    """
    if not isinstance(string, str):
        return string
    # str.isalpha() is Unicode-aware. An ASCII-only check such as [a-zA-Z]
    # skips Vietnamese words written entirely with accented letters,
    # for example "đó" or "ở", leaving them untranslated.
    if any(char.isalpha() for char in string):
        return GoogleTranslator(source='auto', target='en').translate(text=string)
    return string


In [None]:
search_terms_analysis["translated_word"]=search_terms_analysis["cleaned_word"].apply(translateString)


In [None]:
search_terms_analysis.head(10)

In [None]:
# Save the per-word counts and translations.
# NOTE(review): unlike the other DataFrame exports in this notebook, this call
# does not pass index=False, so the row index is written as an extra unnamed
# first column — confirm whether downstream readers expect that column.
search_terms_analysis.to_csv("../output/search_terms_analysis.csv")


#### Find trends in video searches

In [None]:
# Number of searches recorded on each calendar day.
search_terms_by_date = search_terms.set_index("date_recorded")
search_terms_daily_tallies = search_terms_by_date.resample("D")["video_title"].count()

In [None]:
search_terms_daily_tallies

In [None]:
search_terms_daily_tallies.to_csv("../output/search_terms_daily_tallies.csv")

In [None]:
# Plot searches per day. A title and axis labels let the figure stand on its
# own when the notebook is skimmed; the trailing semicolon hides the repr of
# the value returned by ax.set().
ax = search_terms_daily_tallies.plot()
ax.set(title="Searches per day", xlabel="Date recorded", ylabel="Number of searches");

#### Specific to Google Ads:
- repeat videos
- NLP, such as words most often used
- Time-related trends

In [None]:
# Tally how many times each ad (title + link pair) was recorded,
# then order the ads from most to least frequent.
ad_occurrences = watch_history_ads_only.groupby(["video_title", "link"])["file"].count()
most_watched_ads = ad_occurrences.reset_index().sort_values(by="file", ascending=False)

most_watched_ads.head(10)

In [None]:
most_watched_ads.to_csv("../output/most_watched_ads.csv", index=False)

In [None]:
watch_history_ads_only.to_csv("../output/watch_history_ads_only.csv", index=False)

print(
    len(watch_history_ads_only),
    len(most_watched_ads)
)

#### Time related trends

In [None]:
# Number of ads recorded on each calendar day.
ads_by_date = watch_history_ads_only.set_index("date_recorded")
watch_history_ads_only_daily_tallies = ads_by_date.resample("D")["video_title"].count()

In [None]:
watch_history_ads_only_daily_tallies 

In [None]:
watch_history_ads_only_daily_tallies.to_csv("../output/watch_history_ads_only_daily_tallies.csv")

In [None]:
# Number of ads recorded in each hour.
# Lowercase "h" is the hourly frequency alias; uppercase "H" is deprecated
# from pandas 2.2 onward and emits a FutureWarning there.
watch_history_ads_onlyhourly_tallies = watch_history_ads_only.set_index("date_recorded").resample("h")["video_title"].count()

In [None]:
watch_history_ads_onlyhourly_tallies

#### Specific to non-ad views:

- repeat videos
- NLP, such as words most often used
- Time-related trends

In [None]:
watch_history_sm_no_ads.head(1)

In [None]:
# Tally how many times each non-ad video (title + link pair) was recorded,
# then order the videos from most to least watched.
video_occurrences = watch_history_sm_no_ads.groupby(["video_title", "link"])["file"].count()
watch_history_sm_no_ads_tallies = video_occurrences.reset_index().sort_values(
    by="file", ascending=False
)

watch_history_sm_no_ads_tallies.head()

In [None]:

watch_history_sm_no_ads_tallies.head()

In [None]:
len(watch_history_sm_no_ads_tallies)

In [None]:
watch_history_sm_no_ads_tallies.to_csv("../output/most_watched_videos.csv", index=False)

#### Translate only the top 1000 watched videos 
This is done to avoid errors due to rate limiting

In [None]:
def translateTitle(string):
    """Translate a video title to English with Google Translate.

    The value is converted with str() first; every call makes one
    translation request.
    """
    return GoogleTranslator(source='auto', target='en').translate(text=str(string))

# The tallies are sorted by count in descending order, so the first rows are
# the most-watched videos. head(1000) takes exactly 1000 of them; the slice
# [0:999] stopped one row short. .copy() makes top1000 an independent frame so
# the new column can be added without a SettingWithCopyWarning.
top1000 = watch_history_sm_no_ads_tallies.head(1000).copy()
top1000["translated_title"] = top1000["video_title"].apply(translateTitle)


In [None]:
top1000.head()

In [None]:
top1000.to_csv("../output/top1000.csv", index=False)

#### Create word count list for the watch history

In [None]:
# Flatten the tokens of every distinct watched-video title into one list of words.
video_title_words = [
    token
    for title in watch_history_sm_no_ads_tallies["video_title"]
    for token in tokenize(title)
]

In [None]:
video_title_tokens = pd.DataFrame(video_title_words).rename(columns={0:"word"})
video_title_tokens.head()

In [None]:
# Count how often each word occurs in the watched-video titles, most frequent first.
# rename_axis()/reset_index(name=...) name both columns explicitly ("word" and
# "count"). A plain reset_index() only yields this layout on pandas >= 2.0; on
# older versions the "word" column would hold the counts instead of the words.
watched_videos_word_analysis = (
    video_title_tokens["word"]
    .value_counts()
    .rename_axis("word")
    .reset_index(name="count")
)
watched_videos_word_analysis.to_csv("../output/watched_videos_word_analysis.csv")
watched_videos_word_analysis.head()

#### Translate top 500 most frequently found words in the titles of the videos she watched

In [None]:
# Only the 500 most frequent words are translated, to limit translation requests.
# head(500) takes exactly 500 rows; the slice [0:499] stopped one row short.
# .copy() makes the subset an independent frame so the new column can be added
# without a SettingWithCopyWarning.
watched_videos_word_analysis_top500 = watched_videos_word_analysis.head(500).copy()
watched_videos_word_analysis_top500["translated_word"] = watched_videos_word_analysis_top500["word"].apply(translateString)
watched_videos_word_analysis_top500.to_csv("../output/watched_videos_word_analysis_top500.csv", index=False)
watched_videos_word_analysis_top500.head()
