### YouTube archive data extraction

This notebook extracts data from the `search-history.html`, `watch-history.html` and `my-comments.html` files of a YouTube archive. You can request an archive of an account's search history, watch history and comments [on Google's Takeout page](https://takeout.google.com/settings/takeout): while logged into the account whose data you want, select the archive you would like to download. A download link is emailed to that same account once the archive is ready.

In [1]:
# —————— libraries built into Python ———————

import json # to read json formatted data
import csv # to write and read csv
import time # to build in wait time for loops
import glob # to access file paths

# —————— libraries that need to be installed, which you can do via pip ———————

from bs4 import BeautifulSoup # to parse HTML, documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import pandas as pd # to use pandas to process data


The cells below extract data from the HTML pages:

In [None]:
%%time
# Ensure that the path to your file is correct
with open("../data/youtube-archive/YouTube and YouTube Music/history/search-history.html") as page:
    rows=[]
    soup = BeautifulSoup(page,  "html.parser")
    contents = soup.find("body")
    contentlist = contents.find_all( "div" , class_="outer-cell")
    for content_item in contentlist:
        content_cell = content_item.find_all("div" , class_="content-cell")

        video_title = content_cell[0].find("a").text if content_cell[0].find("a") != None else ""
        link = content_cell[0].find("a")["href"]
        content_type = content_cell[0].find("a").previous_sibling
        date_recorded =  content_cell[0].find_all("br")[-1].next_sibling
        meta_data_products = content_cell[-1].find_all("br")[0].next_sibling if content_cell[-1].find_all("br") != None else ""
        
        if "Details" in str(content_cell[-1].find_all("b")[1]):
            meta_data_details = content_cell[-1].find_all("b")[1].next_sibling.next_sibling
        else:
            meta_data_details = None
        content_text_full = content_cell[0].text if content_cell[0] != None else ""
        meta_data_full = content_cell[-1].text if content_cell[-1] != None else ""

        row ={
            "video_title": video_title,
            "link": link,
            "content_type": content_type,
            "date_recorded": date_recorded,
            "meta_data_products": meta_data_products,
            "meta_data_details": meta_data_details,
            "content_text_full": content_text_full,
            "meta_data_full": meta_data_full,
            "file": "../data/youtube-archive/YouTube and YouTube Music/history/search-history.html".split("/")[-1]
        }

        rows.append(row)

In [None]:
# Collect the parsed entries into a table and preview its first rows
search_history = pd.DataFrame(data=rows)
search_history.head(n=5)

In [None]:
# Save the search-history table as CSV, leaving out the index column
search_history.to_csv(
    "../output/search_history.csv",
    index=False,
    encoding='utf-8',
)

In [None]:
# Number of rows in the search-history table
search_history.shape[0]

In [None]:
# How often each distinct "meta_data_details" value occurs
search_history.loc[:, "meta_data_details"].value_counts()

In [None]:
# How often each distinct "content_type" value occurs
search_history.loc[:, "content_type"].value_counts()

#### This cell extracts data from the watch history

In [None]:
%%time
# Ensure that the path to your file is correct
with open("../data/youtube-archive/YouTube and YouTube Music/history/watch-history.html") as page:
    rows=[]
    soup = BeautifulSoup(page,  "html.parser")
    contents = soup.find("body")
    contentlist = contents.find_all( "div" , class_="outer-cell")
    for content_item in contentlist:
        content_cell = content_item.find_all("div" , class_="content-cell")

        video_title = content_cell[0].find("a").text if content_cell[0].find("a") != None else ""
        link = content_cell[0].find("a")["href"] if content_cell[0].find("a") != None else ""
        content_type = content_cell[0].find("a").previous_sibling  if content_cell[0].find("a") != None else ""
        date_recorded =  content_cell[0].find_all("br")[-1].next_sibling
        meta_data_products = content_cell[-1].find_all("br")[0].next_sibling if content_cell[-1].find_all("br") != None else ""
        
        if "Details" in str(content_cell[-1].find_all("b")[1]):
            meta_data_details = content_cell[-1].find_all("b")[1].next_sibling.next_sibling
        else:
            meta_data_details = None
        content_text_full = content_cell[0].text if content_cell[0] != None else ""
        meta_data_full = content_cell[-1].text if content_cell[-1] != None else ""

        row ={
            "video_title": video_title,
            "link": link,
            "date_recorded": date_recorded,
            "meta_data_products": meta_data_products,
            "meta_data_details": meta_data_details,
            "content_text_full": content_text_full,
            "meta_data_full": meta_data_full,
            "file": "../data/youtube-archive/YouTube and YouTube Music/history/search-history.html".split("/")[-1]
        }

        rows.append(row)

In [None]:
# Collect the parsed entries into a table, save it as CSV and preview its first rows
watch_history = pd.DataFrame(data=rows)
watch_history.to_csv(
    "../output/watch_history.csv",
    index=False,
    encoding='utf-8',
)
watch_history.head(n=5)


In [None]:
# Number of rows in the watch-history table
watch_history.shape[0]

#### This cell extracts comments

In [None]:
%%time

rows = []
# Ensure that the path to your file is correct
with open("../data/youtube-archive/YouTube and YouTube Music/my-comments/my-comments.html") as page:
    rows=[]
    soup = BeautifulSoup(page,  "html.parser")
    comment_container = soup.find("ul")
    comment_list = comment_container.find_all( "li")
    for comment in comment_list: 
        comment_text = comment.text.split("Z.")[1]
        links = comment.find_all("a")
        comment_link = links[0]['href']
        video_link = links[1]["href"]
        date = comment.text.split("at")[1].split("Z")[0]
        row ={
            "date":date, 
            "comment_link":comment_link,
            "video_link":video_link, 
            "comment_text":comment_text
        }
        rows.append(row)



In [None]:
# Number of entries currently in `rows` (the comments, assuming the comments cell above ran last)
len(rows)

In [None]:
# Collect the parsed comments into a table, save it as CSV and preview its first rows
comments_df = pd.DataFrame(rows)
# explicit encoding, matching the other CSV exports in this notebook
comments_df.to_csv("../output/comments.csv", encoding='utf-8', index=False)

comments_df.head()