In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict

In [None]:
# strptime format of the timestamps found in the watch-history export,
# e.g. "Nov 17, 2021, 7:56:53 PM CET".
# NOTE(review): strptime only accepts a limited set of names for %Z (UTC, GMT
# and the local machine's time.tzname), so "CET" presumably parses only on a
# machine whose local zone is CET - confirm before running elsewhere.
date_format = "%b %d, %Y, %I:%M:%S %p %Z"
class VideoInfo:
    '''
    Plain record describing one watch-history entry.
    '''
    def __init__(self, id, title, channel, date):
        # Values are stored exactly as given; nothing is validated or converted.
        self.id, self.title = id, title
        self.channel, self.date = channel, date

    def __str__(self):
        '''Returns a one-line, human-readable summary of the entry.'''
        summary = f"Video {self.id}: '{self.title}' from '{self.channel}', on {self.date}"
        return summary

In [None]:
def load_html(filename, encoding="utf-8"):
    '''
    Parses an HTML file into a BeautifulSoup tree.

    filename: path of the HTML file to read.
    encoding: text encoding used to decode the file. Defaults to UTF-8
        instead of the platform's locale encoding, so the same file is
        decoded the same way on every machine.

    Returns the BeautifulSoup object built with the 'html.parser' backend.
    Raises OSError if the file cannot be opened and UnicodeDecodeError if its
    bytes are not valid in `encoding`.
    '''
    # The tree is built while the file is still open; the handle is closed as
    # soon as parsing finishes.
    with open(filename, encoding=encoding) as f:
        return BeautifulSoup(f, 'html.parser')

In [None]:
def column_chart(x_data, y_data, title, x_axis_title, y_axis_title):
    '''
    Draws a 20x10 inch bar chart and shows it.

    x_data: bar positions/labels; any iterable (lists, dict key views, ...).
    y_data: bar heights, in the same order as x_data; any iterable.
    title: chart title.
    x_axis_title: label of the x axis.
    y_axis_title: label of the y axis.

    Returns nothing; the figure is displayed through plt.show().
    '''
    # Explicit axes instead of the implicit pyplot "current figure" state.
    _, ax = plt.subplots(figsize=(20, 10))
    # Materialise the inputs so views such as dict.keys()/dict.values() are
    # handed to matplotlib as plain sequences.
    ax.bar(list(x_data), list(y_data))
    ax.set_xlabel(x_axis_title)
    ax.set_ylabel(y_axis_title)
    ax.set_title(title)
    plt.show()

In [None]:
# Parse the YouTube watch history; the file comes from a Google Takeout export.
# This may take a while - it took 20 minutes for 25k videos.
soup = load_html("data/watch-history.html")

In [None]:
# Exceptional cases
# Text of an entry for a video that has been removed; it is matched as a
# substring of the entry's first text segment.
removed_video = "Watched a video that has been removed"
# Music channels
# Fragments of channel names: a channel whose name contains any of them
# (case-sensitive substring match) is filed under music videos.
music_channel_ids = ["- Topic", "VEVO", "Vevo", "Nensak", "Ultimate Powa", "FrightfulAccountant", "OzWho", "Orpheus", "Christina Proxenou", "Warner Classics", "Wolgadeutscher", "Nick Cave and the Bad Seeds"]

In [None]:
def contains_any(word, candidate_words):
    '''
    Tells whether `word` contains at least one of `candidate_words`.

    word: string to search in.
    candidate_words: iterable of substrings to look for (case-sensitive).

    Returns True as soon as one candidate is found inside `word`, and False
    when none matches or `candidate_words` is empty. Nothing is printed.
    '''
    return any(candidate in word for candidate in candidate_words)


In [None]:
def extract_watch_history(soup):
    '''
    Processes a parsed watch history html file and returns two [VideoInfo]
    lists: one for regular videos and one for music videos.

    soup: BeautifulSoup tree of the watch history page.

    Every entry cell is split into its text segments and handled by shape:
    - first segment without "Watched": not a video; counted and skipped
      (asserted to be a "Visited YouTube Music" entry);
    - removed video: title "Removed", channel "Unknown", date parsed;
    - 4 or 5 segments: title and channel are read from the first two links;
    - 3 segments: only the date is known, title and channel stay "Unknown";
    - anything else: printed for inspection, counted as unknown and left out
      of the result, because there is no parsed date to store for it.
    The date is always read from the last text segment.

    A video is filed under music when its channel name contains one of
    music_channel_ids. Ids are assigned per list, starting at 0.

    Returns (videos, music_videos); summary counters are printed at the end.
    Raises ValueError if a date does not match date_format.
    '''
    separator = "}{"
    cell_class = "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"
    videos = []
    music_videos = []
    total_entries = 0
    unknown_videos = 0
    just_date_videos = 0
    non_youtube_videos = 0
    for entity in soup.find_all("div", {"class": cell_class}):
        total_entries += 1
        # Reported up front so that entries skipped below still count towards
        # the progress messages.
        if total_entries % 5000 == 0:
            print(f"Processed {total_entries} entries")
        # Split the cell text once and reuse the segments everywhere.
        segments = entity.getText(separator).split(separator)
        first_text = segments[0]
        if "Watched" not in first_text:
            non_youtube_videos += 1
            assert first_text == "Visited YouTube Music"
            continue
        title = "Unknown"
        channel = "Unknown"
        if removed_video in first_text:
            title = "Removed"
        elif len(segments) in (4, 5):
            title_link = entity.a
            title = title_link.getText()
            channel = title_link.find_next('a').getText()
        elif len(segments) == 3:
            just_date_videos += 1
        else:
            unknown_videos += 1
            print(len(segments))
            print(entity)
            print("-----------------------------------------")
            # Skip it: storing a placeholder string as the date would break
            # any later code that treats the date as a datetime.
            continue
        date = datetime.strptime(segments[-1], date_format)

        # The current list length doubles as the next per-list id.
        if contains_any(channel, music_channel_ids):
            music_videos.append(VideoInfo(len(music_videos), title, channel, date))
        else:
            videos.append(VideoInfo(len(videos), title, channel, date))
    print(f"Watched {len(videos)} videos")
    print(f"Watched {len(music_videos)} music videos")
    print(f"Unknown videos {unknown_videos}/{total_entries}")
    print(f"Just date videos {just_date_videos}/{total_entries}")
    print(f"\nNon-youtube {non_youtube_videos}/{total_entries}")
    return videos, music_videos

In [None]:
# Split the parsed history into regular videos and music videos.
videos, music_videos = extract_watch_history(soup)

In [None]:
# Peek at the first few entries; slicing keeps this safe when fewer than five
# videos were extracted.
for video in videos[:5]:
    print(video)
print(f"Watched {len(videos)} videos")

In [None]:
def group_by(videos, attribute, time_interval="month"):
    '''
    Groups videos into a dictionary keyed by one of their attributes.

    videos: iterable of objects exposing `title`, `channel` and `date`
        (`date` must be a datetime when grouping by "date").
    attribute: "title", "channel" or "date".
    time_interval: bucket size used when attribute is "date". Only "month" is
        supported; the key is then a naive datetime at midnight on the first
        day of the video's month.

    Returns a dict mapping key -> [video]; keys appear in first-seen order
    and every list keeps the input order.
    Raises Exception for an unknown attribute or an unsupported
    time_interval. The arguments are checked per video, so an empty `videos`
    returns {} without raising.
    '''
    def key_of(video):
        # Computes the grouping key of a single video.
        if attribute == "title":
            return video.title
        if attribute == "channel":
            return video.channel
        if attribute == "date":
            if time_interval != "month":
                raise Exception(f"Unsupported time_interval {time_interval}")
            # Truncate to the start of the month directly rather than
            # formatting the date to text and parsing it back.
            return datetime(video.date.year, video.date.month, 1)
        raise Exception(f"Unknown attribute '{attribute}'")

    ret = {}
    for video in videos:
        ret.setdefault(key_of(video), []).append(video)
    return ret
        
        

In [None]:
def count_by_key(dic, sort_by_value=True, ascending=False):
    '''
    Turns a grouping into an ordered tally.

    dic: key -> [VideoInfo]
    sort_by_value: order by list length when truthy, by key otherwise.
    ascending: smallest first when truthy, largest first otherwise.

    Returns an OrderedDict of key -> number of entries in that key's list.
    '''
    # Position inside each (key, count) pair that drives the ordering.
    position = 1 if sort_by_value else 0
    tally = [(key, len(entries)) for key, entries in dic.items()]
    tally.sort(key=lambda pair: pair[position], reverse=not ascending)
    return OrderedDict(tally)

### Per Channel Analysis

In [None]:
# Group the non-music videos by channel and tally them, most-watched first.
vids_by_channel = group_by(videos, "channel")
vid_count_by_channel = count_by_key(vids_by_channel)
print(f"Channel count: {len(vid_count_by_channel)}")

In [None]:
# One line per channel, in the order of the tally.
for channel in vid_count_by_channel:
    print(f"{channel}: {vid_count_by_channel[channel]}")

### Per Video Analysis

In [None]:
# Group the non-music videos by title and tally them, most-watched first.
# Titles are compared as plain text, so different videos sharing a title are
# counted together.
vids_by_title = group_by(videos, "title")
vid_count_by_title = count_by_key(vids_by_title)
print(f"Unique video count: {len(vid_count_by_title)}")

In [None]:
# One line per title, in the order of the tally.
for title in vid_count_by_title:
    print(f"{title}: {vid_count_by_title[title]}")

### Per Month Analysis

In [None]:
# Bucket the non-music videos by calendar month.
vids_by_month = group_by(videos, "date")
# Sort on the datetime keys first, then relabel each month as e.g. "Nov.2021";
# sorting the labels themselves would order the months alphabetically.
vid_count_by_month = OrderedDict(
    (month.strftime("%b.%Y"), count)
    for month, count in count_by_key(vids_by_month, sort_by_value=False, ascending=True).items()
)
print(f"Number of months considered: {len(vid_count_by_month)}")

In [None]:
# One line per month, in the order of the tally.
for month in vid_count_by_month:
    print(f"{month}: {vid_count_by_month[month]}")

In [None]:
# Plot the monthly tallies as a column chart, one bar per month label.
column_chart(vid_count_by_month.keys(), vid_count_by_month.values(), "Videos watched by month", "Month", "Video Count")