In [None]:
from pprint import pprint
from bs4 import BeautifulSoup
from datetime import datetime
from collections import OrderedDict

In [None]:
# strptime pattern for the timestamps parsed by extract_watch_history.
# Example of a matching string: Nov 17, 2021, 7:56:53 PM CET
# NOTE(review): %Z presumably only accepts timezone names that strptime knows
# on the current machine (UTC, GMT and the local zone names) — confirm before
# parsing an export produced in another timezone.
date_format = "%b %d, %Y, %I:%M:%S %p %Z"
class VideoInfo:
    '''
    Plain record for one watch history entry.

    The four constructor arguments are stored unchanged as attributes of the
    same name: id, title, channel and date.
    '''
    def __init__(self, id, title, channel, date):
        self.id, self.title = id, title
        self.channel, self.date = channel, date

    def __str__(self):
        # One-line, human-readable summary of the entry.
        summary = f"Video {self.id}: '{self.title}' from '{self.channel}', on {self.date}"
        return summary

In [None]:
def load_html(filename, encoding="utf-8"):
    '''
    Parses an HTML file into a BeautifulSoup tree.

    filename: path of the HTML file to read
    encoding: text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's default encoding.
    Returns the BeautifulSoup object built with the 'html.parser' parser.
    '''
    # The file is parsed while still open; the tree is complete once the
    # constructor returns, so the handle can be closed right after.
    with open(filename, encoding=encoding) as f:
        return BeautifulSoup(f, 'html.parser')

In [None]:
# Parsing the whole file may take a while.
# The file is obtained through Google's Takeout feature.
soup = load_html("data/watch-history.html")

In [None]:
# Exceptional cases
# Marker text of an entry whose video has been removed; extract_watch_history
# records such entries with the title "Removed".
removed_video = "Watched a video that has been removed"

In [None]:
def extract_watch_history(soup):
    '''
    Processes a parsed watch history html file and returns a [VideoInfo].

    soup: BeautifulSoup tree of the watch history page
    Returns one VideoInfo per content cell whose first text contains
    "Watched", numbered from 0 in page order. Fields that cannot be
    extracted keep the placeholder "Unknown".
    Prints a progress line every 5000 videos.
    Note: absolute front-end noob
    '''
    cell_class = "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"
    videos = []
    for entity in soup.find_all("div", {"class": cell_class}):
        # Read the text nodes directly instead of joining them with "|" and
        # splitting again: a title or channel name containing "|" would
        # otherwise inflate the segment count and the entry would fall back
        # to "Unknown".
        segments = list(entity.strings)
        if not segments or "Watched" not in segments[0]:
            continue

        title = "Unknown"
        channel = "Unknown"
        date = "Unknown"
        if removed_video in segments[0]:
            # Removed videos carry no link; the date is the second text node.
            title = "Removed"
            date = datetime.strptime(segments[1], date_format)
        elif len(segments) == 4:
            # Regular entry: first link is the video, the next link the
            # channel, and the last text node the date.
            title = entity.a.getText()
            channel = entity.a.find_next('a').getText()
            date = datetime.strptime(segments[-1], date_format)

        # The id is the running index of the entry in the result list.
        videos.append(VideoInfo(len(videos), title, channel, date))
        if len(videos) % 5000 == 0:
            print(f"Processed {len(videos)} videos")
    return videos

In [None]:
# Build the list of VideoInfo records from the parsed page.
videos = extract_watch_history(soup)

In [None]:
# Preview the first entries; slicing keeps this cell runnable on histories
# with fewer than 10 videos.
for video in videos[:10]:
    print(video)
# Spot-check an entry deep in the history, only when the history is that long.
if len(videos) > 23000:
    print(videos[23000])
print(f"Watched {len(videos)} videos")

In [None]:
def group_by(videos, attribute):
    '''
    Buckets videos by one of their attributes.

    videos: iterable of VideoInfo
    attribute: "title", "channel" or "date"
    Returns a dict mapping each attribute value to the list of videos that
    have it, in input order.
    Raises Exception for any other attribute name, as soon as the first
    video is processed.
    '''
    supported = ("title", "channel", "date")
    buckets = {}
    for video in videos:
        if attribute not in supported:
            raise Exception(f"Unknown attribute '{attribute}'")
        buckets.setdefault(getattr(video, attribute), []).append(video)
    return buckets
        
        

In [None]:
def count_by_key(dic):
    '''
    Counts the videos stored under each key.

    dic: key -> [VideoInfo]
    Returns an OrderedDict key -> number of videos, sorted by count in
    descending order; keys with equal counts keep their order from dic.
    '''
    counts = ((key, len(group)) for key, group in dic.items())
    # sorted() is stable, also with reverse=True, so ties keep input order.
    return OrderedDict(sorted(counts, key=lambda item: item[1], reverse=True))

In [None]:
# Group the history by channel, then count the videos of each channel.
vids_by_channel = group_by(videos, "channel")
vid_count_by_channel = count_by_key(vids_by_channel)

In [None]:
# Number of distinct channel values found in the history.
print(f"Channel count: {len(vid_count_by_channel)}")

In [None]:
# Print each channel followed by its number of watched videos.
for channel_name, watched in vid_count_by_channel.items():
    print(channel_name, watched)