In [15]:
import pandas as pd
import json

In [16]:
def split_operation_title(title):
    """Split an activity title into its leading operation and the remaining text.

    Parameters
    ----------
    title : str or None
        Raw ``title`` value of an activity entry,
        e.g. ``"Liked Double Skull Bowie"``.

    Returns
    -------
    list
        ``[operation, cleaned_title]``. ``operation`` is the first known
        prefix that ``title`` starts with and ``cleaned_title`` is the rest of
        the title with surrounding whitespace stripped. Both are empty strings
        when no known operation prefixes the title, or when ``title`` is not a
        string (e.g. ``None``).
    """
    # The former extra entry 'Watched a video that has been removed' could
    # never match because 'Watched' is tested first and is a prefix of it, so
    # it was dead. Removed videos are still reported as operation 'Watched'
    # with the title 'a video that has been removed', exactly as before; use
    # is_removed() on the raw title to flag them.
    operations = ('Liked', 'Disliked', 'Subscribed to', 'Watched')

    # A missing title must not crash the caller; mirror the "no match" result.
    if not isinstance(title, str):
        return ["", ""]

    for operation in operations:
        if title.startswith(operation):
            # Drop the operation prefix and any whitespace around what is left.
            return [operation, title[len(operation):].strip()]

    return ["", ""]

In [17]:
def is_post(titleUrl):
    """Tell whether an activity URL points at a YouTube post.

    Parameters
    ----------
    titleUrl : str or None
        The ``titleUrl`` value of an activity entry, or ``None`` when the
        entry has no URL.

    Returns
    -------
    bool or None
        ``None`` when ``titleUrl`` is ``None``; otherwise ``True`` if the URL
        contains ``"youtube.com/post/"`` and ``False`` if it does not.
    """
    # Identity test: `== None` would defer to the argument's own __eq__.
    if titleUrl is None:
        return None
    return "youtube.com/post/" in titleUrl
    
def is_removed(title):
    """Tell whether an activity title refers to a video that has been removed.

    Parameters
    ----------
    title : str or None
        The raw ``title`` value of an activity entry, or ``None`` when the
        entry has no title.

    Returns
    -------
    bool or None
        ``None`` when ``title`` is ``None``; otherwise ``True`` if the title
        contains ``"Watched a video that has been removed"`` and ``False`` if
        it does not.
    """
    # Identity test: `== None` would defer to the argument's own __eq__.
    if title is None:
        return None
    return "Watched a video that has been removed" in title
    

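# TODO: ad detection is not implemented. The snippet below shows a marker that
# could identify ads, but it is "not always true" -- presumably it is not a
# reliable signal on its own; confirm before using it as a filter.
# def is_ad():
#     pass
#   "details": [{
#     "name": "From Google Ads"
#   }],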

In [18]:
# Test and demo: print the [operation, title] pair for a few sample titles.

titles = ["Liked Double Skull Bowie", "Disliked Triple Rainbow", "Subscribed to Cat Videos"]

for item in titles:
    operation_and_title = split_operation_title(item)
    print(operation_and_title)

['Liked', 'Double Skull Bowie']
['Disliked', 'Triple Rainbow']
['Subscribed to', 'Cat Videos']


In [19]:
# Load the exported activity entries.
# A forward slash is used as the separator because it works on every platform;
# the previous backslash form also relied on '\M' being an unrecognised
# (deprecated) escape sequence in a normal string literal.
# The encoding is given explicitly so that non-ASCII titles (e.g. emoji) do not
# depend on the platform's default text encoding.
with open('source_file/My Activity.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [20]:
# TEST and DEMO

def test_demo():
    """Print the parsed fields of every entry in ``data`` for manual inspection.

    For each entry this prints a separator, the ``[operation, title]`` pair,
    the title URL with its ``is_post`` flag (when the entry has a URL), the
    first subtitle's name and URL (when present), and finally the timestamp.
    Optional fields that are absent are skipped silently; a missing ``title``
    or ``time`` still raises ``KeyError``.
    """
    for item in data:
        print("-----------------------------")
        # print(item) #single item
        print(split_operation_title(item['title']))
        # Some entries have no 'titleUrl', e.g.
        # {'header': 'YouTube', 'title': 'Watched a video that has been removed', 'time': '2023-09-25T16:37:43.195Z', 'products': ['YouTube'], 'activityControls': ['YouTube watch history']}
        try:
            print(item['titleUrl'])
            print(is_post(item['titleUrl']))
        except (KeyError, TypeError):
            # Only lookup problems are skipped; KeyboardInterrupt and other
            # unrelated errors are no longer swallowed.
            pass
        # Some entries (e.g. subscriptions) have no subtitles.
        try:
            print(item['subtitles'][0]['name'])  # channel name
            print(item['subtitles'][0]['url'])  # channel url
        except (KeyError, IndexError, TypeError):
            pass
        print(item['time'])
        
# test_demo()

In [21]:
def safe_get(item, key, default=None):
    """Return ``item[key]``, or ``default`` when that lookup is not possible.

    Parameters
    ----------
    item : object
        Anything that may support subscription: a dict, a list, or ``None``.
    key : object
        Dictionary key or sequence index to look up.
    default : object, optional
        Value returned when the lookup fails. Defaults to ``None``.

    Returns
    -------
    object
        ``item[key]`` when it exists, otherwise ``default``.
    """
    try:
        return item[key]
    # LookupError covers both a missing dict key (KeyError) and an
    # out-of-range sequence index (IndexError); TypeError covers items that
    # cannot be subscripted at all, such as None.
    except (LookupError, TypeError):
        return default

# Column-oriented view of the activity entries: one list per output column,
# filled in a single pass so each title is split only once.
dict_history = {
    'timestamp': [],
    'operation': [],
    'title': [],
    'is_removed': [],
    'title_url': [],
    'is_post': [],
    'channel_name': [],
    'channel_url': [],
}

for entry in data:
    raw_title = safe_get(entry, 'title')
    # An entry without a usable title yields None for both derived columns
    # instead of aborting the whole build with a KeyError.
    if isinstance(raw_title, str):
        operation, title = split_operation_title(raw_title)
    else:
        operation, title = None, None

    title_url = safe_get(entry, 'titleUrl')

    # A missing, null or empty 'subtitles' list falls back to an empty
    # mapping, so the channel columns become None rather than raising.
    channel = (safe_get(entry, 'subtitles') or [{}])[0]

    dict_history['timestamp'].append(safe_get(entry, 'time'))
    dict_history['operation'].append(operation)
    dict_history['title'].append(title)
    dict_history['is_removed'].append(is_removed(raw_title))
    dict_history['title_url'].append(title_url)
    dict_history['is_post'].append(is_post(title_url))
    dict_history['channel_name'].append(safe_get(channel, 'name'))
    dict_history['channel_url'].append(safe_get(channel, 'url'))

# Sanity check: every column must have the same number of values.
for key, value in dict_history.items():
    print(key, len(value))

timestamp 5
operation 5
title 5
is_removed 5
title_url 5
is_post 5
channel_name 5
channel_url 5


In [22]:
# Assemble the per-column lists into one table and display it.
df = pd.DataFrame(dict_history)
df

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
0,2023-01-31T14:14:14.011Z,Watched,Double Skull Bowie,False,https://www.youtube.com/watch?v=nBKwJw3rO6U,False,jimmydiresta,https://www.youtube.com/channel/UCiEk4xHBbz0hZ...
1,2023-09-25T16:37:43.195Z,Watched,a video that has been removed,True,,,,
2,2023-09-20T20:17:01.538Z,Liked,This Video is in Reverse.,False,https://www.youtube.com/watch?v=g_a3TQ9L9cM,False,Eran Amir,https://www.youtube.com/channel/UC_lUpneuEUzHV...
3,2023-06-12T18:04:43.764Z,Watched,Secret d'acteurs,False,https://www.youtube.com/watch?v=vc_MIvoVyxs,False,,
4,2023-06-02T00:39:05.787Z,Disliked,🔱 If u needed a sign this is it 🔱 Check BIO,False,https://www.youtube.com/watch?v=_x_harT-ur8,False,Poseidon Originals,https://www.youtube.com/channel/UCKzQMVJWmvkMW...


In [23]:
# Liked entries that are not posts and that have a channel name.
liked_mask = (
    (df['operation'] == 'Liked')
    & (df['is_post'] == False)
    & (df['channel_name'].notna())
)
liked_videos = df.loc[liked_mask]

# save liked videos to output.txt, can be used as batch file for yt-dlp
with open('output.txt', 'w') as f:
    liked_urls = liked_videos['title_url'].str.cat(sep='\n')
    f.write(liked_urls)

display(liked_videos)

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
2,2023-09-20T20:17:01.538Z,Liked,This Video is in Reverse.,False,https://www.youtube.com/watch?v=g_a3TQ9L9cM,False,Eran Amir,https://www.youtube.com/channel/UC_lUpneuEUzHV...


In [24]:
# Entries flagged as removed videos.
removed_mask = df['is_removed'] == True
df.loc[removed_mask]

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
1,2023-09-25T16:37:43.195Z,Watched,a video that has been removed,True,,,,


In [25]:
# Every entry whose operation is 'Watched'.
watched_mask = df['operation'] == 'Watched'
df.loc[watched_mask]

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
0,2023-01-31T14:14:14.011Z,Watched,Double Skull Bowie,False,https://www.youtube.com/watch?v=nBKwJw3rO6U,False,jimmydiresta,https://www.youtube.com/channel/UCiEk4xHBbz0hZ...
1,2023-09-25T16:37:43.195Z,Watched,a video that has been removed,True,,,,
3,2023-06-12T18:04:43.764Z,Watched,Secret d'acteurs,False,https://www.youtube.com/watch?v=vc_MIvoVyxs,False,,


In [26]:
# Watched entries without ADS: not a post and with a channel name.
watched_with_channel_mask = (
    (df['operation'] == 'Watched')
    & (df['is_post'] == False)
    & (df['channel_name'].notna())
)
df.loc[watched_with_channel_mask]

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
0,2023-01-31T14:14:14.011Z,Watched,Double Skull Bowie,False,https://www.youtube.com/watch?v=nBKwJw3rO6U,False,jimmydiresta,https://www.youtube.com/channel/UCiEk4xHBbz0hZ...


In [27]:
# ads: watched entries that are not posts and have no channel name, oldest first.
watched_without_channel_mask = (
    (df['operation'] == 'Watched')
    & (df['is_post'] == False)
    & (df['channel_name'].isna())
)
df.loc[watched_without_channel_mask].sort_values(by='timestamp')

Unnamed: 0,timestamp,operation,title,is_removed,title_url,is_post,channel_name,channel_url
3,2023-06-12T18:04:43.764Z,Watched,Secret d'acteurs,False,https://www.youtube.com/watch?v=vc_MIvoVyxs,False,,
