In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from pytubefix import YouTube
from datetime import datetime
import numpy as np

In [2]:
# Location of the watch-history export that this notebook analyses
file_path = 'data/Takeout/YouTube and YouTube Music/history/watch-history.html'  # Replace with the actual path to your HTML file

# Pull the whole document into memory as UTF-8 text
with open(file_path, mode='r', encoding='utf-8') as file:
    html_content = file.read()

# Build the parse tree with Python's built-in HTML parser
soup = BeautifulSoup(html_content, features='html.parser')

# Every <div> carrying the 'content-cell' class is a candidate history entry
div_elements = soup.find_all(name='div', attrs={'class': 'content-cell'})

In [3]:
# Extract one record per watched video. Only cells holding exactly two links
# (the video link followed by the channel link) are kept; all others are skipped.
video_data_list = []

for div_element in div_elements:
    anchors = div_element.find_all('a')  # look the links up once per cell
    if len(anchors) != 2:
        continue

    video_anchor, channel_anchor = anchors
    video_data_list.append({
        'Title': video_anchor.text,
        'Channel': channel_anchor.text,
        # The timestamp is the text node right after the last <br>. Any U+202F
        # (narrow no-break space) is removed so the format string below, which
        # has no gap before AM/PM, can match.
        'Timestamp': div_element.find_all('br')[-1].next_sibling.strip().replace('\u202f', ''),
        'Video Link': video_anchor['href'],
        'Channel Link': channel_anchor['href'],
    })

# Build the frame in a single call instead of concatenating one-row frames.
# The explicit column list keeps the schema intact even when nothing was extracted.
df = pd.DataFrame(
    video_data_list,
    columns=['Title', 'Channel', 'Timestamp', 'Video Link', 'Channel Link'],
)

# Remember which rows are labelled as daylight time, then strip the trailing
# Pacific zone abbreviation (PST or PDT) so the strict format can parse the rest.
is_daylight_time = df['Timestamp'].str.endswith('PDT').to_numpy(dtype=bool)
naive_timestamps = pd.to_datetime(
    df['Timestamp'].str.replace(r'\s+P[SD]T$', '', regex=True),
    format='%b %d, %Y, %I:%M:%S%p',
)

# Interpret the wall-clock times as US/Pacific. The per-row DST flag is only
# consulted for ambiguous times (the repeated hour when clocks fall back),
# which would otherwise raise.
df['Timestamp'] = naive_timestamps.dt.tz_localize('US/Pacific', ambiguous=is_daylight_time)

# Keep only videos watched during the current calendar year (evaluated at run time)
df = df[df['Timestamp'].dt.year == datetime.now().year]

# Display the DataFrame
df.head()

Unnamed: 0,Title,Channel,Timestamp,Video Link,Channel Link
0,होटल जैसा पनीर बटर मसाला | Restaurant style Pa...,Chef Ranveer Brar,2023-11-22 10:25:00-08:00,https://www.youtube.com/watch?v=a30BLUQiFoc,https://www.youtube.com/channel/UCEHCDn_BBnk3u...
1,What is the Riemann Hypothesis REALLY about?,HexagonVideos,2023-11-22 09:06:11-08:00,https://www.youtube.com/watch?v=e4kOh7qlsM4,https://www.youtube.com/channel/UCQoJIigcorV_0...
2,It Was Just A Warm-Up!😩,Squat University,2023-11-22 09:04:52-08:00,https://www.youtube.com/watch?v=7PbWfSX74RU,https://www.youtube.com/channel/UCyPYQTT20IgzV...
3,Top Pitches With A $1M Valuation In Celebratio...,Shark Tank Global,2023-11-22 00:20:22-08:00,https://www.youtube.com/watch?v=Jn1CimnV290,https://www.youtube.com/channel/UCREgA-BmOocJ9...
4,Rating Lebanon's National Dish,KWOOWK,2023-11-22 00:19:32-08:00,https://www.youtube.com/watch?v=9M8pwa-qcr0,https://www.youtube.com/channel/UC3vQEjRhwgH2H...


In [4]:
# Upper bound on the number of rows handled per chunk
chunk_size = 500

# Ceiling division: just enough chunks that none exceeds chunk_size rows.
# At least one chunk is kept so an empty frame still yields a single empty chunk.
num_chunks = max(1, -(-len(df) // chunk_size))

# Split by row position rather than passing the DataFrame itself to
# np.array_split, which emits a deprecation warning on recent pandas versions.
# Each chunk is an independent copy, so columns can be added to it safely later.
row_positions = np.array_split(np.arange(len(df)), num_chunks)
chunks = [df.iloc[positions].copy() for positions in row_positions]

  return bound(*args, **kwds)


In [None]:
def get_length_video(link):
    """Return the length of the YouTube video at ``link``, or -1 on failure.

    Args:
        link: Video URL, passed unchanged to ``YouTube``.

    Returns:
        The ``length`` attribute of the ``YouTube`` object (presumably in
        seconds; confirm against the pytubefix documentation), or the
        sentinel ``-1`` when the lookup raises any ``Exception``.
    """
    try:
        return YouTube(link).length
    # Best-effort lookup: ordinary failures map to the sentinel, while
    # KeyboardInterrupt/SystemExit propagate so a long run can still be stopped.
    except Exception:
        return -1

# Look up the length of every video, one chunk at a time. A timestamped line is
# printed after each chunk so progress of this slow step stays visible.
for chunk_number, chunk in enumerate(chunks, start=1):
    # Adds the column to the chunk object itself; failed lookups are stored as -1
    chunk['Video Length'] = chunk['Video Link'].apply(get_length_video)
    print(f'{datetime.now()} Finished processing chunk {chunk_number}')

2023-12-03 13:56:58.029034 Finished processing chunk 1
2023-12-03 14:03:03.052595 Finished processing chunk 2
2023-12-03 14:08:47.651607 Finished processing chunk 3
2023-12-03 14:15:25.522640 Finished processing chunk 4
2023-12-03 14:24:00.664628 Finished processing chunk 5
2023-12-03 14:30:12.831208 Finished processing chunk 6
2023-12-03 14:35:45.328164 Finished processing chunk 7
2023-12-03 14:41:50.068369 Finished processing chunk 8
2023-12-03 14:49:07.178008 Finished processing chunk 9
2023-12-03 14:55:18.052067 Finished processing chunk 10


In [None]:
# The lookup loop above adds 'Video Length' to each chunk in place, so the
# chunks themselves are the results; no further per-chunk function is applied.
result_chunks = list(chunks)

# Fail early with a clear message if the lookup cell has not been run yet
assert all('Video Length' in chunk.columns for chunk in result_chunks), \
    "run the video-length cell before concatenating"

# Concatenate the results back into a single DataFrame
result_df = pd.concat(result_chunks)

In [None]:
import os
import shutil

# Audible "done" signal for the long-running cells above. `say` is the macOS
# text-to-speech command, so fall back to a printed message where it is missing.
if shutil.which('say'):
    os.system('say "your program has finished"')
else:
    print('your program has finished')