In [69]:
import pandas as pd
import json
import os
from multiprocessing import Pool
from bs4 import BeautifulSoup, SoupStrainer
from itertools import chain

In [70]:
TAKEOUT_PATH = "/home/ivan/Desktop/datasets/Takeout" # CHANGE IF NECESSARY
ACTIVITY_LOG_PATH = os.path.join(TAKEOUT_PATH, "Mi actividad")

ACTIVITY_PLACEHOLDER_NAME = "MiActividad.html"

#drive
FOLDER_DRIVE_ACTIVITY_LOG_PATH = os.path.join(ACTIVITY_LOG_PATH, "Drive")
FILE_DRIVE_ACTIVITY_LOG_PATH = os.path.join(FOLDER_DRIVE_ACTIVITY_LOG_PATH, ACTIVITY_PLACEHOLDER_NAME)

#takeout
FOLDER_DRIVE_ACTIVITY_LOG_PATH = os.path.join(ACTIVITY_LOG_PATH, "Takeout")
FILE_TAKEOUT_ACTIVITY_LOG_PATH = os.path.join(FOLDER_DRIVE_ACTIVITY_LOG_PATH, ACTIVITY_PLACEHOLDER_NAME)

#youtube
FOLDER_DRIVE_ACTIVITY_LOG_PATH = os.path.join(ACTIVITY_LOG_PATH, "YouTube")
FILE_YOUTUBE_ACTIVITY_LOG_PATH = os.path.join(FOLDER_DRIVE_ACTIVITY_LOG_PATH, ACTIVITY_PLACEHOLDER_NAME)

In [None]:


def get_html_contents(file_path, chunk_size=524288):
    """
    Generator function that reads an HTML file in chunks to avoid loading the entire file into memory.
    
    Args:
    - file_path (str): Path to the HTML file.
    - chunk_size (int): Size of each chunk in bytes. Default is 512 KB.
    
    Yields:
    - str: A chunk of HTML that ends on a complete tag.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        buffer = ''
        while True:
            data = file.read(chunk_size)
            if not data:
                # Only yield the remaining buffer if it's a valid chunk with a closed tag
                if buffer:
                    yield buffer
                break

            buffer += data
            last_tag_end = max(buffer.rfind('>'), buffer.rfind('/>'))
            if last_tag_end == -1:
                continue  # Continue reading into buffer until a tag end is found

            # Yield up to the last complete tag and adjust the buffer
            yield buffer[:last_tag_end + 1]
            buffer = buffer[last_tag_end + 1:]


def process_chunk(html_content):
    """
    Processes a chunk of HTML to extract relevant data from specified div elements, including a timestamp.
    
    Args:
    - html_content (str): A string of HTML content.
    
    Returns:
    - list of dict: Extracted data from each content cell in the HTML chunk.
    """
    strainer = SoupStrainer('div', class_="outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp")
    soup = BeautifulSoup(html_content, 'html.parser', parse_only=strainer)
    entries = []
    
    for outer_div in soup.find_all('div'):
        content_cells = outer_div.find_all('div', class_="content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")
        for div in content_cells:
            links = div.find_all('a', href=True)
            complete_action = ' '.join(div.stripped_strings)
            action_code = complete_action.split('|')[0].strip() if '|' in complete_action else complete_action
            timestamp = complete_action.split('|')[-1].strip() if '|' in complete_action else ''
            entry = {
                "complete_action": complete_action,
                "action_code": action_code,
                "timestamp": timestamp,
                "link_action_name": links[0]['href'] if len(links) > 0 else '',
                "link_action_text": links[0].get_text(strip=True) if len(links) > 0 else '',
                "channel_link": links[1]['href'] if len(links) > 1 else '',
                "channel_name": links[1].get_text(strip=True) if len(links) > 1 else '',
                "link3": links[2]['href'] if len(links) > 2 else '',
                "link3_text": links[2].get_text(strip=True) if len(links) > 2 else '',
            }
            entries.append(entry)
    
    return entries

def test(file_path):
    """
    Main function to process an HTML file, extract data, and write to a CSV file using multiprocessing.
    Dynamically adjusts the number of processes based on the file size.
    
    Args:
    - file_path (str): Path to the HTML file.
    """
    # Determine file size
    file_size = os.path.getsize(file_path)
    chunk_size = 524288  # 512 KB
    num_chunks = (file_size // chunk_size) + 1 
    
    if num_chunks < 4:
        num_processes = num_chunks
    else:
        num_processes = 4
    
    # Read chunks and process them
    chunks = get_html_contents(file_path, chunk_size=chunk_size)
    with Pool(processes=num_processes) as pool:
        result_iter = pool.imap(process_chunk, chunks)
        results = list(itertools.chain.from_iterable(result_iter))
    
    # Convert results to a DataFrame and save to CSV
    df = pd.DataFrame(results)
    df.to_csv('output.csv', index=False)
    return df



In [105]:
df = test(FILE_DRIVE_ACTIVITY_LOG_PATH)
print(len(df))
value_counts = df["channel_name"].value_counts().sort_values(ascending=False)    
value_counts.head(50)


4


channel_name
    4
Name: count, dtype: int64