# In [1]:
import os
import re
from collections import Counter
from typing import List, Tuple

import pandas as pd

def q3_time(file_path: str, top_n: int = 10) -> List[Tuple[str, int]]:
    """Return the most frequently mentioned usernames in a tweet dataset.

    Walks ``file_path`` recursively, loads every ``*.parquet`` file located
    directly inside a directory whose name contains ``day=``, and counts the
    ``@username`` mentions found in the ``content`` column.

    Args:
        file_path: Root directory of the partitioned dataset.
        top_n: Maximum number of usernames to return. Defaults to 10.

    Returns:
        Up to ``top_n`` ``(username, count)`` tuples ordered from most to
        least mentioned. Usernames with equal counts keep the order in which
        they were first encountered (files are read in sorted path order).

    Raises:
        ValueError: If no parquet file is found, or if any file fails to load.
        KeyError: If the loaded data has no ``content`` column.
    """
    all_files = []

    for root, _dirs, files in os.walk(file_path):
        if 'day=' in os.path.basename(root):
            all_files.extend(
                os.path.join(root, name)
                for name in files
                if name.endswith('.parquet')
            )

    if not all_files:
        raise ValueError("No parquet files found in the specified directory.")

    # os.walk yields entries in arbitrary order; sorting makes the row order
    # (and therefore tie-breaking between equal counts) reproducible.
    all_files.sort()

    try:
        df = pd.concat(
            [pd.read_parquet(path) for path in all_files],
            ignore_index=True,
        )
    except Exception as e:
        # Keep ValueError for callers, but chain the cause so the original
        # traceback is not lost.
        raise ValueError(f"Error reading parquet files: {e}") from e

    if 'content' not in df.columns:
        raise KeyError("'content' column not found in the DataFrame.")

    # NOTE: matches any '@' followed by word characters, so the domain part
    # of an e-mail address ("a@b.com" -> "b") is counted as a mention too.
    mention_pattern = re.compile(r'@(\w+)')

    mention_counts: Counter = Counter()
    for text in df['content']:
        # Null / non-string cells (None, NaN) carry no mentions; passing them
        # to findall() would raise TypeError.
        if isinstance(text, str):
            mention_counts.update(mention_pattern.findall(text))

    return mention_counts.most_common(top_n)

# Run the analysis only when executed as a script / notebook cell, so that
# importing this module does not trigger a full dataset scan.
if __name__ == "__main__":
    # Root directory of the day-partitioned tweet dataset (local path).
    file_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'
    result = q3_time(file_path)
    print(result)


# Output: [('narendramodi', 2261), ('Kisanektamorcha', 1836), ('RakeshTikaitBKU', 1639), ('PMOIndia', 1422), ('RahulGandhi', 1125), ('GretaThunberg', 1046), ('RaviSinghKA', 1015), ('rihanna', 972), ('UNHumanRights', 962), ('meenaharris', 925)]
