In [8]:
import os
import pandas as pd
from typing import List, Tuple
import re
import emoji

# Candidate matcher: one or more consecutive characters drawn from the
# code-point ranges below.  It is deliberately loose -- the ranges overlap and
# also cover characters that are not emojis -- so its matches are only
# *potential* emojis and still need to be validated by the caller.
# NOTE: the skin-tone modifiers U+1F3FB..U+1F3FF lie inside the
# U+1F300..U+1F5FF range, so this pattern DOES match them; they have to be
# removed separately (see ``emoji_modifiers``).
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # U+1F600..U+1F64F  emoticons
    "\U0001F300-\U0001F5FF"  # U+1F300..U+1F5FF  symbols & pictographs
    "\U0001F680-\U0001F6FF"  # U+1F680..U+1F6FF  transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # U+1F1E0..U+1F1FF  regional indicators (flags)
    "\U00002702-\U000027B0"  # U+2702..U+27B0    dingbats
    # U+24C2..U+1F251: labelled "enclosed characters", but the span is far
    # wider than that -- it contains the dingbats range above as well as
    # e.g. the CJK ideographs.
    "\U000024C2-\U0001F251"
    "\U0001F926-\U0001F937"  # U+1F926..U+1F937  supplemental symbols
    "]+",
    flags=re.UNICODE,
)

# Skin-tone modifier characters (U+1F3FB..U+1F3FF).  They are not emojis on
# their own and are excluded when individual characters are counted.
emoji_modifiers = {
    '🏻',
    '🏼',
    '🏽',
    '🏾',
    '🏿',
}

def q2_time(file_path: str) -> List[Tuple[str, int]]:
    """Return the ten most frequent emojis in the tweets stored under *file_path*.

    The directory tree is walked recursively.  Every ``*.parquet`` file located
    directly inside a directory whose name contains ``day=`` is loaded, and all
    files are concatenated into one DataFrame.  Each string in the ``content``
    column is scanned with ``emoji_pattern``; the matches are split into single
    characters, and a character is counted only if it is not in
    ``emoji_modifiers`` and ``emoji.is_emoji`` accepts it.

    Because matches are split per character, emojis made of several code
    points (ZWJ sequences, flags, ...) are counted by their individual
    components rather than as one symbol.

    Args:
        file_path: Root directory to search for parquet files.

    Returns:
        At most ten ``(emoji, count)`` tuples ordered by descending count, or
        an empty list when no emoji is found.

    Raises:
        ValueError: If no parquet file is found, or if reading/concatenating
            the files fails (the underlying error is chained as the cause).
        KeyError: If the loaded data has no ``content`` column.
    """
    all_files = []

    # Recursively traverse directories, keeping only the "day=" partitions.
    for root, _dirs, files in os.walk(file_path):
        if 'day=' not in os.path.basename(root):
            continue
        all_files.extend(
            os.path.join(root, name) for name in files if name.endswith('.parquet')
        )

    if not all_files:
        raise ValueError("No parquet files found in the specified directory.")

    try:
        df = pd.concat([pd.read_parquet(file) for file in all_files], ignore_index=True)
    except Exception as e:
        # Chain the original exception so its traceback is not lost.
        raise ValueError(f"Error reading parquet files: {e}") from e

    # Ensure 'content' column exists
    if 'content' not in df.columns:
        raise KeyError("'content' column not found in the DataFrame.")

    # Collect every valid emoji character, in row order.
    emoji_chars = []
    for text in df['content']:
        # Missing values (None / NaN / NA) or other non-string entries would
        # make ``findall`` raise TypeError; they cannot contain emojis anyway.
        if not isinstance(text, str):
            continue
        for run in emoji_pattern.findall(text):
            emoji_chars.extend(
                ch for ch in run
                if ch not in emoji_modifiers and emoji.is_emoji(ch)
            )

    if not emoji_chars:
        return []

    # value_counts() sorts by descending frequency, so the head is the top 10.
    counts = pd.Series(emoji_chars, dtype=object).value_counts()
    return [(symbol, int(count)) for symbol, count in counts.head(10).items()]

# Example usage.  Guarded so that merely importing this module does not start
# a full scan of the dataset; when run as a script or as a notebook cell
# (where __name__ == "__main__") it behaves exactly as before.
if __name__ == "__main__":
    # NOTE: machine-specific absolute path -- point it at the local dataset root.
    file_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'
    result = q2_time(file_path)
    print(result)


[('🙏', 7286), ('😂', 3072), ('🚜', 2972), ('✊', 2411), ('🌾', 2363), ('❤', 1779), ('👇', 1108), ('💚', 1040), ('💪', 947), ('👍', 834)]
