In [88]:
import os
import pandas as pd
from typing import List, Tuple
from datetime import datetime

def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    all_files = []
    
    # Recursively traverse directories
    for root, dirs, files in os.walk(file_path):
        # Check if it's a day directory
        if 'day=' in os.path.basename(root):
            # Filter parquet files and skip .DS_Store
            parquet_files = [os.path.join(root, file) for file in files if file.endswith('.parquet') and file != '.DS_Store']
            if parquet_files:
                all_files.extend(parquet_files)
    
    if not all_files:
        raise ValueError("No parquet files found in the specified directory.")

    # Read and concatenate all parquet files
    try:
        df = pd.concat([pd.read_parquet(file) for file in all_files], ignore_index=True)
    except Exception as e:
        raise ValueError(f"Error reading parquet files: {e}")

    # Ensure 'date' and 'user' columns exist
    if 'date' not in df.columns or 'user' not in df.columns:
        raise KeyError("'date' or 'user' column not found in the DataFrame.")

    # Convert date column to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Group by date and count tweets
    grouped = df.groupby('date').size().reset_index(name='tweet_count')

    # Find top 10 dates
    top_dates = grouped.sort_values(by='tweet_count', ascending=False).head(10)

    # Find user with most tweets for each top date
    result = []
    # Find user with most tweets for each top date
    for _, row in top_dates.iterrows():
        date = row['date']
        daily_df = df[df['date'] == date]
        top_user = daily_df['user'].value_counts().idxmax()
        # Retrieve the user_info dictionary for the top user
        user_info = daily_df[daily_df['user'] == top_user]['user'].iloc[0]  # Assuming 'user' column contains the user details
        
        # Extract the username from the nested user_info structure
        username = user_info['username']
        
        # Append the date and only the username
        result.append((date.date(), username))

    return result

# Example usage
file_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'
result = q1_time(file_path)
print(result)


[(datetime.date(2021, 2, 20), 'SivaKum66642898'), (datetime.date(2021, 2, 17), 'RanjeetSinghMK'), (datetime.date(2021, 2, 23), 'Cuttack_IYC'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 15), 'bot_shiv'), (datetime.date(2021, 2, 17), 'Gurchar49439958'), (datetime.date(2021, 2, 23), 'Preetm91'), (datetime.date(2021, 2, 16), 'Monica_Gill1'), (datetime.date(2021, 2, 24), 'NavNarinder'), (datetime.date(2021, 2, 17), 'Monica_Gill1')]


In [86]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

df = pd.read_json('/Users/juanignaciomagarinoscastro/Downloads/farmers-protest-tweets-2021-2-4.json', lines=True)

df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month.apply(lambda x: f'{x:02}')
df['day'] = df['date'].dt.day.apply(lambda x: f'{x:02}')

chunk_size = 5000

output_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'

date_col = 'date'  
user_col = 'user'  

if date_col not in df.columns:
    raise KeyError(f"'{date_col}' column not found in the DataFrame.")
    
if user_col not in df.columns:
    raise KeyError(f"'{user_col}' column not found in the DataFrame.")

df[date_col] = pd.to_datetime(df[date_col])

grouped = df.groupby(date_col).size().reset_index(name='tweet_count')

top_dates = grouped.sort_values(by='tweet_count', ascending=False).head(10)

result = []
for _, row in top_dates.iterrows():
    date = row[date_col]
    daily_df = df[df[date_col] == date]

    if user_col in daily_df.columns:
        daily_df['username'] = daily_df[user_col].apply(
            lambda x: x.get('username') if isinstance(x, dict) else None
            )
        top_user = daily_df['username'].value_counts().idxmax()
        result.append((date.date(), top_user))
    else:
        raise KeyError(f"'{user_col}' column not found in the DataFrame.")

print(result)
    

[(datetime.date(2021, 2, 20), 'SivaKum66642898'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 23), 'Cuttack_IYC'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 17), 'ajityadavdu'), (datetime.date(2021, 2, 23), 'Preetm91'), (datetime.date(2021, 2, 16), 'Monica_Gill1'), (datetime.date(2021, 2, 24), 'NavNarinder'), (datetime.date(2021, 2, 17), 'Monica_Gill1')]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_df['username'] = daily_df[user_col].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_df['username'] = daily_df[user_col].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_df['username'] = daily_df[user_col].apply(
A value is trying to be set on a copy of a slice from a Da

In [85]:
print(result)

[(datetime.date(2021, 2, 20), 'SivaKum66642898'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 23), 'Cuttack_IYC'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 17), 'ajityadavdu'), (datetime.date(2021, 2, 23), 'Preetm91'), (datetime.date(2021, 2, 16), 'Monica_Gill1'), (datetime.date(2021, 2, 24), 'NavNarinder'), (datetime.date(2021, 2, 17), 'Monica_Gill1')]
