In [91]:
import os
import pandas as pd
from typing import List, Tuple
from datetime import datetime

def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    all_files = []
    
    for root, dirs, files in os.walk(file_path):
        if 'day=' in os.path.basename(root):
            parquet_files = [os.path.join(root, file) for file in files if file.endswith('.parquet') and file != '.DS_Store']
            if parquet_files:
                all_files.extend(parquet_files)
    
    if not all_files:
        raise ValueError("No parquet files found in the specified directory.")

    try:
        df = pd.concat([pd.read_parquet(file) for file in all_files], ignore_index=True)
    except Exception as e:
        raise ValueError(f"Error reading parquet files: {e}")

    if 'date' not in df.columns or 'user' not in df.columns:
        raise KeyError("'date' or 'user' column not found in the DataFrame.")

    df['date'] = pd.to_datetime(df['date'])

    grouped = df.groupby('date').size().reset_index(name='tweet_count')

    top_dates = grouped.sort_values(by='tweet_count', ascending=False).head(10)

    result = []
    for _, row in top_dates.iterrows():
        date = row['date']
        daily_df = df[df['date'] == date]
        top_user = daily_df['user'].value_counts().idxmax()
        user_info = daily_df[daily_df['user'] == top_user]['user'].iloc[0] 
        
        username = user_info['username']
        
        result.append((date.date(), username))

    return result

file_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'
result = q1_time(file_path)
print(result)


[(datetime.date(2021, 2, 20), 'SivaKum66642898'), (datetime.date(2021, 2, 17), 'RanjeetSinghMK'), (datetime.date(2021, 2, 23), 'Cuttack_IYC'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 15), 'bot_shiv'), (datetime.date(2021, 2, 17), 'Gurchar49439958'), (datetime.date(2021, 2, 23), 'Preetm91'), (datetime.date(2021, 2, 16), 'Monica_Gill1'), (datetime.date(2021, 2, 24), 'NavNarinder'), (datetime.date(2021, 2, 17), 'Monica_Gill1')]


In [93]:
import os
import pandas as pd
from typing import List, Tuple
from datetime import datetime

def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    all_files = []
    
    # Walk through the directories to gather Parquet files
    for root, dirs, files in os.walk(file_path):
        if 'day=' in os.path.basename(root):
            parquet_files = [os.path.join(root, file) for file in files if file.endswith('.parquet') and file != '.DS_Store']
            if parquet_files:
                all_files.extend(parquet_files)
    
    if not all_files:
        raise ValueError("No parquet files found in the specified directory.")

    try:
        df = pd.concat([pd.read_parquet(file) for file in all_files], ignore_index=True)
    except Exception as e:
        raise ValueError(f"Error reading parquet files: {e}")

    # Check for required columns
    if 'date' not in df.columns or 'user' not in df.columns:
        raise KeyError("'date' or 'user' column not found in the DataFrame.")

    # Convert date column to datetime and ensure it's in the correct format
    df['date'] = pd.to_datetime(df['date'])

    # Extract the username from the 'user' field if it's a dictionary
    user_col = 'user'
    if isinstance(df[user_col].iloc[0], dict):
        df['username'] = df[user_col].apply(lambda x: x.get('username', '') if isinstance(x, dict) else x)
    else:
        df['username'] = df[user_col]  # If 'user' is already a string, just use it

    # Group by date and username, and count the occurrences
    user_tweet_count = df.groupby(['date', 'username']).size().reset_index(name='tweet_count')

    # Find the top 10 dates with the most tweets
    date_tweet_count = df.groupby('date').size().reset_index(name='tweet_count')
    top_dates = date_tweet_count.sort_values(by='tweet_count', ascending=False).head(10)

    result = []
    for _, row in top_dates.iterrows():
        date = row['date']
        # Filter the DataFrame for the current date
        daily_df = user_tweet_count[user_tweet_count['date'] == date]
        
        # Find the user with the most tweets for the current date
        top_user_row = daily_df.sort_values(by='tweet_count', ascending=False).iloc[0]
        top_user = top_user_row['username']

        # Append the result as (date, top_user)
        result.append((date.date(), top_user))

    return result

# Test the function
file_path = '/Users/juanignaciomagarinoscastro/Downloads/tweets_by_date'
result = q1_time(file_path)
print(result)


[(datetime.date(2021, 2, 20), 'SivaKum66642898'), (datetime.date(2021, 2, 17), 'RaaJVinderkaur'), (datetime.date(2021, 2, 23), 'AlamGahir'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 15), 'ajityadavdu'), (datetime.date(2021, 2, 17), 'ajityadavdu'), (datetime.date(2021, 2, 23), 'Preetm91'), (datetime.date(2021, 2, 16), 'Monica_Gill1'), (datetime.date(2021, 2, 24), 'NavNarinder'), (datetime.date(2021, 2, 17), 'Monica_Gill1')]


In [1]:
from typing import List, Tuple
from datetime import datetime
import pandas as pd
import json

def q1_time(file_path: str) -> List[Tuple[datetime.date, str, int]]:
    with open(file_path, 'r') as json_file:
        #Cargar archivo json linea a linea
        data = [json.loads(line.strip()) for line in json_file]
    #Usar una list comprenhension para extraer los campos
    data = [(item['date'], item['user']['username'], item['id']) for item in data if 'date' in item and 'user' in item and 'id' in item and 'id' in item['user']]

    #Convertir la lista de tuplas en un dataframe de pandas
    df = pd.DataFrame(data, columns=['date', 'user', 'id'])
    
    # Convertir campo date en formato datetime
    df['date'] = pd.to_datetime(df['date']).dt.date

    tweet_counts = df.groupby('date').size()
    top_10_dates = tweet_counts.nlargest(10).index
    df_top_10 = df[df['date'].isin(top_10_dates)]
    top_users = df_top_10.groupby('date')['user'].agg(lambda x: x.value_counts().index[0])

    # Convertir el resulatado en una lista de tuplas
    result = [(date, user) for date, user in zip(top_10_dates, top_users)]
    
    return result

file_path='/Users/juanignaciomagarinoscastro/Downloads/farmers-protest-tweets-2021-2-4.json'
q1_time(file_path)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'rebelpacifist'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'jot__b'),
 (datetime.date(2021, 2, 18), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 15), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 20), 'Preetm91'),
 (datetime.date(2021, 2, 23), 'MangalJ23056160'),
 (datetime.date(2021, 2, 19), 'Surrypuria')]

In [3]:
from typing import List, Tuple
from datetime import datetime
import pandas as pd
import json

def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    # Leer el archivo JSON línea por línea y procesarlo
    data = []
    with open(file_path, 'r') as json_file:
        for line in json_file:
            try:
                item = json.loads(line.strip())
                # Verificar que los campos necesarios están presentes antes de agregar
                if 'date' in item and 'user' in item and 'id' in item:
                    data.append((item['date'], item['user']['username']))
            except json.JSONDecodeError:
                # Manejar errores de decodificación JSON
                continue
    
    # Convertir la lista de tuplas en un DataFrame de pandas
    df = pd.DataFrame(data, columns=['date', 'user'])
    
    # Convertir el campo 'date' a formato datetime directamente sin conversiones intermedias
    df['date'] = pd.to_datetime(df['date']).dt.date

    # Calcular la cantidad de tweets por fecha y encontrar las 10 fechas con más tweets
    tweet_counts = df['date'].value_counts().nlargest(10)
    top_10_dates = tweet_counts.index

    # Filtrar el DataFrame para incluir solo las fechas top 10
    df_top_10 = df[df['date'].isin(top_10_dates)]
    
    # Encontrar el usuario con más tweets para cada una de las fechas top 10
    top_users = df_top_10.groupby('date')['user'].agg(lambda x: x.value_counts().idxmax())

    # Convertir el resultado a una lista de tuplas con (fecha, usuario)
    result = [(date, user) for date, user in zip(top_10_dates, top_users)]
    
    return result

# Ruta del archivo JSON a procesar
file_path = '/Users/juanignaciomagarinoscastro/Downloads/farmers-protest-tweets-2021-2-4.json'
q1_time(file_path)


[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'rebelpacifist'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'jot__b'),
 (datetime.date(2021, 2, 18), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 15), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 20), 'Preetm91'),
 (datetime.date(2021, 2, 23), 'MangalJ23056160'),
 (datetime.date(2021, 2, 19), 'Surrypuria')]