En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

In [3]:
from typing import List, Tuple
from datetime import datetime
from collections import defaultdict, Counter
import json
import memory_profiler
import line_profiler

In [27]:
file_path = "farmers-protest-tweets-2021-2-4.json"

# This functions will read the json file, iterate each line to get date and username and find the top 10 dates with its top user

This function parses the JSON file once and iterates over each line and uses separate dictionaries for storing tweet counts

In [37]:
def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
    # Open dictionaries to store tweet and user tweet counts
    tweet_count = {}
    user_tweets = {}

    # Open the JSON file to iterate and parse each line
    with open(file_path, 'r') as f:
        for line in f:
            tweet = json.loads(line)      
                  
            # Extract the tweet date and username
            tweet_date = datetime.fromisoformat(tweet['date']).date()
            user = tweet['user']['username']

            # Increment the tweet count for the tweet date
            tweet_count[tweet_date] = tweet_count.get(tweet_date, 0) + 1
            
            # Increment the user tweet count for the tweet date and user
            user_tweets[(tweet_date, user)] = user_tweets.get((tweet_date, user), 0) + 1

    # Find the top 10 dates with the most tweets
    top_dates = sorted(tweet_count.items(), key=lambda x: x[1], reverse=True)[:10]
    
    # Find the user with the most tweets for each of the top dates
    result = [(date, max((user for (d, user), count in user_tweets.items() if d == date), key=user_tweets.get)) for date, _ in top_dates]
    return result

# Test
q1_time(file_path)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

In [38]:
%memit -r 1 q1_time(file_path)
%lprun -f q1_time q1_time(file_path)

peak memory: 957.99 MiB, increment: 411.00 MiB


Timer unit: 1e-07 s

Total time: 4.71895 s
File: C:\Users\julio\AppData\Local\Temp\ipykernel_12976\3299383397.py
Function: q1_time at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q1_time(file_path: str) -> List[Tuple[datetime.date, str]]:
     2         2       1484.0    742.0      0.0      with open(file_path, 'r') as f:
     3         1   13825196.0    1e+07     29.3          data = f.readlines()
     4                                           
     5         1          7.0      7.0      0.0      tweet_count = {}
     6         1          8.0      8.0      0.0      user_tweets = {}
     7                                           
     8    117408     408256.0      3.5      0.9      for line in data:
     9    117407   29684550.0    252.8     62.9          tweet = json.loads(line)
    10    117407     901936.0      7.7      1.9          tweet_date = datetime.fromisoformat(tweet['date']).date()
    11    1

This function optimizes memory usage by using a more memory-efficient data structure and avoiding the creation of unnecessary intermediate data structures

In [36]:
def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
    # Open a defaultdict to store tweet counts for each date and user
    date_user_count = defaultdict(lambda: defaultdict(int))

    # Open the JSON file and iterate and parse each line
    with open(file_path, 'r') as file:
        for line in file:
            tweet = json.loads(line)
            
            # Extract the tweet date and username
            tweet_date = datetime.strptime(tweet['date'], '%Y-%m-%dT%H:%M:%S+00:00').date()
            username = tweet['user']['username']
            
            # Increment the count of tweets for the tweet date and username
            date_user_count[tweet_date][username] += 1

    # Find the top 10 dates with the most tweets
    top_10_dates = sorted(date_user_count.items(), key=lambda x: sum(x[1].values()), reverse=True)[:10]
    
    # Open an empty list to store the result
    result = []
    
    # Iterate over the top 10 dates and their corresponding user counts
    for date, users in top_10_dates:
        # Find the user with the maximum tweet count for the current date and append the date and the user with the maximum tweet count to the result list
        max_user = max(users, key=users.get)
        result.append((date, max_user))
    return result

# Test
q1_memory(file_path)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

In [39]:
%memit -r 1 q1_memory(file_path)
%lprun -f q1_memory q1_memory(file_path)

peak memory: 550.82 MiB, increment: 1.61 MiB


Timer unit: 1e-07 s

Total time: 8.8611 s
File: C:\Users\julio\AppData\Local\Temp\ipykernel_12976\1097590013.py
Function: q1_memory at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q1_memory(file_path: str) -> List[Tuple[datetime.date, str]]:
     2         1         21.0     21.0      0.0      date_user_count = defaultdict(lambda: defaultdict(int))
     3                                           
     4         2       1992.0    996.0      0.0      with open(file_path, 'r') as file:
     5    117408   14626567.0    124.6     16.5          for line in file:
     6    117407   32447833.0    276.4     36.6              tweet = json.loads(line)
     7    117407   39986103.0    340.6     45.1              tweet_date = datetime.strptime(tweet['date'], '%Y-%m-%dT%H:%M:%S+00:00').date()
     8    117407     446358.0      3.8      0.5              username = tweet['user']['username']
     9    117407    1064809.0   

# Here we'll read the json file and use a helper function to find the emojis from the content and then iterate to find the most commo ones

Here we will create a function to extract emojis from text uing unicode ranges, and then used counter object to find the 10 most used

In [60]:
def extract_emojis(text: str) -> List[str]:
    emojis = []
    for char in text:
        if 0x1F600 <= ord(char) <= 0x1F64F or \
           0x1F300 <= ord(char) <= 0x1F5FF or \
           0x1F680 <= ord(char) <= 0x1F6FF or \
           0x2600 <= ord(char) <= 0x26FF or \
           0x2700 <= ord(char) <= 0x27BF or \
           0xFE00 <= ord(char) <= 0xFE0F or \
           0x1F900 <= ord(char) <= 0x1F9FF:
            emojis.append(char)
    return emojis

In this function all lines from the JSON file are read into memory at once using file.readline

In [69]:
def q2_time(file_path: str) -> List[Tuple[str, int]]:
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read all lines from the file into memory
        all_content = file.readlines()
    # Create a Counter 
    emojis_counter = Counter()
    # Process and load each line of JSON data
    for line in all_content:
        json_data = json.loads(line.strip())
        # Extract content from JSON data
        content = json_data.get('content', '')
        # Extract emojis from content
        emojis = extract_emojis(content)
        # Update the Counter with the extracted emojis
        emojis_counter.update(emojis)
    # Return the top 10 most common emojis
    return emojis_counter.most_common(10)


# Test
q2_time(file_path)

[('🙏', 7286),
 ('😂', 3072),
 ('️', 3061),
 ('🚜', 2972),
 ('✊', 2411),
 ('🌾', 2363),
 ('🏻', 2080),
 ('❤', 1779),
 ('🤣', 1668),
 ('🏽', 1218)]

In [70]:
%memit -r 1 q2_time(file_path)
%lprun -f q2_time q2_time(file_path)

peak memory: 522.01 MiB, increment: 409.02 MiB


Timer unit: 1e-07 s

Total time: 41.006 s
File: C:\Users\julio\AppData\Local\Temp\ipykernel_12976\3743776943.py
Function: q2_time at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q2_time(file_path: str) -> List[Tuple[str, int]]:
     2         2       1406.0    703.0      0.0      with open(file_path, 'r', encoding='utf-8') as file:
     3         1    9346819.0    9e+06      2.3          all_content = file.readlines() 
     4         1        163.0    163.0      0.0      emojis_counter = Counter()
     5    117408     537739.0      4.6      0.1      for line in all_content:
     6    117407   36636093.0    312.0      8.9          json_data = json.loads(line.strip())
     7    117407     601205.0      5.1      0.1          content = json_data.get('content', '')
     8    117407  359193115.0   3059.4     87.6          emojis = extract_emojis(content)
     9    117407    3739595.0     31.9      0.9          emo

In this function, JSON data is processed line by line without loading the entire file into memory at once

In [68]:
def q2_memory(file_path: str) -> List[Tuple[str, int]]:
    # Create a Counter to count occurrences of emojis
    emojis_counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        # Process and load each line of JSON data
        for line in file:
            json_data = json.loads(line.strip())
            # Extract content from JSON data
            content = json_data.get('content', '')
            # Extract emojis from content
            emojis = extract_emojis(content)
            # Update the Counter with the extracted emojis
            emojis_counter.update(emojis)
    # Return the top 10 most common emojis
    return emojis_counter.most_common(10)

# Test
q2_memory(file_path)
    

[('🙏', 7286),
 ('😂', 3072),
 ('️', 3061),
 ('🚜', 2972),
 ('✊', 2411),
 ('🌾', 2363),
 ('🏻', 2080),
 ('❤', 1779),
 ('🤣', 1668),
 ('🏽', 1218)]

In [71]:
%memit -r 1 q2_memory(file_path)
%lprun -f q2_memory q2_memory(file_path)

peak memory: 113.50 MiB, increment: 0.40 MiB


Timer unit: 1e-07 s

Total time: 42.1141 s
File: C:\Users\julio\AppData\Local\Temp\ipykernel_12976\1918972568.py
Function: q2_memory at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q2_memory(file_path: str) -> List[Tuple[str, int]]:
     2         1         84.0     84.0      0.0      emojis_counter = Counter()
     3         2       1822.0    911.0      0.0      with open(file_path, 'r', encoding='utf-8') as file:
     4    117408   12698783.0    108.2      3.0          for line in file:
     5    117407   38472255.0    327.7      9.1              json_data = json.loads(line.strip())
     6    117407     640805.0      5.5      0.2              content = json_data.get('content', '')
     7    117407  365105141.0   3109.7     86.7              emojis = extract_emojis(content)
     8    117407    4218375.0     35.9      1.0              emojis_counter.update(emojis)
     9                                      

In [None]:
def q3_memory(file_path: str) -> List[Tuple[str, int]]:
    # Diccionario para almacenar el conteo de menciones para cada usuario
    mention_counts = {}
    
    # Abrir el archivo JSON de tweets
    with open(file_path, 'r', encoding='utf-8') as file:
        # Leer cada línea del archivo
        for line in file:
            # Parsear los datos JSON de la línea
            tweet_data = json.loads(line)
            
            # Extraer el nombre de usuario del tweet
            username = tweet_data['user']['username']
            
            # Contar el número de menciones (@) en el contenido del tweet
            mention_count = tweet_data['content'].count('@')
            
            # Actualizar el conteo de menciones para el usuario en el diccionario
            if username in mention_counts:
                mention_counts[username] += mention_count
            else:
                mention_counts[username] = mention_count
    
    # Seleccionar los top 10 usuarios más influyentes basados en el conteo de menciones
    top_users = sorted(mention_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    
    # Retornar el resultado
    return top_users

In [None]:
def q3_time(file_path: str) -> List[Tuple[str, int]]:
    # Diccionario para almacenar el conteo de menciones para cada usuario
    mention_counts = Counter()
    
    # Abrir el archivo JSON de tweets
    with open(file_path, 'r', encoding='utf-8') as file:
        # Leer cada línea del archivo
        for line in file:
            # Parsear los datos JSON de la línea
            tweet_data = json.loads(line)
            
            # Extraer el nombre de usuario del tweet
            username = tweet_data['user']['username']
            
            # Contar el número de menciones (@) en el contenido del tweet y actualizar el conteo
            mention_count = tweet_data['content'].count('@')
            mention_counts[username] += mention_count
    
    # Seleccionar los top 10 usuarios más influyentes basados en el conteo de menciones
    top_users = mention_counts.most_common(10)
    
    # Retornar el resultado
    return top_users