# Carga Profilers
Se cargan los profilers y se crea una magic function que permite escribir y correr las definiciones de las funciones para facilitar las mediciones con los perfiladores cargados en memoria.

In [1]:
%load_ext memory_profiler

In [2]:
%load_ext line_profiler

In [31]:
from IPython.core.magic import register_cell_magic
from typing import *
import datetime

@register_cell_magic
def write_and_run(line, cell):
    argz = line.split()
    file = argz[-1]
    mode = 'w'
    if len(argz) == 2 and argz[0] == '-a':
        mode = 'a'
    with open(file, mode) as f:
        f.write(cell)
    get_ipython().run_cell(cell)

In [32]:
%%write_and_run test_func.py
def testing_func():
    x=[]
    for i in range (2):
        x+=[i]

In [33]:
 
%lprun -f testing_func testing_func()
from test_func import testing_func
%mprun -f testing_func testing_func()




Filename: c:\Users\Juantc93\DEV\latam_de_challenge\test_func.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     1    146.6 MiB    146.6 MiB           1   def testing_func():
     2    146.6 MiB      0.0 MiB           1       x=[]
     3    146.6 MiB      0.0 MiB           3       for i in range (2):
     4    146.6 MiB      0.0 MiB           2           x+=[i]

## Constants

In [34]:
FILE_PATH="tweets.json.zip"

## Estrategia de solución para optimización en memoria

En general la estrategía de optimización consitió en utilizar librerías de más bajo nivel para la lectura de los datos serializados, haciendo uso de tan solo los estrictamente necesarios para la optimización memoria.

## Estrategia de solución para optimización en tiempo
Para la optimización en tiempo de ejecución se utilizaron básicamente métodos estándar de la librería pandas.

## Twiteros del día

In [42]:
%%write_and_run q1_time.py
def q1_time(file_path):
    import pandas as pd
    df = pd.read_json(file_path, lines=True)\
        .assign(date=lambda x: x.date.dt.date,
                username=lambda x: x.user.apply(lambda y: y.get("username")))

    df_top_ten_date=df["date"]\
        .value_counts()\
        .nlargest(10)\
        .rename("twits")\
        .rename_axis("date")


    return df.loc[:,["date","username"]]\
        .set_index("date")\
        .join(df_top_ten_date,how="inner")\
        .set_index("username",append=True)\
        .groupby(["date","username","twits"])\
        .agg(twits_user=pd.NamedAgg(column="twits",aggfunc="count"))\
        .assign(
            rank=lambda x: x.groupby(["date","twits"])["twits_user"].rank(axis='index',method="first",ascending=False))\
        .query("rank==1")\
        .sort_index(level="twits",ascending=False)\
        .reset_index()\
        .loc[:,["date","username"]]\
        .to_records(index=False).tolist()



In [43]:
q1_time(FILE_PATH)

[(datetime.date(2021, 2, 12), 'RanbirS00614606'),
 (datetime.date(2021, 2, 13), 'MaanDee08215437'),
 (datetime.date(2021, 2, 17), 'RaaJVinderkaur'),
 (datetime.date(2021, 2, 16), 'jot__b'),
 (datetime.date(2021, 2, 14), 'rebelpacifist'),
 (datetime.date(2021, 2, 18), 'neetuanjle_nitu'),
 (datetime.date(2021, 2, 15), 'jot__b'),
 (datetime.date(2021, 2, 20), 'MangalJ23056160'),
 (datetime.date(2021, 2, 23), 'Surrypuria'),
 (datetime.date(2021, 2, 19), 'Preetm91')]

In [44]:
%lprun -f q1_time q1_time(FILE_PATH)

Timer unit: 1e-07 s

Total time: 23.4773 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\2266987483.py
Function: q1_time at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q1_time(file_path):
     2         1         46.0     46.0      0.0      import pandas as pd
     3         2  226155768.0    1e+08     96.3      df = pd.read_json(file_path, lines=True)\
     4         1         39.0     39.0      0.0          .assign(date=lambda x: x.date.dt.date,
     5         1         16.0     16.0      0.0                  username=lambda x: x.user.apply(lambda y: y.get("username")))
     6                                           
     7         4     294062.0  73515.5      0.1      df_top_ten_date=df["date"]\
     8                                                   .value_counts()\
     9         1         11.0     11.0      0.0          .nlargest(10)\
    10         1         13.0     13.0      0.0    

In [45]:

from q1_time import q1_time

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q1_time q1_time(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn



Filename: c:\Users\Juantc93\DEV\latam_de_challenge\q1_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     1    172.0 MiB    172.0 MiB           1   def q1_time(file_path):
     2    172.0 MiB      0.0 MiB           1       import pandas as pd
     3   1967.6 MiB   1773.2 MiB           2       df = pd.read_json(file_path, lines=True)\
     4   1988.2 MiB     20.6 MiB           3           .assign(date=lambda x: x.date.dt.date,
     5   1988.2 MiB     -1.8 MiB      234817                   username=lambda x: x.user.apply(lambda y: y.get("username")))
     6                                         
     7   1965.8 MiB     -1.8 MiB           4       df_top_ten_date=df["date"]\
     8                                                 .value_counts()\
     9   1965.8 MiB      0.0 MiB           1           .nlargest(10)\
    10   1965.8 MiB      0.0 MiB           1           .rename("twits")\
    11   1965.8 MiB      0.0 MiB           1           .rename_axis("date")
   

In [92]:
%%write_and_run q1_memory.py
def q1_memory(file_path):
    import ujson
    import zipfile
    from datetime import datetime as dt
    from collections import defaultdict

    date_count=defaultdict(int)
    user_date_count=defaultdict(lambda: defaultdict(int))
    with zipfile.ZipFile(file_path, 'r') as f:
        with f.open(f.filelist[0]) as g:
            for line in g:
                tweet = ujson.loads(line)
                date_count[tweet['date'].split("T")[0]] += 1
                user_date_count[tweet['date'].split("T")[0]][tweet['user']['username']] += 1


    
    return [ 
        (dt.strptime(key,"%Y-%m-%d").date(),max(user_date_count[key],key=user_date_count[key].get))
        for key in sorted(date_count, key=date_count.get,reverse=True)[0:10]
        ]
    

In [91]:
%lprun -f q1_memory q1_memory(FILE_PATH)

Timer unit: 1e-07 s

Total time: 34.2094 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\707789264.py
Function: q1_memory at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q1_memory(file_path):
     2         1         42.0     42.0      0.0      import ujson
     3         1         25.0     25.0      0.0      import zipfile
     4         1         68.0     68.0      0.0      from datetime import datetime as dt
     5         1        147.0    147.0      0.0      from collections import defaultdict
     6                                           
     7         1         39.0     39.0      0.0      date_count=defaultdict(int)
     8         1         21.0     21.0      0.0      user_date_count=defaultdict(lambda: defaultdict(int))
     9         1       7784.0   7784.0      0.0      with zipfile.ZipFile(file_path, 'r') as f:
    10         1       3017.0   3017.0      0.0          with f.open(f

In [93]:

from q1_memory import q1_memory

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q1_memory q1_memory(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn



Filename: c:\Users\Juantc93\DEV\latam_de_challenge\q1_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     1    511.4 MiB    511.4 MiB           1   def q1_memory(file_path):
     2    511.4 MiB      0.0 MiB           1       import ujson
     3    511.4 MiB      0.0 MiB           1       import zipfile
     4    511.4 MiB      0.0 MiB           1       from datetime import datetime as dt
     5    511.4 MiB      0.0 MiB           1       from collections import defaultdict
     6                                         
     7    511.4 MiB      0.0 MiB           1       date_count=defaultdict(int)
     8    511.4 MiB      0.0 MiB          27       user_date_count=defaultdict(lambda: defaultdict(int))
     9    511.4 MiB      0.0 MiB           1       with zipfile.ZipFile(file_path, 'r') as f:
    10    511.4 MiB      0.0 MiB           1           with f.open(f.filelist[0]) as g:
    11    511.4 MiB      0.0 MiB      117408               for line in g:
    12  

## Top 10 Emojis

In [99]:
%%write_and_run q2_time.py

def q2_time(file_path):
    
    import pandas as pd
    import emoji
    from collections import Counter
    import re
    
    df = pd.read_json(file_path, lines=True)\

    return Counter(
        (
            (
                i.chars for i in emoji.analyze(("|").join(list(df.content.values)))
                )
            )
            )\
            .most_common(10)



In [100]:
q2_time(FILE_PATH)

[('🙏', 5049),
 ('😂', 3072),
 ('🚜', 2972),
 ('🌾', 2182),
 ('🇮🇳', 2086),
 ('🤣', 1668),
 ('✊', 1651),
 ('❤️', 1382),
 ('🙏🏻', 1317),
 ('💚', 1040)]

In [101]:
%lprun -f q2_time q2_time(FILE_PATH)

Timer unit: 1e-07 s

Total time: 432.403 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\3599924225.py
Function: q2_time at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q2_time(file_path):
     2                                               
     3         1         64.0     64.0      0.0      import pandas as pd
     4         1         38.0     38.0      0.0      import emoji
     5         1        234.0    234.0      0.0      from collections import Counter
     6         1         40.0     40.0      0.0      import re
     7                                               
     8         1  226598523.0    2e+08      5.2      df = pd.read_json(file_path, lines=True)\
     9                                           
    10         3 4096713905.0    1e+09     94.7      return Counter(
    11                                                   (
    12         2         82.0     41.0      0.0    

In [98]:
from q2_time import q2_time

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q2_time q2_time(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn
*** KeyboardInterrupt exception caught in code being profiled.


Filename: c:\Users\Juantc93\DEV\latam_de_challenge\q2_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     2    607.6 MiB    607.6 MiB           1   def q2_time(file_path):
     3                                             
     4    607.6 MiB      0.0 MiB           1       import pandas as pd
     5    607.6 MiB      0.0 MiB           1       import emoji
     6    607.6 MiB      0.0 MiB           1       from collections import Counter
     7    607.6 MiB      0.0 MiB           1       import re
     8                                             
     9   1982.9 MiB   1375.3 MiB           1       df = pd.read_json(file_path, lines=True)\
    10                                         
    11   1982.9 MiB      0.0 MiB           1       return Counter(
    12                                                 (
    13   2048.3 MiB  -4039.8 MiB       41015               (
    14   2048.3 MiB     65.4 MiB           1                   i.chars for i in emoji.analyze((

In [102]:
%%write_and_run q2_memory.py

def q2_memory(file_path):

    import ujson
    import emoji
    from collections import Counter
    import re
    import zipfile


    with zipfile.ZipFile(file_path, 'r') as f:
        with f.open(f.filelist[0]) as g:
            content_list=[ujson.loads(line).get("content") for line in g]

    return Counter(
        (
            (
                i.chars for i in emoji.analyze(("|").join(content_list))
                )
            )
            )\
            .most_common(10)


In [103]:
q2_memory(FILE_PATH)

[('🙏', 5049),
 ('😂', 3072),
 ('🚜', 2972),
 ('🌾', 2182),
 ('🇮🇳', 2086),
 ('🤣', 1668),
 ('✊', 1651),
 ('❤️', 1382),
 ('🙏🏻', 1317),
 ('💚', 1040)]

In [104]:
%lprun -f q2_memory q2_memory(FILE_PATH)

Timer unit: 1e-07 s

Total time: 448.338 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\3131767586.py
Function: q2_memory at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q2_memory(file_path):
     2                                           
     3         1         48.0     48.0      0.0      import ujson
     4         1         34.0     34.0      0.0      import emoji
     5         1        208.0    208.0      0.0      from collections import Counter
     6         1         32.0     32.0      0.0      import re
     7         1         33.0     33.0      0.0      import zipfile
     8                                           
     9                                           
    10         1       7874.0   7874.0      0.0      with zipfile.ZipFile(file_path, 'r') as f:
    11         1       3704.0   3704.0      0.0          with f.open(f.filelist[0]) as g:
    12         1  325305421.0  

In [105]:
from q2_memory import q2_memory

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q2_memory q2_memory(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn



Filename: c:\Users\Juantc93\DEV\latam_de_challenge\q2_memory.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     2    226.3 MiB    226.3 MiB           1   def q2_memory(file_path):
     3                                         
     4    226.3 MiB      0.0 MiB           1       import ujson
     5    226.3 MiB      0.0 MiB           1       import emoji
     6    226.3 MiB      0.0 MiB           1       from collections import Counter
     7    226.3 MiB      0.0 MiB           1       import re
     8    226.3 MiB      0.0 MiB           1       import zipfile
     9                                         
    10                                         
    11    226.3 MiB      0.0 MiB           1       with zipfile.ZipFile(file_path, 'r') as f:
    12    226.3 MiB      0.0 MiB           1           with f.open(f.filelist[0]) as g:
    13    238.1 MiB     11.8 MiB      117410               content_list=[ujson.loads(line).get("content") for line in g]
    14         

## Top 10 Influencers

In [106]:
%%write_and_run q3_time.py
def q3_time(file_path):

    import pandas as pd
    from collections import Counter
    import re

    df = pd.read_json(file_path, lines=True)
    mention_pattern=re.compile('@(\w+)')
    return Counter([i.upper() for i in re.findall(mention_pattern,("|").join(list(df.content.values)))]).most_common(10)

In [107]:
q3_time(FILE_PATH)

[('NARENDRAMODI', 2266),
 ('KISANEKTAMORCHA', 1840),
 ('RAKESHTIKAITBKU', 1642),
 ('PMOINDIA', 1427),
 ('RAHULGANDHI', 1146),
 ('GRETATHUNBERG', 1048),
 ('RAVISINGHKA', 1019),
 ('RIHANNA', 986),
 ('UNHUMANRIGHTS', 962),
 ('MEENAHARRIS', 926)]

In [108]:
%lprun -f q3_time q3_time(FILE_PATH)

Timer unit: 1e-07 s

Total time: 23.6493 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\3287970270.py
Function: q3_time at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q3_time(file_path):
     2                                           
     3         1         38.0     38.0      0.0      import pandas as pd
     4         1        143.0    143.0      0.0      from collections import Counter
     5         1         21.0     21.0      0.0      import re
     6                                           
     7         1  231719860.0    2e+08     98.0      df = pd.read_json(file_path, lines=True)
     8         1        292.0    292.0      0.0      mention_pattern=re.compile('@(\w+)')
     9         1    4772595.0    5e+06      2.0      return Counter([i.upper() for i in re.findall(mention_pattern,("|").join(list(df.content.values)))]).most_common(10)

In [109]:
from q3_time import q3_time

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q3_time q3_time(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn



Filename: c:\Users\Juantc93\DEV\latam_de_challenge\q3_time.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     1                                         def q3_time(file_path):
     2                                         
     3    715.0 MiB    715.0 MiB           1       import pandas as pd
     4    715.0 MiB      0.0 MiB           1       from collections import Counter
     5                                             import re
     6                                         
     7                                             df = pd.read_json(file_path, lines=True)
     8                                             mention_pattern=re.compile('@(\w+)')
     9                                             return Counter([i.upper() for i in re.findall(mention_pattern,("|").join(list(df.content.values)))]).most_common(10)

In [110]:
%%write_and_run q3_memory.py

def q3_memory(file_path):

    import ujson
    from collections import Counter
    import re
    import zipfile


    with zipfile.ZipFile(file_path, 'r') as f:
        with f.open(f.filelist[0]) as g:
            content_list=[ujson.loads(line).get("content") for line in g]
        


    mention_pattern=re.compile('@(\w+)')
    return Counter([i.upper() for i in re.findall(mention_pattern,(' ').join(content_list))]).most_common(10)

In [111]:
q3_memory(FILE_PATH)

[('NARENDRAMODI', 2266),
 ('KISANEKTAMORCHA', 1840),
 ('RAKESHTIKAITBKU', 1642),
 ('PMOINDIA', 1427),
 ('RAHULGANDHI', 1146),
 ('GRETATHUNBERG', 1048),
 ('RAVISINGHKA', 1019),
 ('RIHANNA', 986),
 ('UNHUMANRIGHTS', 962),
 ('MEENAHARRIS', 926)]

In [112]:
%lprun -f q3_memory q3_memory(FILE_PATH)

Timer unit: 1e-07 s

Total time: 34.6026 s
File: C:\Users\Juantc93\AppData\Local\Temp\ipykernel_10444\860603813.py
Function: q3_memory at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def q3_memory(file_path):
     2                                           
     3         1         54.0     54.0      0.0      import ujson
     4         1        186.0    186.0      0.0      from collections import Counter
     5         1         29.0     29.0      0.0      import re
     6         1         30.0     30.0      0.0      import zipfile
     7                                           
     8                                           
     9         1       8317.0   8317.0      0.0      with zipfile.ZipFile(file_path, 'r') as f:
    10         1       3040.0   3040.0      0.0          with f.open(f.filelist[0]) as g:
    11         1  342232836.0    3e+08     98.9              content_list=[ujson.loads(line).get("

In [89]:
from q3_memory import q3_memory

print(r"\n\n --- MEMORY_PROFILING ---\nn")
%mprun -f q3_memory q3_memory(FILE_PATH)

\n\n --- MEMORY_PROFILING ---\nn



Filename: c:\Users\Juantc93\DEV\latam_test\q3_mem_opt.py

Line #    Mem usage    Increment  Occurrences   Line Contents
     2    232.1 MiB    232.1 MiB           1   def q3_mem_opt(file_path):
     3                                         
     4    232.1 MiB      0.0 MiB           1       import ujson
     5    232.1 MiB      0.0 MiB           1       from collections import Counter
     6    232.1 MiB      0.0 MiB           1       import re
     7    232.1 MiB      0.0 MiB           1       import zipfile
     8                                         
     9                                         
    10    232.1 MiB      0.0 MiB           1       with zipfile.ZipFile(file_path, 'r') as f:
    11    232.1 MiB      0.0 MiB           1           with f.open(f.filelist[0]) as g:
    12    232.1 MiB      0.0 MiB      117410               content_list=[ujson.loads(line).get("content") for line in g]
    13                                                 
    14                       