In [2]:
import twint
import numpy as np
import pickle
import pandas as pd
import time
import math
from collections import Counter
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Show the pandas version this notebook was executed with, so a re-run can be
# checked against the same library version.
print(pd.__version__)

1.2.3


In [71]:
def changeTimezone(df, tz="Europe/Madrid"):
    """Add localized timestamp and weekday columns to a tweets DataFrame.

    The "date" and "timezone" string columns are concatenated row by row,
    parsed as timezone-aware timestamps (normalised to UTC) and then converted
    to the ``tz`` timezone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "date" and "timezone" columns whose concatenation matches
        the format "%Y-%m-%d %H:%M:%S%z" (e.g. "2021-03-06 23:30:00" + "+0100").
    tz : str, default "Europe/Madrid"
        Timezone the timestamps are converted to before the weekday is derived.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in. It is modified in place and
        gains two columns: "times" (timestamps in ``tz``) and "weekday"
        (pandas ``dt.dayofweek``: Monday=0 ... Sunday=6).
    """
    # Build the timestamps in a temporary so "times" is written exactly once
    # and never holds the raw concatenated strings if parsing fails.
    stamps = pd.to_datetime(
        df["date"] + df["timezone"],
        format="%Y-%m-%d %H:%M:%S%z",
        utc=True,
    )
    df["times"] = stamps.dt.tz_convert(tz)

    # The weekday is taken from the local time, not from UTC.
    df["weekday"] = df["times"].dt.dayofweek

    return df

def getStats(df):
    """Summarise a tweets DataFrame by language and by weekend/weekday.

    Returns a dict that always has a "size" entry (number of rows). For a
    non-empty frame it also has:

    * "language": language code -> "<count> <share of rows>", ordered from the
      most to the least frequent language;
    * "days": "weekends" / "weekdays" -> "<count> <share of counted days>".

    The frame is passed through ``changeTimezone``, which adds "times" and
    "weekday" columns to it in place.
    """
    n_rows = len(df)
    stats = {"size": n_rows}

    # Nothing else can be computed for an empty sample.
    if not n_rows:
        return stats

    df = changeTimezone(df)

    # Languages, most frequent first; ties keep first-appearance order.
    by_language = Counter(df["language"])
    ranked = sorted(by_language.items(), key=lambda pair: pair[1], reverse=True)
    stats["language"] = {
        lang: str(count) + " " + str(count / n_rows) for lang, count in ranked
    }

    # Weekday codes 5 and 6 are counted as weekend, 0-4 as weekdays.
    by_weekday = Counter(df["weekday"])
    weekend_total = by_weekday[5] + by_weekday[6]
    weekday_total = sum(by_weekday[day] for day in range(5))
    counted_days = int(weekend_total) + int(weekday_total)

    buckets = (("weekends", weekend_total), ("weekdays", weekday_total))
    stats["days"] = {
        label: str(count) + " " + str(count / counted_days)
        for label, count in buckets
    }

    return stats

## Foreign data

In [4]:
# Read the pickled foreign-tweets mapping (iterated below as location -> tweets).
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The `with` block closes the file handle as soon as the data is loaded.
with open("foreign_freq_tweets.p", "rb") as pickle_file:
    foreign_tweets_by_location = pickle.load(pickle_file)

In [72]:
# Print, for every location, the tweet volume, the number of distinct users
# and the language / weekday summary returned by getStats.
for location, tweets in foreign_tweets_by_location.items():
    print(location)

    # Build the set of distinct usernames once instead of three times.
    user_count = len(set(tweets["username"]))
    if user_count != 0:
        print("frequency:", len(tweets), "/ user count:", user_count, "/ freq per user:", len(tweets) / user_count)

    location_stats = getStats(tweets)
    for stat_name, stat_value in location_stats.items():
        print(stat_name)
        # Nested summaries (dicts) are printed one entry per line,
        # scalar summaries as a single value.
        if isinstance(stat_value, dict):
            for label, value in stat_value.items():
                print(label, value)
        else:
            print(stat_value)
        print()
    print("-----------------------------")

Santa Coloma de Gramanet
frequency: 599 / user count: 137 / freq per user: 4.372262773722627
size
599

language
en 159 0.2654424040066778
ja 134 0.22370617696160267
pt 58 0.09682804674457429
ca 50 0.08347245409015025
es 48 0.08013355592654424
pl 24 0.04006677796327212
ro 23 0.038397328881469114
it 17 0.028380634390651086
fr 17 0.028380634390651086
und 15 0.025041736227045076
in 13 0.021702838063439065
fi 10 0.01669449081803005
tr 9 0.015025041736227046
et 5 0.008347245409015025
sv 5 0.008347245409015025
no 2 0.00333889816360601
sl 2 0.00333889816360601
vi 2 0.00333889816360601
da 2 0.00333889816360601
lv 1 0.001669449081803005
de 1 0.001669449081803005
tl 1 0.001669449081803005
hu 1 0.001669449081803005

days
weekends 137 0.2287145242070117
weekdays 462 0.7712854757929883

-----------------------------
Fluvial del Besos
frequency: 16 / user count: 6 / freq per user: 2.6666666666666665
size
16

language
es 16 1.0

days
weekends 4 0.25
weekdays 12 0.75

-----------------------------
Parq

## Native data

In [9]:
def _load_pickle(path):
    """Unpickle the file at ``path`` and close the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, "rb") as pickle_file:
        return pickle.load(pickle_file)


# File-name suffixes of the native-tweet chunks, in load order
# ("18_5" sits between chunk 18 and chunk 19).
NATIVE_CHUNK_SUFFIXES = (
    [str(i) for i in range(1, 19)] + ["18_5"] + [str(i) for i in range(19, 25)]
)

# The individual n* names are kept because the next cell builds n_list from them.
(n1, n2, n3, n4, n5, n6, n7, n8, n9, n10, n11, n12, n13, n14, n15, n16, n17, n18,
 n18_5, n19, n20, n21, n22, n23, n24) = [
    _load_pickle("native_freq_tweets{}.p".format(suffix))
    for suffix in NATIVE_CHUNK_SUFFIXES
]

In [10]:
# Locations present in every native-tweet chunk, in reporting order.
NATIVE_LOCATIONS = [
    "Santa Coloma de Gramanet",
    "Fluvial del Besos",
    "Parque Molinet",
    "Plaza del Rellotge",
    "Rambla San Sebastian",
    "Parque Can Zam",
    "Instituto Can Peixauet",
    "Parque Gran Sol",
    "Escuela Tanit",
    "Instituto Terra Roja",
    "Instituto Gassol",
    "CAP Santa Rosa",
    "Cinto Verdaguer",
    "Mercado del Fondo",
    "Nus de la Trinitat",
    "Macanet str",
    "Iglesia Evangelica",
]

n_list = [n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15,n16,n17,n18,n18_5,n19,n20,n21,n22,n23,n24]

# Merge each location's chunks with a single pd.concat call. Repeated
# DataFrame.append copies the growing frame on every step and was removed in
# pandas 2.0; concat keeps the original row indexes just like append did.
# Assumes every chunk maps each location to a DataFrame (confirm against the
# pickles if a chunk was produced differently).
native_tweets_by_location = {
    location: pd.concat([chunk[location] for chunk in n_list])
    for location in NATIVE_LOCATIONS
}

In [73]:
# Print, for every location, the tweet volume, the number of distinct users
# and the language / weekday summary returned by getStats.
for location, tweets in native_tweets_by_location.items():
    print(location)

    # Build the set of distinct usernames once instead of three times.
    user_count = len(set(tweets["username"]))
    if user_count != 0:
        print("frequency:", len(tweets), "/ user count:", user_count, "/ freq per user:", len(tweets) / user_count)

    location_stats = getStats(tweets)
    for stat_name, stat_value in location_stats.items():
        print(stat_name)
        # Nested summaries (dicts) are printed one entry per line,
        # scalar summaries as a single value.
        if isinstance(stat_value, dict):
            for label, value in stat_value.items():
                print(label, value)
        else:
            print(stat_value)
        print()
    print("-----------------------------")

Santa Coloma de Gramanet
frequency: 7874 / user count: 1088 / freq per user: 7.237132352941177
size
7874

language
ca 4901 0.6224282448564897
es 2867 0.3641097282194564
it 34 0.004318008636017272
en 22 0.0027940055880111762
fr 12 0.001524003048006096
ro 10 0.00127000254000508
pt 9 0.001143002286004572
und 8 0.001016002032004064
fi 6 0.000762001524003048
in 2 0.000254000508001016
da 1 0.000127000254000508
cy 1 0.000127000254000508
et 1 0.000127000254000508

days
weekends 1824 0.2316484632969266
weekdays 6050 0.7683515367030734

-----------------------------
Fluvial del Besos
frequency: 1608 / user count: 626 / freq per user: 2.5686900958466454
size
1608

language
es 1558 0.9689054726368159
in 34 0.021144278606965175
ca 7 0.004353233830845771
lt 5 0.003109452736318408
pt 2 0.0012437810945273632
ja 1 0.0006218905472636816
en 1 0.0006218905472636816

days
weekends 446 0.277363184079602
weekdays 1162 0.722636815920398

-----------------------------
Parque Molinet
frequency: 741 / user count