In [1]:
'''libraries list with help showing the version of the libraries being used in this nodebook'''
libraries = []

'''Datasets, arrays and json files'''
import json
import pandas as pd
import numpy as np
import pickle
libraries.append('json')
libraries.append('pandas')
libraries.append('numpy')
libraries.append('pickle')

'''Following progress'''
from tqdm.notebook import tqdm
libraries.append('tqdm')
                 
'''Pthon and library version'''
import types
import pkg_resources
import sys
from platform import python_version

In [2]:
'''To display version of Software being used'''
print('Version of python installed: {}' .format(sys.version))
print('Version of python being used: {}' .format(python_version()))
print('\nNon-built in libraries being used:')

for m in pkg_resources.working_set:
    if m.project_name.lower() in libraries:
        print('{}, version {}'.format(m.project_name,m.version))

Version of python installed: 3.8.10 (default, May 19 2021, 11:01:55) 
[Clang 10.0.0 ]
Version of python being used: 3.8.10

Non-built in libraries being used:
tqdm, version 4.62.3
pandas, version 1.3.3
numpy, version 1.19.2


In [15]:
def json_to_df():
    '''It returns a dataset with the information retreived from the files contained in
    the "cities" dictionary. It will print the shape of the dataset extrated from each
    city.'''
    
    cities = {'Chicago':"../data/Chicago.json" , 'Dallas':"../data/Dallas.json", 
              'Houston':"../data/Houston.json", 'LosAngeles':"../data/LosAngeles.json",
              'NewYork':"../data/NewYork.json", 'Philadelphia': "../data/Philadelphia.json",
              'Phoenix':"../data/Phoenix.json",'SanAntonio': "../data/SanAntonio.json",
              'SanDiego': "../data/SanDiego.json", 'Toronto':"../data/Toronto.json"}
    
    total_rows = 0
    
    for city, file in cities.items():
        with open(file) as f:
            json_data = json.loads(f.read())
        df_tmp = pd.DataFrame(json_data)
        df_tmp['City_file'] = city
        print('The {} dataset contains {} rows and {} columns.' .format(city, 
                                                                        df_tmp.shape[0],
                                                                        df_tmp.shape[1]))
        total_rows += df_tmp.shape[0]
        if city == 'Chicago':
            '''If it is the first city in the dictionary, create the df'''
            df = df_tmp
    
        else:
            '''If it is any other city, merge the df'''
            df = pd.concat([df, df_tmp])
    df.sample(20)
    '''To check the final shape is correct'''
    assert df.shape[0] == total_rows #To check that the shape of the df co
    
    '''Make each hashtag in the list one row and drop duplicates'''
    df = df.explode('hashtags') #to make each hashtag on the list one row
    df = df.drop_duplicates().reset_index(drop=True) #There are 'id' duplicates 
    df = df.dropna(subset=['hashtags']).reset_index(drop=True) #To drop the tweets with no Hashtag
    return df

In [16]:
'''Retrieve the data'''
df =json_to_df()
df.head()

The Chicago dataset contains 3171 rows and 8 columns.
The Dallas dataset contains 1624 rows and 8 columns.
The Houston dataset contains 2220 rows and 8 columns.
The LosAngeles dataset contains 13893 rows and 8 columns.
The NewYork dataset contains 20979 rows and 8 columns.
The Philadelphia dataset contains 1413 rows and 8 columns.
The Phoenix dataset contains 1123 rows and 8 columns.
The SanAntonio dataset contains 697 rows and 8 columns.
The SanDiego dataset contains 1411 rows and 8 columns.
The Toronto dataset contains 5505 rows and 8 columns.


Unnamed: 0,id,sentiment,date,hashtags,city,state,place_type,City_file
0,1240733265461272578,0.0,Mar 19 2020,corona,Chicago,IL,city,Chicago
1,1240733265461272578,0.0,Mar 19 2020,cov19,Chicago,IL,city,Chicago
2,1240733265461272578,0.0,Mar 19 2020,notMyVirus,Chicago,IL,city,Chicago
3,1240733265461272578,0.0,Mar 19 2020,quarantinefitness,Chicago,IL,city,Chicago
4,1240781405623390209,0.0,Mar 19 2020,TBT,Chicago,IL,city,Chicago


In [17]:
df.shape 

(178084, 8)

In [18]:
df.sample(20)

Unnamed: 0,id,sentiment,date,hashtags,city,state,place_type,City_file
128218,1301367382405976064,0.0,Sep 03 2020,workfromhome,Brooklyn,NY,city,NewYork
153210,1326385789773737984,0.8,Nov 11 2020,hallwaypics,San Diego,CA,city,SanDiego
8549,1319080500242284545,0.0,Oct 22 2020,beTheChange,Chicago,IL,city,Chicago
12653,1292277098074132481,0.25,Aug 09 2020,quarantinecooking,Irving,TX,city,Dallas
51671,1288194982419918848,0.0,Jul 28 2020,unemployment,Topanga,CA,city,LosAngeles
881,1249071140833820674,-0.1,Apr 11 2020,corona,Chicago,IL,city,Chicago
48769,1284020670670741504,0.0,Jul 17 2020,ogPapi,Los Angeles,CA,city,LosAngeles
153860,1245153131366842369,0.333333,Apr 01 2020,watercolor,Toronto,Ontario,city,Toronto
6722,1295804482828341255,-0.125,Aug 18 2020,BeatThePandemic,Chicago,IL,city,Chicago
1541,1255895725650608128,0.6,Apr 30 2020,chicago,Glendale Heights,IL,city,Chicago


In [19]:
print('The dataset contains {} rows and {} columns.' .format(df.shape[0], df.shape[1]))

The dataset contains 178084 rows and 8 columns.


In [20]:
'''Count frequency of each hashtags (total in the df). add a new column: total frequency'''
hashtag_counts = dict(df['hashtags'].value_counts()) #47,244 items
print(len(hashtag_counts))
df['freq'] = np.nan
i = 0
tqdm.pandas()
for h in tqdm(df['hashtags']):
    df['freq'][i] = hashtag_counts[h] 
    i += 1

47244


  0%|          | 0/178084 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['freq'][i] = hashtag_counts[h]


In [21]:
'''To save the dataframe in a file named CovidTwitter.pkl'''
df.to_pickle("../_generated_data/CovidTwitter.pkl")
'''To read the dataframe in a file named CovidTwitter.pkl'''
df = pd.read_pickle("../_generated_data/CovidTwitter.pkl")

In [22]:
'''To work with hashtags that have a frequency larger than 30 mentions only'''
df['freq_boo'] = np.where(df['freq'] > 30, True, False)

In [23]:
'''To save the dataframe filtered by requency in a file named CovidTwitter_filtfreq.pkl'''
df = df[df['freq_boo']].reset_index(drop=True)
df.to_pickle("../_generated_data/CovidTwitter_filtfreq.pkl")

df = pd.read_pickle("../_generated_data/CovidTwitter_filtfreq.pkl")
df.head()

Unnamed: 0,id,sentiment,date,hashtags,city,state,place_type,City_file,freq,freq_boo
0,1240733265461272578,0.0,Mar 19 2020,corona,Chicago,IL,city,Chicago,2129.0,True
1,1240781405623390209,0.0,Mar 19 2020,TBT,Chicago,IL,city,Chicago,34.0,True
2,1240781405623390209,0.0,Mar 19 2020,newnormal,Chicago,IL,city,Chicago,80.0,True
3,1240781405623390209,0.0,Mar 19 2020,corona,Chicago,IL,city,Chicago,2129.0,True
4,1240786307539857409,0.033333,Mar 19 2020,corona,Chicago,IL,city,Chicago,2129.0,True


In [24]:
def count_co_ocurrences(h1, h2):
    '''It returns the count of co-occurreces for two given hashtags'''
    count = 0
    df_both = df[(df['hashtags']==h1)|(df['hashtags']==h2)]
    for n in list(df_both['id'].value_counts()):
        if n > 1:
            count += 1
    return count

In [25]:
'''To weight the edges between two hashtags accordingly to their co-occurences'''
hashtags_list = list(df['hashtags'].unique())
weighted_edges = []
i = 0
j = 1
tqdm.pandas()
while i < len(hashtags_list):
    tqdm.pandas()
    while j < len(hashtags_list):
        if i != j:
            edge1 = hashtags_list[i]
            edge2 = hashtags_list[j]
            counts = count_co_ocurrences(edge1, edge2)
            if counts > 0:
                tmp = (edge1, edge2, counts)
                weighted_edges.append(tmp)
        j += 1
    i += 1
    j = i + 1
'''To change the date column into a datetime datatype'''
df['date'] = pd.to_datetime(df['date']) 

df.head()

Unnamed: 0,id,sentiment,date,hashtags,city,state,place_type,City_file,freq,freq_boo
0,1240733265461272578,0.0,2020-03-19,corona,Chicago,IL,city,Chicago,2129.0,True
1,1240781405623390209,0.0,2020-03-19,TBT,Chicago,IL,city,Chicago,34.0,True
2,1240781405623390209,0.0,2020-03-19,newnormal,Chicago,IL,city,Chicago,80.0,True
3,1240781405623390209,0.0,2020-03-19,corona,Chicago,IL,city,Chicago,2129.0,True
4,1240786307539857409,0.033333,2020-03-19,corona,Chicago,IL,city,Chicago,2129.0,True


In [26]:
def mask_grouped(df, hashtag):
    '''It returns a dataframe filtered by a given hashtag. The dataframe will contain
    the total count of mentions grouped by date'''
    mask_hash = df['hashtags'] == hashtag
    df_masked = df[mask_hash]
    f = df_masked.groupby([pd.Grouper(key='date', freq='D')]).count()
    return f

def correlation(df, hash1, hash2):
    '''It returns the correlation between two hastags hash1 and hash2 using the information
    provided in df. The df must contain dates in a datetype format'''
    
    df = df.loc[:,['date','hashtags']]
    df = df.sort_values(by=['date'])
    
    f1 = mask_grouped(df, hash1)
    f2 = mask_grouped(df, hash2)
    
    correl = list(f1.corrwith(f2, axis=0, method='pearson'))[0]
    return correl

In [27]:
''' To save correlation in a format that is easy to set as attribute afterwards, such as:
attr = {(edge1, edge2): {"correlation": n, "weight": x}, (edge2, edge3): {"attr2": 3}}'''

attr = {}
tqdm.pandas()
for edge1, edge2, weight in tqdm(weighted_edges):
    tup = (edge1, edge2)
    c = correlation(df, edge1, edge2)
    attr[tup] = {'correlation': c}
    
'''Store the weighted edges count'''
file_name = "../_generated_data/edges_counts.pkl"

open_file = open(file_name, "wb")
pickle.dump(weighted_edges, open_file)
open_file.close()

'''Store the edges correlation attribute'''
file_name = "../_generated_data/edges_corr.pkl"

open_file = open(file_name, "wb")
pickle.dump(attr, open_file)
open_file.close()

  0%|          | 0/19180 [00:00<?, ?it/s]