In [1]:
import json_lines
import pandas as pd
import networkx as nx
from datetime import datetime

In [2]:
# set language (it or en)
LANG = 'it' 

# set paths
NET_5G_IT_PATH = 'data/network/net_5g_{}'.format(LANG)
NET_COVID_IT_PATH = 'data/network/net_5gANDcovid_{}'.format(LANG)
NET_5GANDCOVID_IT_PATH = 'data/network/net_covid_{}'.format(LANG)

## Centralities

In [4]:
# Define networks container, indexed by periods
networks = {
    '5g': nx.read_gexf(NET_5G_IT_PATH),
    'covid': nx.read_gexf(NET_COVID_IT_PATH),
    '5gANDcovid': nx.read_gexf(NET_5GANDCOVID_IT_PATH)
}

seed_list = {
    '5g': ['5g'],
    'covid': ['covid','covid19','coronavirus'],
    '5gANDcovid': ['5g','covid','covid19','coronavirus']
}


cc = {}
for net in networks.keys():
    # Remove seed hashtags and save the giant component
    networks[net].remove_nodes_from(seed_list[net])
    largest_cc = max(nx.connected_components(networks[net]), key=len)
    cc[net] = networks[net].subgraph(largest_cc).copy()
    ratio = int(len(largest_cc)/len(networks[net].nodes) * 100)
    print(f'Removing the seed hashtags, the {net} network has a largest component of size {len(largest_cc)}, {ratio}% of the total.')
    print()
    
    # save degree centrality
    deg = dict(cc[net].degree)
    deg = {k: deg[k] for k in sorted(deg, key=deg.get, reverse=True)}
    deg = pd.DataFrame.from_dict(deg, orient='index', columns=['degree']).reset_index()
    deg = deg.rename(columns={'index':'hashtag'})
    deg.to_csv('data/network/degree_{}_{}.csv'.format(net, LANG),index=False)
    
    # Save PageRank centrality
    PR = nx.pagerank_numpy(cc[net])
    PR = {k: PR[k] for k in sorted(PR, key=PR.get, reverse=True)}
    PR = pd.DataFrame.from_dict(PR, orient='index', columns=['PR']).reset_index()
    PR = PR.rename(columns={'index':'hashtag'})
    PR.to_csv('data/network/pagerank_{}_{}.csv'.format(net, LANG),index=False)
    
    # Save edgelist
    nx.write_edgelist(cc[net],'data/network/net_{}_edgelist_{}.csv'.format(net, LANG),delimiter=',',data=['weight'])

Removing the seed hashtags, the 5g network has a largest component of size 1463, 89% of the total.

Removing the seed hashtags, the covid network has a largest component of size 295, 100% of the total.

Removing the seed hashtags, the 5gANDcovid network has a largest component of size 197, 71% of the total.



## Text dataframes - debug

In [20]:
df_cov, df_5g = pd.DataFrame(),pd.DataFrame()


def parse_tweet(retrieved_tweet, datetime_format='%a %b %d %H:%M:%S %z %Y'):
    # Initialize parsed tweet object
    parsed_tweet = dict()
    # Get tweet id
    parsed_tweet['tweet_id'] = str(retrieved_tweet.get('id_str'))
    # Get tweet date
    parsed_tweet['tweet_date'] = datetime.strptime(
        retrieved_tweet.get('created_at'),
        datetime_format
    )
    # Initialize parsed tweet text
    tweet_text = ''
    # Case tweet is a retweet
    if 'retweeted_status' in set(retrieved_tweet.keys()):
        # Get inner tweet
        retrieved_tweet = retrieved_tweet['retweeted_status']
    # Check if current tweet is an extended tweet
    if 'extended_tweet' in set(retrieved_tweet.keys()):
        tweet_text = retrieved_tweet['extended_tweet']['full_text']
    # Case current tweet is not an extended one
    else:
        tweet_text = retrieved_tweet['text']
    # Store tweet text
    parsed_tweet['tweet_text'] = tweet_text
    # Return tweet
    return parsed_tweet

tweets = list()
# Load input file
with open('./data/20200215_only_5g_it.jsonl', 'rb') as in_file:
    # Loop through each line in input .jsonl formatted file
    for retrieved_tweet in json_lines.reader(in_file, broken=True):
        # Format retrieved tweet according to inner DataFrame
        parsed_tweet = parse_tweet(retrieved_tweet)
        # Append parsed tweet to tweets list
        tweets.append(parsed_tweet)
# Append list of retrieved tweets to inner Dataframe
df_5g = df_5g.append(tweets, ignore_index=True)

tweets = list()
# Load input file
with open('./data/20200215_only_covid_it.jsonl', 'rb') as in_file:
    # Loop through each line in input .jsonl formatted file
    for retrieved_tweet in json_lines.reader(in_file, broken=True):
        # Format retrieved tweet according to inner DataFrame
        parsed_tweet = parse_tweet(retrieved_tweet)
        # Append parsed tweet to tweets list
        tweets.append(parsed_tweet)
# Append list of retrieved tweets to inner Dataframe
df_cov = df_cov.append(tweets, ignore_index=True)

In [2]:
df_5g.shape

(4349, 3)

In [21]:
df_cov.shape

(5000, 3)

## Common hashtags

In [5]:
common = pd.DataFrame()
common['hashtag'] = list(set(cc['covid'].nodes) & set(cc['5g'].nodes))

deg_cov = dict(cc['covid'].degree)
PR_cov = nx.pagerank_numpy(cc['covid'])

deg_5g = dict(cc['5g'].degree)
PR_5g = nx.pagerank_numpy(cc['5g'])


l = [deg_cov, deg_5g, PR_cov, PR_5g]
lab = ['deg_cov', 'deg_5g', 'PR_cov', 'PR_5g']
for i, d in enumerate(l):
    d_new = {k: d[k] for k in d if k in common.hashtag.values}
    df = pd.DataFrame.from_dict(d_new, orient='index', columns=[lab[i]])
    common = pd.merge(common, df, left_on = 'hashtag', right_index = True)

In [6]:
common = common.sort_values(by = 'PR_cov', ascending = False)
common = common.sort_values(by = 'deg_cov', ascending = False)
common = common.sort_values(by = 'deg_5g', ascending = False)
common = common.sort_values(by = 'PR_5g', ascending = False)
common.to_csv('data/network/common_hashtags_it.csv', index=False)