In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import community

## Create network

In [2]:
# set language (it or en)

LANG = 'it' 

In [3]:
NET_5G_IT_PATH = 'data/network/net_5g_{}'.format(LANG)
NET_COVID_IT_PATH = 'data/network/net_5gANDcovid_{}'.format(LANG)
NET_5GANDCOVID_IT_PATH = 'data/network/net_covid_{}'.format(LANG)

In [4]:
# Define networks container, indexed by periods
networks = {
    '5g': nx.read_gexf(NET_5G_IT_PATH),
    'covid': nx.read_gexf(NET_COVID_IT_PATH),
    '5gANDcovid': nx.read_gexf(NET_5GANDCOVID_IT_PATH)
}

seed_list = {
    '5g': ['5g'],
    'covid': ['covid','covid19','coronavirus'],
    '5gANDcovid': ['5g','covid','covid19','coronavirus']
}


# Remove seed hashtags and save the giant component
cc = {}
for net in networks.keys():
    networks[net].remove_nodes_from(seed_list[net])
    largest_cc = max(nx.connected_components(networks[net]), key=len)
    cc[net] = networks[net].subgraph(largest_cc).copy()
    ratio = int(len(largest_cc)/len(networks[net].nodes) * 100)
    print(f'Removing the seed hashtags, the {net} network has a largest component of size {len(largest_cc)}, {ratio}% of the total.')
    print()

Removing the seed hashtags, the 5g network has a largest component of size 1463, 89% of the total.

Removing the seed hashtags, the covid network has a largest component of size 295, 100% of the total.

Removing the seed hashtags, the 5gANDcovid network has a largest component of size 197, 71% of the total.



## Community detection

In [5]:
def getCommunitiesLouvain(network, resolution=1.0, threshold = 100):
    """
    Filter communities with less than threshold hashtags
    return: 
        comm: map community_id -> list of hashtags
        partitions: map hashtag -> community_id
    """ 
    # compute best partitions (fixed random state for reproducibility)
    partition = community.best_partition(graph=network, weight='weight', resolution=resolution, random_state=100)
    size = float(len(set(partition.values())))
    print('There are {} communities'.format(size))
    
    # partition is a dictionary in the form {'hashtag':community_id, ...}
    # we want ot transform it in the form {'community_id':[hashtag1, hashtag2, ...]}
    communities = {}
    for p in partition:
        if partition[p] in communities:
            communities[partition[p]].append(p)
        else:
            communities[partition[p]] = [p]

    # delete small communities (size(community)<threshold)
    communities = {k:v for k, v in communities.items() if len(v)>threshold}
    print('-> {} communities remaining after filtering'.format(len(communities)))
    return communities, partition

In [6]:
communities = {
    '5g': None,
    'covid': None,
    '5gANDcovid': None
}

# threshold = approx 5% of the size
parameters = {
    '5g': {'resolution':1, 'threshold':70},
    'covid': {'resolution':1, 'threshold':15},
    '5gANDcovid':  {'resolution':1, 'threshold':10},
} 

for net in cc.keys():
    print("--- Network: {} ---".format(net))
    comm, partitions = getCommunitiesLouvain(
        networks[net],
        resolution=parameters[net]['resolution'],
        threshold=parameters[net]['threshold']
    )
    communities[net] = {
        'communities': comm,
        'partitions': partitions
    }
    print()

--- Network: 5g ---
There are 106.0 communities
-> 8 communities remaining after filtering

--- Network: covid ---
There are 19.0 communities
-> 6 communities remaining after filtering

--- Network: 5gANDcovid ---
There are 67.0 communities
-> 7 communities remaining after filtering



In [7]:
def sort_dict(d, descending=True):
    """
    Sort a dictionary based on items
    """
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}

def printCommunitiElemnts(
    network, communities, num_communities=5,
    community_id=None, metric_name='centrality',
    showTopK=5, print_metric=True
):
    """
    Print most important nodes based on certain metric
    """
    if metric_name=='centrality':
        centrality = nx.degree_centrality(network)
        metric = centrality
        
    if metric_name=='degree':
        degree = nx.degree(network, weight='weight')
        metric = degree
        
    # find most relevant (based on metric) terms for each community
    if community_id == None:
        counter = 1
        for k,v in communities.items():
            print('---Community: {}, Size: {}---'.format(k, len(v)))
            tag_dict = {tag: metric[tag] for tag in v}
            tag_dict = sort_dict(tag_dict)
            tag_list = list(tag_dict.keys())
            for tag in tag_list[:showTopK]:
                if print_metric:
                    print(tag, ',{}: {:.4f}'.format(metric_name, metric[tag]))
                else:
                    print(tag)
            print()
            ## Show only num_communities
            if counter == num_communities:
                break
            counter += 1
    else:
        k = community_id
        if k not in communities:
            print("Community {} has been discarded (too small)".format(k))
            return
        v = communities[community_id]
        print('---Community: {}---'.format(k))
        tag_dict = {tag: metric[tag] for tag in v}
        tag_dict = sort_dict(tag_dict)
        tag_list = list(tag_dict.keys())
        for tag in tag_list[:showTopK]:
            if print_metric:
                print(tag, ',{}: {:.4f}'.format(metric_name, metric[tag]))
            else:
                print(tag)
        print()

In [12]:
net = '5g'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=True, showTopK=5, community_id=None
)

---Community: 0, Size: 204---
cybersecurity ,centrality: 0.0677
innovazione ,centrality: 0.0615
tecnologia ,centrality: 0.0603
internet ,centrality: 0.0511
intelligenzaartificiale ,centrality: 0.0443

---Community: 1, Size: 133---
italia ,centrality: 0.1175
huawei ,centrality: 0.0892
cina ,centrality: 0.0622
usa ,centrality: 0.0511
ue ,centrality: 0.0406

---Community: 2, Size: 270---
smartphone ,centrality: 0.1028
xiaomi ,centrality: 0.0468
mobile ,centrality: 0.0468
qualcomm ,centrality: 0.0351
mi10 ,centrality: 0.0289

---Community: 3, Size: 76---
salute ,centrality: 0.0265
4g ,centrality: 0.0240
network ,centrality: 0.0172
telco ,centrality: 0.0160
healthcare ,centrality: 0.0117

---Community: 6, Size: 113---
covid19 ,centrality: 0.0929
fakenews ,centrality: 0.0228
vaccini ,centrality: 0.0191
coronavirusupdate ,centrality: 0.0185
europa ,centrality: 0.0142

---Community: 9, Size: 243---
coronavirus ,centrality: 0.2148
covid19italia ,centrality: 0.0418
virus ,centrality: 0.0388
covi

In [13]:
net = '5gANDcovid'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=True, showTopK=5, community_id=None
)

---Community: 1, Size: 17---
coronavirusitalia ,centrality: 0.1282
coronaviruschina ,centrality: 0.0366
coronaviruscanada ,centrality: 0.0366
coronavirusoutbreak ,centrality: 0.0330
coronavirusjapan ,centrality: 0.0330

---Community: 3, Size: 17---
cina ,centrality: 0.1099
viruscorona ,centrality: 0.0476
pechino ,centrality: 0.0403
aggiornamento ,centrality: 0.0366
viruscina ,centrality: 0.0366

---Community: 4, Size: 44---
covid19italia ,centrality: 0.0952
coronavirusitalla ,centrality: 0.0586
covidー19 ,centrality: 0.0513
covid_19 ,centrality: 0.0513
portaaporta ,centrality: 0.0366

---Community: 6, Size: 47---
conte ,centrality: 0.0879
lombardia ,centrality: 0.0733
governo ,centrality: 0.0623
radiosavana ,centrality: 0.0586
fontana ,centrality: 0.0549

---Community: 10, Size: 27---
italia ,centrality: 0.0879
mattarella ,centrality: 0.0366
drittoerovescio ,centrality: 0.0366
iraq ,centrality: 0.0293
turchia ,centrality: 0.0293

---Community: 14, Size: 15---
migranti ,centrality: 0.040

In [14]:
net = 'covid'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=True, showTopK=5, community_id=None
)

---Community: 0, Size: 92---
5g ,centrality: 1.0000
cina ,centrality: 0.0986
wuhan ,centrality: 0.0680
smartcity ,centrality: 0.0680
pandemia ,centrality: 0.0374

---Community: 1, Size: 20---
tecnologia ,centrality: 0.0680
snapdragon ,centrality: 0.0646
meizu ,centrality: 0.0408
android ,centrality: 0.0408
smartphone ,centrality: 0.0408

---Community: 2, Size: 21---
italia ,centrality: 0.0680
germania ,centrality: 0.0442
lezzi ,centrality: 0.0306
crisieconomica ,centrality: 0.0306
ue ,centrality: 0.0306

---Community: 7, Size: 24---
iot ,centrality: 0.0884
robot ,centrality: 0.0714
guidaautonoma ,centrality: 0.0612
selfdrivingcars ,centrality: 0.0612
ai ,centrality: 0.0612

---Community: 8, Size: 19---
usa ,centrality: 0.0680
dazi ,centrality: 0.0476
guerra ,centrality: 0.0408
attacco ,centrality: 0.0408
bioterrorismo ,centrality: 0.0408

---Community: 9, Size: 19---
iorestoacasa ,centrality: 0.0442
covid2019 ,centrality: 0.0374
telco ,centrality: 0.0306
vodafone ,centrality: 0.0272
ho

## Save parameters

In [11]:
# save map hashtag-community

SAVE_SELECTED = False
out_path = "data/communities/hashtags_community_{}_{}.csv".format("full" if not SAVE_SELECTED else "selected", LANG)

hashtags_map = pd.DataFrame([])

for net in communities.keys():
    hashtags_ = []
    communities_ = []

    if SAVE_SELECTED:
        selected_comm = list(communities[net]['communities'].keys())
        
    for h, c in communities[net]['partitions'].items():
        if SAVE_SELECTED and (c not in selected_comm):
            continue
        hashtags_.append(h)
        communities_.append(c)
        
    hashtags_map = hashtags_map.append(pd.DataFrame({
        'hashtag':hashtags_,
        'community':communities_,
        'net':net
    }))
    
hashtags_map.to_csv(out_path, index=False)