In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import community

## Create network

In [2]:
# set language (it or en)

LANG = 'it' 

In [3]:
NET_5G_IT_PATH = 'data/network/net_5g_{}'.format(LANG)
NET_5GANDCOVID_IT_PATH = 'data/network/net_5gANDcovid_{}'.format(LANG)
NET_COVID_IT_PATH = 'data/network/net_covid_{}'.format(LANG)

In [4]:
# Define networks container, indexed by periods
networks = {
    '5g': nx.read_gexf(NET_5G_IT_PATH),
    'covid': nx.read_gexf(NET_COVID_IT_PATH),
    '5gANDcovid': nx.read_gexf(NET_5GANDCOVID_IT_PATH)
}

seed_list = {
    '5g': ['5g'],
    'covid': ['covid','covid19','coronavirus'],
    '5gANDcovid': ['5g','covid','covid19','coronavirus']
}


# Remove seed hashtags and save the giant component
cc = {}
for net in networks.keys():
    networks[net].remove_nodes_from(seed_list[net])
    largest_cc = max(nx.connected_components(networks[net]), key=len)
    cc[net] = networks[net].subgraph(largest_cc).copy()
    ratio = int(len(largest_cc)/len(networks[net].nodes) * 100)
    print(f'Removing the seed hashtags, the {net} network has a largest component of size {len(largest_cc)}, {ratio}% of the total.')
    print()

Removing the seed hashtags, the 5g network has a largest component of size 1463, 89% of the total.

Removing the seed hashtags, the covid network has a largest component of size 197, 71% of the total.

Removing the seed hashtags, the 5gANDcovid network has a largest component of size 207, 70% of the total.



## Community detection

In [5]:
def getCommunitiesLouvain(network, resolution=1.0, threshold = 100):
    """
    Filter communities with less than threshold hashtags
    return: 
        comm: map community_id -> list of hashtags
        partitions: map hashtag -> community_id
    """ 
    # compute best partitions (fixed random state for reproducibility)
    partition = community.best_partition(graph=network, weight='weight', resolution=resolution, random_state=100)
    size = float(len(set(partition.values())))
    print('There are {} communities'.format(size))
    
    # partition is a dictionary in the form {'hashtag':community_id, ...}
    # we want ot transform it in the form {'community_id':[hashtag1, hashtag2, ...]}
    communities = {}
    for p in partition:
        if partition[p] in communities:
            communities[partition[p]].append(p)
        else:
            communities[partition[p]] = [p]

    # delete small communities (size(community)<threshold)
    communities = {k:v for k, v in communities.items() if len(v)>threshold}
    print('-> {} communities remaining after filtering'.format(len(communities)))
    return communities, partition

In [6]:
communities = {
    '5g': None,
    'covid': None,
    '5gANDcovid': None
}

# threshold = approx 5% of the size
parameters = {
    '5g': {'resolution':1, 'threshold':70},
    'covid': {'resolution':1, 'threshold':10},
    '5gANDcovid':  {'resolution':1, 'threshold':10},
} 

for net in cc.keys():
    print("--- Network: {} ---".format(net))
    comm, partitions = getCommunitiesLouvain(
        networks[net],
        resolution=parameters[net]['resolution'],
        threshold=parameters[net]['threshold']
    )
    communities[net] = {
        'communities': comm,
        'partitions': partitions
    }
    print()

--- Network: 5g ---
There are 106.0 communities
-> 8 communities remaining after filtering

--- Network: covid ---
There are 67.0 communities
-> 7 communities remaining after filtering

--- Network: 5gANDcovid ---
There are 50.0 communities
-> 10 communities remaining after filtering



In [7]:
def sort_dict(d, descending=True):
    """
    Sort a dictionary based on items
    """
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}

def printCommunitiElemnts(
    network, communities, num_communities=5,
    community_id=None, metric_name='centrality',
    showTopK=5, print_metric=True
):
    """
    Print most important nodes based on certain metric
    """
    if metric_name=='centrality':
        centrality = nx.degree_centrality(network)
        metric = centrality
        
    if metric_name=='degree':
        degree = nx.degree(network, weight='weight')
        metric = degree
        
    # find most relevant (based on metric) terms for each community
    if community_id == None:
        counter = 1
        for k,v in communities.items():
            print('---Community: {}, Size: {}---'.format(k, len(v)))
            tag_dict = {tag: metric[tag] for tag in v}
            tag_dict = sort_dict(tag_dict)
            tag_list = list(tag_dict.keys())
            for tag in tag_list[:showTopK]:
                if print_metric:
                    print(tag, ',{}: {:.4f}'.format(metric_name, metric[tag]))
                else:
                    print(tag)
            print()
            ## Show only num_communities
            if counter == num_communities:
                break
            counter += 1
    else:
        k = community_id
        if k not in communities:
            print("Community {} has been discarded (too small)".format(k))
            return
        v = communities[community_id]
        print('---Community: {}---'.format(k))
        tag_dict = {tag: metric[tag] for tag in v}
        tag_dict = sort_dict(tag_dict)
        tag_list = list(tag_dict.keys())
        for tag in tag_list[:showTopK]:
            if print_metric:
                print(tag, ',{}: {:.4f}'.format(metric_name, metric[tag]))
            else:
                print(tag)
        print()

In [8]:
net = '5g'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=False, showTopK=20, community_id=None
)

---Community: 0, Size: 204---
cybersecurity
innovazione
tecnologia
internet
intelligenzaartificiale
bigdata
tlc
digitaltransformation
digitale
blockchain
cloud
genova
mise
tecnologie
infrastrutture
fibraottica
robotica
wifi
machinelearning
virtualreality

---Community: 1, Size: 133---
italia
huawei
cina
usa
ue
china
copasir
trump
inquinamento
zte
nato
dimaio
dis
vecchione
cinese
pd
appgallery
germania
francia
lega

---Community: 2, Size: 270---
smartphone
xiaomi
mobile
qualcomm
mi10
android
mi10pro
video
apple
snapdragon
technews
iphone
smartphones
connettività
snapdragon865
techie
samsung
lg
phone
innovation

---Community: 3, Size: 76---
salute
4g
network
telco
healthcare
health
icnirp
networking
microonde
connessioni
com
cellular
bigdate
gsma
luissbusiness
velocità
3g
capacità
wifi6
homeoffice

---Community: 6, Size: 113---
covid19
fakenews
vaccini
coronavirusupdate
europa
droni
scienza
mes
tso
report
cia
covid2019italia
ar
brevetti
salvatorerainò
trattamentosanitarioobbligatorio
far

In [9]:
net = '5gANDcovid'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=False, showTopK=20, community_id=None
)

---Community: 1, Size: 21---
droni
robotica
china
telematica
ospedali
fabbriche
automazione
trasporti
videosorveglianza
security
tim
pd
coronarovirus
censorship
wuhancoronavirus
blackswan
ia
chine
bigdata
umanoidi

---Community: 3, Size: 27---
tecnologia
snapdragon
meizu
android
smartphone
flyme8
meizu17
smartphones
techno
technews
fotografia
selfie
game
rogphone
gaming
googlestadia
smarphone
asus
android10
quarantena

---Community: 4, Size: 14---
italia
germania
lezzi
crisieconomica
ue
eurogroupe
italexit
francia
adessodibba
huawei
barcellona
pillolesuldigitale
19febbraio
italy

---Community: 7, Size: 25---
tlc
covid2019
telco
checkout
amazon
computervision
deeplearning
sensorfusion
palermo
coronavirusitalia
coronavirusitaly
digiteconomy24
luissbusiness
networking
capacità
network
ultrabroadband
twitter
facebook
zonarossa

---Community: 10, Size: 17---
iot
robot
guidaautonoma
selfdrivingcars
ai
autonomousvehicles
selfdriving
autonomous
robotics
tech
driverless
driverlesscars
startups


In [10]:
net = 'covid'

printCommunitiElemnts(
    networks[net], communities[net]['communities'],
    num_communities=10, metric_name='centrality',
    print_metric=False, showTopK=20, community_id=None
)

---Community: 1, Size: 17---
coronavirusitalia
coronaviruschina
coronaviruscanada
coronavirusoutbreak
coronavirusjapan
coronavirusmexico
coronaviruswuhan
coronavirusthailand
coronavirususa
coronavirusfrance
bassignana
rete
infodemia
quartogrado
ctcf
zonerosse
narcoscanada

---Community: 3, Size: 17---
cina
viruscorona
pechino
aggiornamento
viruscina
contagio
wuhan
viruscinese
viruschinois
viruschina
virus
hatchett
grippe
epidemia
polmonite
xijinping
nellatestadeldragone

---Community: 4, Size: 44---
covid19italia
coronavirusitalla
covidー19
covid_19
portaaporta
coronavirusitaly
covid19italy
tonicapuozzo
nonelarena
la7
giletti
sileri
covid_2019
coronavirues
facciamorete
coronavid19
coronavirusupdate
milanononsiferma
milanoêtuttainfetta
lineanotte

---Community: 6, Size: 47---
conte
lombardia
governo
radiosavana
fontana
codogno
salvini
zaia
bertolaso
orban
governodelcontagio
roma
contedimettiti
lega
democrazia
lamorgese
padova
schettino
basta
speranza

---Community: 10, Size: 27---
italia

## Save parameters

In [11]:
# save map hashtag-community

SAVE_SELECTED = False
out_path = "data/communities/hashtags_community_{}_{}.csv".format("full" if not SAVE_SELECTED else "selected", LANG)

hashtags_map = pd.DataFrame([])

for net in communities.keys():
    hashtags_ = []
    communities_ = []

    if SAVE_SELECTED:
        selected_comm = list(communities[net]['communities'].keys())
        
    for h, c in communities[net]['partitions'].items():
        if SAVE_SELECTED and (c not in selected_comm):
            continue
        hashtags_.append(h)
        communities_.append(c)
        
    hashtags_map = hashtags_map.append(pd.DataFrame({
        'hashtag':hashtags_,
        'community':communities_,
        'net':net
    }))
    
hashtags_map.to_csv(out_path, index=False)