In [13]:
from networkx.algorithms import community as nx_comm
from glob import glob
import numpy as np
import pandas as pd
import networkx as nx
from python_paris import paris
from sknetwork.hierarchy import cut_straight

# Hierarchical Clustering
From the (weighted) co-sharing (CO) and retweet (RT) edge lists in `folder_EU_AM`, we detect communities with hierarchical clustering.
For each country and period, and for both the CO and the RT edge list, we:
- Build the dendrogram with the Paris algorithm
- Compare the partitions obtained from the first cuts of the dendrogram (2 to 5 clusters by default) and keep the partition with the highest modularity
- If more than 90% of the nodes fall in the same community, examine the next batch of cuts (cluster counts shifted by 5), and repeat the procedure until the largest community covers at most 90% of the nodes
- Assign users to the communities
- Save the communities in the same folder, in the format `IT_it_period1_RT_communities.csv`

In [2]:
# Each analysis period maps to the list of month codes it covers
# (the codes look like YYYYMM strings).
periods = {
    "period1": ["201910", "201911", "201912"],
    "period2": ["202007", "202008", "202009"],
    "period3": ["202010", "202011", "202012"],
    "period4": ["202101", "202102", "202103"],
}

In [3]:
# Root data folder (hard-coded absolute path). The cells below read the edge lists
# from <folder_EU_AM>/<period>/.
folder_EU_AM = "/data/public/jlenti/multilang-vax/EuropeAmerica_RTCO"

In [4]:
# Table of the selected (country, lang) pairs; the first CSV column is used as the index.
# The loops at the end of the notebook unpack each row as (country, lang).
selected_pairs = pd.read_csv("/home/jlenti/Files/country_langs_selected_2104.csv", index_col = 0)
selected_pairs.head()

Unnamed: 0,country,lang
0,US,en
1,BR,pt
2,AR,es
3,GB,en
4,ES,es


### Functions

In [5]:
def nodes_community_dendro(dendro, n, G):
    """Label every node of ``G`` with its community at the ``n``-cluster cut.

    Parameters
    ----------
    dendro : dendrogram passed straight to ``cut_straight``.
    n : number of clusters requested from the cut.
    G : graph whose ``G.nodes`` provide the user names.

    Returns
    -------
    pandas.DataFrame with columns ``user`` and ``community``; community ids
    start at 1.

    Notes
    -----
    Assumes the labels returned by ``cut_straight`` follow the same order as
    ``list(G.nodes)`` -- TODO confirm against how the dendrogram was built.
    """
    # Flatten the dendrogram into one cluster label per node.
    labels = cut_straight(dendro, n_clusters = n)
    return pd.DataFrame({
        "user": list(G.nodes),
        # shift by one so that communities are numbered from 1 instead of 0
        "community": labels + 1,
    })


In [6]:
def dendrogram_modularity(dendro, G, cuts, plot = True):
    """Compute the modularity of the partition obtained at each requested cut.

    Parameters
    ----------
    dendro : dendrogram passed to ``cut_straight``.
    G : graph the partitions are scored on; ``G.nodes`` give the node names.
    cuts : iterable of cluster counts to compare.
    plot : accepted for compatibility with existing callers; it is not used
        (nothing is plotted or printed here).

    Returns
    -------
    dict mapping each cluster count in ``cuts`` to the modularity returned by
    ``nx_comm.modularity`` for the corresponding partition.
    """
    node_names = list(G.nodes)
    scores = {}
    for n_clusters in cuts:
        labels = cut_straight(dendro, n_clusters = n_clusters)
        # One list of node names per label 0..n_clusters-1; a label that does
        # not occur yields an empty community.
        partition = [
            [node_names[idx] for idx in np.nonzero(labels == cluster_id)[0]]
            for cluster_id in range(n_clusters)
        ]
        scores[n_clusters] = nx_comm.modularity(G, partition)
    return scores


In [7]:
def best_modularity_community_label(dendro, G, cuts):
    """Pick the cut with the highest modularity and label the nodes with it.

    Parameters
    ----------
    dendro : dendrogram of the hierarchical clustering of ``G``.
    G : graph the dendrogram refers to.
    cuts : iterable of cluster counts to compare.

    Returns
    -------
    tuple ``(community_df, modularity)``: the user/community table produced by
    ``nodes_community_dendro`` for the winning cut, and that cut's modularity.
    """
    scores = dendrogram_modularity(dendro, G, cuts, plot = False)
    # max() keeps the first maximal entry, so ties go to the earliest cut.
    best_cut, best_score = max(scores.items(), key = lambda item: item[1])
    return (nodes_community_dendro(dendro, best_cut, G), best_score)



In [8]:
def save_communities_modularity_paris(G, folder, country, lang, period, layer, cuts = np.arange(2,6)):
    """Detect the communities of ``G`` with Paris and return the node labelling.

    The dendrogram is built with ``paris`` on an undirected copy of ``G``;
    ``best_modularity_community_label`` then keeps, among ``cuts``, the cut
    with the highest modularity. If the largest community of that partition
    holds more than 90% of the nodes, the search is repeated with every cut
    shifted by 5 (default 2..5, then 7..10, ...).

    Parameters
    ----------
    G : graph to cluster.
    folder, country, lang, layer : only used by the (currently commented-out)
        saving code and forwarded unchanged on a retry.
    period : same as above; also printed when the clustering fails.
    cuts : sequence of cluster counts to compare (array or plain list).

    Returns
    -------
    pandas.DataFrame with columns ``user`` and ``community``. It is empty when
    the clustering raised an exception.
    """
    # Accept any sequence: the retry below shifts the cuts arithmetically.
    cuts = np.asarray(cuts)

    try:
        # paris needs an undirected graph
        dendro = paris(nx.Graph(G))
        communities, modularity = best_modularity_community_label(dendro, G, cuts)
        algo = "paris"
    except Exception as err:
        # Best effort: when the clustering fails, fall back to null data instead
        # of aborting, but report the cause so real errors are not hidden.
        algo, modularity = "null", -1
        communities = pd.DataFrame({"user": [], "community": []})
        print(period, "null", repr(err))

    if len(communities) > 0:
        largest_share = (communities["community"].value_counts() / len(communities)).max()
        if largest_share > 0.9:
            # One community dominates: look at the next batch of cuts and hand
            # its result back to the caller.
            print("next cuts")
            return save_communities_modularity_paris(G, folder, country, lang, period, layer, cuts + 5)

    # Writing to disk is currently disabled; `modularity` and `algo` are only
    # consumed by the commented-out code below. Intended layout, e.g.
    # /data/public/jlenti/multilang-vax/multilayer_RT_CO/IT/period1/IT_it_period1_community_RT.csv.gz
    # where folder = /data/public/jlenti/multilang-vax/multilayer_RT_CO/
    #with open("/".join([folder, period, "_".join([country, lang, period, layer, "modularity.txt"])]), "w") as f:
    #    f.write(str(modularity))
    #with open("/".join([folder, period, "_".join([country, lang, period, layer, "algo.txt"])]), "w") as f:
    #    f.write(str(algo))
    #communities.to_csv("/".join([folder, period,  "_".join([country, lang, period, layer, "communities.csv.gz"])]),
    #                   compression = "gzip", index = False)
    return communities


### Example

In [9]:
# Example target: layer "RT", country "IT" (language "it"), first period.
country, lang = "IT", "it"
period, layer = "period1", "RT"
# Read the alphabetically first file in <folder_EU_AM>/<period>/ whose name
# matches <country>*<layer>*edg*.
edges = pd.read_csv(
    sorted(glob(f"{folder_EU_AM}/{period}/{country}*{layer}*edg*"))[0]
)

In [10]:
# Weighted graph from the edge list: the first two columns of `edges` are the
# endpoints and the "weight" column is kept as edge attribute.
G_full = nx.from_pandas_edgelist(
    edges,
    source = edges.columns[0],
    target = edges.columns[1],
    edge_attr = "weight",
)
# Restrict to the giant (largest connected) component of the full graph.
G = nx.subgraph(G_full, max(nx.connected_components(G_full), key = len))
                

In [11]:
# Run the community detection on the example graph. The folder argument is "" here:
# the function's file-writing code is commented out, so only the returned table is used.
coms = save_communities_modularity_paris(G, "", country, lang, period, layer, cuts = np.arange(2,6))

In [12]:
# Preview the first rows of the user -> community table.
# NOTE(review): the rendered output lists user screen names; consider clearing it before sharing.
coms.head()

Unnamed: 0,user,community
0,000Salvatore,1
1,CriticaScient,1
2,DavideFalchieri,1
3,FmMosca,1
4,GavinoSanna1967,1


### RT Communities - All countries

In [None]:
# For every period and every selected (country, lang) pair:
# - read the RT edge list
# - build the weighted graph and keep its giant component
# - record the node counts, then run the community detection
layer = "RT"

for period in periods:
    for _, (country, lang) in selected_pairs.iterrows():
        # alphabetically first file matching <folder_EU_AM>/<period>/<country>*<layer>*edg*
        edges = pd.read_csv(sorted(glob("/".join([folder_EU_AM, period, "*".join([country, layer, "edg", ""])])))[0])

        G_full = nx.from_pandas_edgelist(edges,
                                         source = "user_screen_name",
                                         target = "RT_user_screen_name",
                                         edge_attr = "weight")
        # the clustering is run on the giant component only
        G = G_full.subgraph(max(nx.connected_components(G_full), key = len))

        # Node counts (full graph, giant component) are written next to the edge lists;
        # folder_EU_AM is the only data folder this notebook defines.
        with open("/".join([folder_EU_AM, period, "_".join([country, lang, period, layer, "n_nodes.txt"])]), "w") as f:
            f.write(" ".join([str(G_full.number_of_nodes()), str(G.number_of_nodes())]))

        # find communities (the function's own file output is currently commented out)
        save_communities_modularity_paris(G, folder_EU_AM, country, lang, period, layer)

### CO Communities - All countries

In [None]:
# For every period and every selected (country, lang) pair:
# - read the CO edge list
# - build the weighted graph and keep its giant component
# - record the node counts, then run the community detection
layer = "CO"

for period in periods:
    for _, (country, lang) in selected_pairs.iterrows():
        # alphabetically first file matching <folder_EU_AM>/<period>/<country>*<layer>*edg*
        edges = pd.read_csv(sorted(glob("/".join([folder_EU_AM, period, "*".join([country, layer, "edg", ""])])))[0])

        G_full = nx.from_pandas_edgelist(edges,
                                         source = "user1",
                                         target = "user2",
                                         edge_attr = "weight")
        # the clustering is run on the giant component only
        G = G_full.subgraph(max(nx.connected_components(G_full), key = len))

        # Node counts (full graph, giant component) are written next to the edge lists;
        # folder_EU_AM is the only data folder this notebook defines.
        with open("/".join([folder_EU_AM, period, "_".join([country, lang, period, layer, "n_nodes.txt"])]), "w") as f:
            f.write(" ".join([str(G_full.number_of_nodes()), str(G.number_of_nodes())]))

        # find communities (the function's own file output is currently commented out)
        save_communities_modularity_paris(G, folder_EU_AM, country, lang, period, layer)