In [1]:
'''libraries list with help showing the version of the libraries being used in this nodebook'''
libraries = []

'''Datasets, arrays and files '''
import pandas as pd
import numpy as np
import pickle
libraries.append('pandas')
libraries.append('numpy')
libraries.append('pickle')

'''Following progress'''
from tqdm.notebook import tqdm
libraries.append('tqdm')

'''Plots'''
%matplotlib notebook

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpldatacursor import datacursor
from mpldatacursor import HighlightingDataCursor
import seaborn as sns
plt.style.use('seaborn-paper')
plt.rcParams["figure.facecolor"] = "w"

libraries.append('matplotlib')
libraries.append('mpldatacursor')
libraries.append('seaborn')
                 
'''Networks and community detection'''
import networkx as nx
import igraph as ig
import community as community_louvain
from networkx.algorithms import community
from sklearn import cluster
libraries.append('networkx')
libraries.append('igraph')

'''Measures and utilities'''
from Levenshtein import distance
import statistics
from sklearn.metrics import normalized_mutual_info_score 
from sklearn.metrics.cluster import adjusted_rand_score
from collections import defaultdict
import math
from statistics import mean, stdev
from operator import truediv

libraries.append('collections')
libraries.append('scikit-learn')
libraries.append('math')
libraries.append('statistics')

'''Pthon and library version'''
import types
import pkg_resources
import sys
from platform import python_version

In [2]:
'''To display version of Software being used'''
print('Version of python installed: {}' .format(sys.version))
print('Version of python being used: {}' .format(python_version()))
print('\nNon-built in libraries being used:')

for m in pkg_resources.working_set:
    if m.project_name.lower() in libraries:
        print('{}, version {}'.format(m.project_name,m.version))

Version of python installed: 3.8.10 (default, May 19 2021, 11:01:55) 
[Clang 10.0.0 ]
Version of python being used: 3.8.10

Non-built in libraries being used:
tqdm, version 4.62.3
seaborn, version 0.11.2
scikit-learn, version 0.22.1
pandas, version 1.3.3
numpy, version 1.19.2
networkx, version 2.6.3
mpldatacursor, version 0.7.1
matplotlib, version 3.4.3


In [3]:
def total_weight_attribute(G):
    '''It returns the total of the weight attributes in a given network G'''
    edges = list(G.edges(data=True))
    total_weight = 0
    for e in tqdm(range(len(edges))):
        total_weight += edges[e][2]['weight']
    return total_weight

def expectation_sharing_edge(G, node1, node2, L):
    '''Returns a float indicating the proportion of expected weight or co-ocurrences
    between two hashtags in a network'''
    #print(node1, node2)
    p = (G.degree[node1]*G.degree[node2])/(2*L)
    return p

def normalize_weights(G, weighted_edges):
    '''Given a graph G and a list of weighted edges in the way:
    weighted_edges = [('corona', 'TBT', 1), ('corona', 'newnormal', 4), ... ]
    it returns normalized weights n_weighted_edges in the same structure
    '''
    total_p = []
    n_weighted_edges = []
    total_w = []
    L = nx.number_of_edges(G)
    i = 0
    for node1, node2, weight in tqdm(weighted_edges):
        p = expectation_sharing_edge(G, node1, node2, L)
        n_weight =  weight/p  
        if i < 10:
            print('n1:', node1,' n2:', node2, ' p:',p,' w:',weight , 'n_w:',n_weight)
        total_p.append(p)
        total_w.append(n_weight)
        tup = (node1, node2, n_weight)
        n_weighted_edges.append(tup)
        i += 1
    
    w_min = min(total_w)
    w_max = max(total_w)
    print('Sum p: ',sum(total_p), ' min: ', w_min, ' max:', w_max)
    nn_weighted_edges = []
    #Normalize in the range between 0-1
    for n1, n2, w in n_weighted_edges:
        norm_weight = (w - w_min)/(w_max-w_min)
        tup = (n1, n2, norm_weight)
        nn_weighted_edges.append(tup)
    return nn_weighted_edges

def add_similarity_attr(G):
    '''Given a network G, it will calculate the Levenshtein distance between each pair of nodes
    that share an edge in the network. It will add the distances as an attribute called 
    similarity'''
    attr = {}
    for e1, e2 in G.edges():
        d = distance(e1, e2)
        attr[(e1,e2)] = {'similarity':d}
    nx.set_edge_attributes(G, attr)

def add_correlation_attr(G):
    '''Given a network G, it will calculate the Levenshtein distance between each pair of nodes
    that share an edge in the network. It will add the distances as an attribute called 
    similarity'''
    
    '''To add correlation attribute'''
    file_name = "../_generated_data/edges_corr.pkl"
    open_file = open(file_name, "rb")
    corr_dict = pickle.load(open_file)
    open_file.close()
    attr = {}
    for e1, e2 in tqdm(G.edges()):
        try:
            c = corr_dict[(e1,e2)]
        except KeyError:
            c = corr_dict[(e2,e1)]
        attr[(e1,e2)] = {'correlation':c}
    nx.set_edge_attributes(G, attr)

In [4]:
'''To open the information of weighted edges'''
file_name = "../_generated_data/edges_counts.pkl"
open_file = open(file_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

print('''The hashtags {} and {} have the maximum number of co-occurrences: {}.'''
      .format(max(loaded_list,key=lambda item:item[2])[0],
        max(loaded_list,key=lambda item:item[2])[1],
        max(loaded_list,key=lambda item:item[2])[2])) #Maximum weight

The hashtags Coronavirus and SaveTheWorld have the maximum number of co-occurrences: 2308.


In [5]:
G = nx.Graph()  
G.add_weighted_edges_from(loaded_list)
print('Size: ', G.size())
print('Number of nodes: ', G.number_of_nodes())

Size:  19180
Number of nodes:  518


In [6]:
nx.transitivity(G)

0.3728978094137729

In [7]:
list(G.edges(data=True))[:5]

[('corona', 'TBT', {'weight': 1}),
 ('corona', 'newnormal', {'weight': 4}),
 ('corona', 'quarantine', {'weight': 379}),
 ('corona', 'coronavirus', {'weight': 600}),
 ('corona', 'covid_19', {'weight': 316})]

# Normalize weights

In [8]:
edges_list = list(G.edges(data=True))
node_1 = edges_list[1][0]
node_2 = edges_list[1][1]
edge_weight = edges_list[1][2]['weight']
edge_weight

4

In [9]:
loaded_list[:5]

[('corona', 'TBT', 1),
 ('corona', 'newnormal', 4),
 ('corona', 'quarantine', 379),
 ('corona', 'coronavirus', 600),
 ('corona', 'covid_19', 316)]

In [11]:
normalized_weights= normalize_weights(G, loaded_list)
G = nx.empty_graph()
G.add_weighted_edges_from(normalized_weights)
add_correlation_attr(G)
print('Size: ', G.size())
print('Number of nodes: ', G.number_of_nodes())

  0%|          | 0/19180 [00:00<?, ?it/s]

n1: corona  n2: TBT  p: 0.25351929092805003  w: 1 n_w: 3.9444730077120824
n1: corona  n2: newnormal  p: 1.0140771637122001  w: 4 n_w: 3.9444730077120824
n1: corona  n2: quarantine  p: 4.076590198123045  w: 379 n_w: 92.96985509470642
n1: corona  n2: coronavirus  p: 4.309827945776851  w: 600 n_w: 139.2166943898382
n1: corona  n2: covid_19  p: 3.711522419186653  w: 316 n_w: 85.14026437411324
n1: corona  n2: stayhome  p: 3.4275808133472365  w: 250 n_w: 72.93774052722046
n1: corona  n2: washyourhands  p: 1.2270333680917622  w: 13 n_w: 10.59465890501179
n1: corona  n2: lockdown  p: 2.7075860271115744  w: 69 n_w: 25.483954825106153
n1: corona  n2: black  p: 0.6591501564129302  w: 6 n_w: 9.102630017797113
n1: corona  n2: photography  p: 2.0687174139728883  w: 26 n_w: 12.568173799082615
Sum p:  8109.4242700730365  min:  0.4004426164477942  max: 469910.0


  0%|          | 0/19180 [00:00<?, ?it/s]

Size:  19180
Number of nodes:  518


# Check connected components

In [12]:
len(list(nx.connected_components(G)))

2

In [13]:
'''To keep the largest connected component'''
#G = G.subgraph(max(nx.connected_components(G), key=len))
#print('Size: ', G.size())
#print('Number of nodes: ', G.number_of_nodes())

'To keep the largest connected component'

# Set a threshold for weights

In [14]:
'''Delete the edges with weight < 0''' 

edges_list = list(G.edges(data=True))
i = 0
for element in edges_list:
    node_1 = edges_list[i][0]
    node_2 = edges_list[i][1]
    edge_weight = edges_list[i][2]['weight']
    
    if edge_weight <= 0:
        if G.has_edge(node_1, node_2):
            G.remove_edge(node_1, node_2)
        else:
            G.remove_edge(node_2, node_1)
    i += 1
print('Size: ', G.size())
print('Number of nodes: ', G.number_of_nodes())

Size:  19179
Number of nodes:  518


In [15]:
len(list(nx.connected_components(G)))

2

In [16]:
final_weights = []
only_weights = []
edges = list(G.edges())
count_under = 0

for e1, e2 in tqdm(edges):
    w = G[e1][e2]['weight']
    #print(w)
    if w <= 0:
        count_under +=1 
    if w > 0:
        tup = (e1, e2, w)
        final_weights.append(tup)
        only_weights.append(w)

print('Number of edges with weight under 0 is {}'.format(count_under))

  0%|          | 0/19179 [00:00<?, ?it/s]

Number of edges with weight under 0 is 0


In [None]:
print(min(only_weights), max(only_weights))
#0 weight means no edge

In [23]:
'''How to set the threshold? Lets take a look at the distribution of the weights'''

sns.distplot(only_weights, color="dodgerblue", label="Weights", bins=100)

plt.title('Weights distribution')
plt.xlabel('Weight')
plt.savefig('Weights_distribution', dpi=1000)

plt.show()



<IPython.core.display.Javascript object>

In [24]:
print('max: ', max(only_weights)) 
print('min: ', min(only_weights))
print('median: ',statistics.median(only_weights)) 

max:  1.0
min:  4.793922427079444e-07
median:  1.396096634166118e-05


In [25]:
print('Total number of weights (links):', len(only_weights))

Total number of weights (links): 19179


In [26]:
sorted([w for w in only_weights if w > 0.5])[int(len([w for w in only_weights if w > 0.5])*80/100)]

0.8908423978217936

In [27]:
def cut_weights_threshold(final_weights, threshold):
    '''It returns the edges that holds in the network after using a given threshold'''
    final_edges = []
    for e1, e2, w in final_weights:
        if w > threshold: #weights that are above a defined threshold
            tmp = (e1, e2, w)
            final_edges.append(tmp)
    return final_edges

def network_with_threshold(g, threshold = 0, prints = True):
    '''It returns a network g using a given threshold.
    If no threshold is provided, the default value is 0. If prints
    is False, the prints will be desactivated'''
    
    G = g.copy()
    final_edges = cut_weights_threshold(final_weights, threshold)
    G = nx.empty_graph()
    G.add_weighted_edges_from(final_edges)
    #G = G.subgraph(max(nx.connected_components(G), key=len))

    if prints == True:
        print('Size: ', G.size())
        print('Number of nodes: ', G.number_of_nodes())

    return G

In [29]:
G_plot = network_with_threshold(G, -1)
plt.figure(figsize=(10,10))
nx.draw(G, pos=nx.spring_layout(G_plot),with_labels = True) 

Size:  19179
Number of nodes:  518


<IPython.core.display.Javascript object>

# To characterize the network
Produce some measures to characterize the network at different weight thresholds.


In [30]:
j = 0
nodes = list(G_n.nodes())
print(G_n.degree(nodes[j]))

389


In [31]:
def average_degree(G):
    avg_degree = 0
    j = 0 
    sumDeg = 0 #This is going to work if the Network has at least one node, which is the case.
    nodes = list(G.nodes())
    #G.degree(nodes[j])

    while j < G.number_of_nodes(): 
        #sumDeg = sumDeg + nx.degree(G)[j]
        sumDeg = sumDeg + G.degree(nodes[j])
        j += 1
    avg_degree = sumDeg / G.number_of_nodes()
    return avg_degree

def sigma_z(G):
    z_2 = 0
    j = 0 
    sumDeg = 0 #This is going to work if the Network has at least one node, which is the case.
    nodes = list(G.nodes())
    
    while j < G.number_of_nodes(): #becareful with the end
        sumDeg = sumDeg + G.degree(nodes[j])**2
        j += 1

    z_2 = sumDeg / G.number_of_nodes()

    z__2 = 0
    j = 0
    sumDeg = 0 #This is going to work if the Network has at least one node, which is the case.

    while j < G.number_of_nodes():
        sumDeg = sumDeg + G.degree(nodes[j])
        j += 1

    z__2 = (sumDeg / G.number_of_nodes())**2

    sigma_z = math.sqrt(z_2 - z__2)

    return(sigma_z)

def measures(G):
    N = G.number_of_nodes()

    #L = number of edges
    L = G.number_of_edges()

    #⟨z⟩ = avg_degree
    avg_degree = average_degree(G)

    #σ_z = sig_z
    sig_z = sigma_z(G)

    #Cnet
    Cnet = nx.transitivity(G)
    
    to_check_size = G.subgraph(max(nx.connected_components(G), key=len))
    N_largest_comp = nx.number_of_nodes(to_check_size)
    
    if nx.connected.is_connected(G):
        #⟨⟨d⟩⟩
        avg_avgd = nx.average_shortest_path_length(G)

        #d_diam
        d_diam = nx.diameter(G)

    else:
        G_con_subg = G.subgraph(max(nx.connected_components(G), key=len))

        #⟨⟨d⟩⟩
        avg_avgd = nx.average_shortest_path_length(G_con_subg)

        #d_diam
        d_diam = nx.diameter(G_con_subg)

    return N, L, avg_degree, sig_z, Cnet, N_largest_comp, avg_avgd, d_diam

def calculate_measures(G, list_thresholds):
    N_g = []
    L_g = []
    avg_degree_g = []
    sig_z_g = []
    Cnet_g = []
    N_largest_comp_g = []
    avg_avgd_g = []
    d_diam_g = []
    
    for t in list_thresholds:
        g_threshold = network_with_threshold(G, t, False)
        N, L, avg_degree, sig_z, Cnet, N_largest_comp, avg_avgd, d_diam = measures(g_threshold)
        N_g.append(N)
        L_g.append(L)
        avg_degree_g.append(avg_degree)
        sig_z_g.append(sig_z)
        Cnet_g.append(Cnet)
        N_largest_comp_g.append(N_largest_comp)
        avg_avgd_g.append(avg_avgd)
        d_diam_g.append(d_diam)
        
    return N_g, L_g, avg_degree_g, sig_z_g, Cnet_g, N_largest_comp_g, avg_avgd_g, d_diam_g

In [32]:
average_degree(G)

74.05019305019304

In [34]:
list_thresholds = np.linspace(0.000002, 0.9999, 1000)
measures_tup = calculate_measures(G, list_thresholds)
with open('measuresv3.pickle', 'wb') as f:
    pickle.dump(measures_tup, f)

In [35]:
list_thresholds = np.linspace(0.000002, 0.9999, 1000)
with open('measuresv3.pickle', 'rb') as f:
     measures_tup = pickle.load(f)
fig, ((ax1, ax6), (ax2, ax3), (ax4, ax5), (ax7, ax8)) = plt.subplots(4, 2, figsize = (10,10), 
                                                                     sharex=True)
fig.suptitle('Network measures for different weight thresholds')
ax1.set_title('Number of nodes & Number of nodes in the largest component')
ax1.plot(list_thresholds, measures_tup[0], alpha = 0.6, color = '#1A5276', 
         label='N')
ax1.plot(list_thresholds, measures_tup[5], alpha = 0.6, color = '#D4AC0D', 
         label='N in largest comp.')

ax1.legend()

ax6.set_title('Proportion of nodes  in the largest component')
prop = list(map(truediv, measures_tup[5], measures_tup[0]))
ax6.plot(list_thresholds, prop, alpha = 0.6, color = '#023130', label='N in largest comp.')


ax2.set_title('Number of Links')
ax2.plot(list_thresholds, measures_tup[1], alpha = 0.6, color = '#023130')

ax3.set_title('Average degree')
ax3.plot(list_thresholds, measures_tup[2], alpha = 0.6, color = '#023130')

ax4.set_title('Degree variability')
ax4.plot(list_thresholds, measures_tup[3], alpha = 0.6, color = '#023130')

ax5.set_title('Global clustering')
ax5.plot(list_thresholds, measures_tup[4], alpha = 0.6, color = '#023130')

ax7.set_title('Average average geodesic distance')
ax7.plot(list_thresholds, measures_tup[6], alpha = 0.6, color = '#023130')


ax8.set_title('Diameter')
ax8.plot(list_thresholds, measures_tup[7], alpha = 0.6, color = '#023130')


plt.xlabel('Weight threshold', loc='left')
plt.tight_layout()
plt.savefig('Measures.png', dpi=1000)
plt.show()
#for ax in fig.get_axes():
 #   ax.label_outer()

<IPython.core.display.Javascript object>

In [36]:
list_thresholds = np.linspace(0.00002, 0.1, 1000)
measures_tup = calculate_measures(G, list_thresholds)
with open('measures_2v4.pickle', 'wb') as f:
    pickle.dump(measures_tup, f)

In [37]:
with open('measures_2v4.pickle', 'rb') as f:
    measures_tup2 = pickle.load(f)
list_thresholds = np.linspace(0.00002, 0.1, 1000)

t1 = 0.003
t2 = 0.009
t3 = 0.012

fig, ((ax1, ax6), (ax2, ax3), (ax4, ax5), (ax7, ax8)) = plt.subplots(4, 2, figsize = (10,10), 
                                                                     sharex=True)
fig.suptitle('Network measures for different weight thresholds')
ax1.set_title('Number of nodes & Number of nodes in the largest component')
ax1.plot(list_thresholds, measures_tup[0], alpha = 0.6, color = '#1A5276', 
         label='N')
ax1.plot(list_thresholds, measures_tup[5], alpha = 0.6, color = '#D4AC0D', 
         label='N in largest comp.')
ax1.vlines(t1, 0, max(measures_tup[0]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax1.vlines(t2, 0, max(measures_tup[0]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax1.vlines(t3, 0, max(measures_tup[0]), colors='#A569BD', 
           label = 't={}'.format(t3), linestyles='solid')
ax1.legend()

ax6.set_title('Proportion of nodes  in the largest component')
prop = list(map(truediv, measures_tup[5], measures_tup[0]))
ax6.plot(list_thresholds, prop, alpha = 0.6, color = '#023130', label='N in largest comp.')
ax6.vlines(t1, min(prop), max(prop), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax6.vlines(t2, min(prop), max(prop), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax6.vlines(t3, min(prop), max(prop),colors='#A569BD', 
           label = 't={}'.format(t3),linestyles='solid')


ax2.set_title('Number of Links')
ax2.plot(list_thresholds, measures_tup[1], alpha = 0.6, color = '#023130')
ax2.vlines(t1, min(measures_tup[1]), max(measures_tup[1]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax2.vlines(t2, min(measures_tup[1]), max(measures_tup[1]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax2.vlines(t3, min(measures_tup[1]), max(measures_tup[1]), colors='#A569BD', 
           label = 't={}'.format(t3),linestyles='solid')
ax3.set_title('Average degree')
ax3.plot(list_thresholds, measures_tup[2], alpha = 0.6, color = '#023130')
ax3.vlines(t1, min(measures_tup[2]), max(measures_tup[2]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax3.vlines(t2, min(measures_tup[2]), max(measures_tup[2]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax3.vlines(t3, min(measures_tup[2]), max(measures_tup[2]), colors='#A569BD', 
           label = 't={}'.format(t3), linestyles='solid')
ax4.set_title('Degree variability')
ax4.plot(list_thresholds, measures_tup[3], alpha = 0.6, color = '#023130')
ax4.vlines(t1, min(measures_tup[3]), max(measures_tup[3]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax4.vlines(t2, min(measures_tup[3]), max(measures_tup[3]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax4.vlines(t3, min(measures_tup[3]), max(measures_tup[3]), colors='#A569BD', 
           label = 't={}'.format(t3),
           linestyles='solid')
ax5.set_title('Global clustering')
ax5.plot(list_thresholds, measures_tup[4], alpha = 0.6, color = '#023130')
ax5.vlines(t1, min(measures_tup[4]), max(measures_tup[4]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax5.vlines(t2, min(measures_tup[4]), max(measures_tup[4]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax5.vlines(t3, min(measures_tup[4]), max(measures_tup[4]), colors='#A569BD', 
           label = 't={}'.format(t1),linestyles='solid')

ax7.set_title('Average average geodesic distance')
ax7.plot(list_thresholds, measures_tup[6], alpha = 0.6, color = '#023130')
ax7.vlines(t1, min(measures_tup[6]), max(measures_tup[6]), colors='#EC7063', 
           label = 't={}'.format(t2), linestyles='solid')
ax7.vlines(t2, min(measures_tup[6]), max(measures_tup[6]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax7.vlines(t3, min(measures_tup[6]), max(measures_tup[6]), colors='#A569BD', 
           label = 't={}'.format(t3),linestyles='solid')
ax8.set_title('Diameter')
ax8.plot(list_thresholds, measures_tup[7], alpha = 0.6, color = '#023130')
ax8.vlines(t1, min(measures_tup[7]), max(measures_tup[7]), colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
ax8.vlines(t2, min(measures_tup[7]), max(measures_tup[7]), colors='#F39C12', 
           label = 't={}'.format(t2),linestyles='solid')
ax8.vlines(t3, min(measures_tup[7]), max(measures_tup[7]), colors='#A569BD', 
           label = 't={}'.format(t1), linestyles='solid')

plt.xlabel('Weight threshold', loc='left')
plt.tight_layout()
plt.savefig('Measures.png', dpi=1000)
plt.show()
#for ax in fig.get_axes():
 #   ax.label_outer()

<IPython.core.display.Javascript object>

In [39]:
t = 0.0003
g_threshold = network_with_threshold(G, t, False)
N, L, avg_degree, sig_z, Cnet, N_largest_comp, avg_avgd, d_diam = measures(g_threshold)
print('With threshold = {}:'.format(t))
print('The number of nodes N is {}' .format(N))
print('The number of edges is {}' .format(L))
print('The average degree ⟨z⟩ is {:.4f}' .format(avg_degree))
print('The degree variability σz is {:.4f}' .format(sig_z))
print('The global clustering Cnet is {:.4f}' .format(Cnet))
print('The average average geodesic distance ⟨⟨d⟩⟩ is {:.4f}' .format(avg_avgd))
print('The diameter d_diam is {} ' .format(d_diam))
print('Note that the number of nodes of the largest component is {}, and the proportion with respect the total nodes is {:.1f}' .format(N_largest_comp, (N_largest_comp/N)))

With threshold = 0.0003:
The number of nodes N is 283
The number of edges is 916
The average degree ⟨z⟩ is 6.4735
The degree variability σz is 4.9430
The global clustering Cnet is 0.7490
The average average geodesic distance ⟨⟨d⟩⟩ is 5.7470
The diameter d_diam is 14 
Note that the number of nodes of the largest component is 215, and the proportion with respect the total nodes is 0.8


In [40]:
t = 0.0009
g_threshold = network_with_threshold(G, t, False)
N, L, avg_degree, sig_z, Cnet, N_largest_comp, avg_avgd, d_diam = measures(g_threshold)
print('With threshold = {}:'.format(t))
print('The number of nodes N is {}' .format(N))
print('The number of edges is {}' .format(L))
print('The average degree ⟨z⟩ is {:.4f}' .format(avg_degree))
print('The degree variability σz is {:.4f}' .format(sig_z))
print('The global clustering Cnet is {:.4f}' .format(Cnet))
print('The average average geodesic distance ⟨⟨d⟩⟩ is {:.4f}' .format(avg_avgd))
print('The diameter d_diam is {} ' .format(d_diam))
print('Note that the number of nodes of the largest component is {}, and the proportion with respect the total nodes is {:.1f}' .format(N_largest_comp, (N_largest_comp/N)))

With threshold = 0.0009:
The number of nodes N is 200
The number of edges is 618
The average degree ⟨z⟩ is 6.1800
The degree variability σz is 4.4088
The global clustering Cnet is 0.8140
The average average geodesic distance ⟨⟨d⟩⟩ is 3.3255
The diameter d_diam is 8 
Note that the number of nodes of the largest component is 51, and the proportion with respect the total nodes is 0.3


In [41]:
t = 0.0012
g_threshold = network_with_threshold(G, t, False)
N, L, avg_degree, sig_z, Cnet, N_largest_comp, avg_avgd, d_diam = measures(g_threshold)
print('With threshold = {}:'.format(t))
print('The number of nodes N is {}' .format(N))
print('The number of edges is {}' .format(L))
print('The average degree ⟨z⟩ is {:.4f}' .format(avg_degree))
print('The degree variability σz is {:.4f}' .format(sig_z))
print('The global clustering Cnet is {:.4f}' .format(Cnet))
print('The average average geodesic distance ⟨⟨d⟩⟩ is {:.4f}' .format(avg_avgd))
print('The diameter d_diam is {} ' .format(d_diam))
print('Note that the number of nodes of the largest component is {}, and the proportion with respect the total nodes is {:.1f}' .format(N_largest_comp, (N_largest_comp/N)))

With threshold = 0.0012:
The number of nodes N is 183
The number of edges is 549
The average degree ⟨z⟩ is 6.0000
The degree variability σz is 4.3394
The global clustering Cnet is 0.8400
The average average geodesic distance ⟨⟨d⟩⟩ is 2.6111
The diameter d_diam is 5 
Note that the number of nodes of the largest component is 37, and the proportion with respect the total nodes is 0.2


In [42]:
''' To convert to igraph'''
h = ig.Graph.from_networkx(G)
h.es[0].attributes()
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6))
layout = h.layout_kamada_kawai()
#ig.plot(h, layout='auto', target='test.pdf') to print to a file
ig.plot(h, layout='auto', 
        edge_width=.2,
        edge_color='grey',
        #vertex_size=h.degree(),
        target=ax,
       label = h.vs['_nx_name'])
plt.axis("off")
plt.show()

<IPython.core.display.Javascript object>

In [43]:
sorted_clustering = dict(nx.clustering(G))
sorted_clustering = dict(sorted(sorted_clustering.items(), key=lambda item: item[1], reverse = True))
first_30_sorted_clustering = list(sorted_clustering)[:30]
for node in first_30_sorted_clustering:
    print(node, ': ', sorted_clustering[node])

visitLosAngeles :  1.0
foodbesat :  1.0
eatLA :  1.0
feastagrem :  1.0
lovefood :  1.0
onthetable :  1.0
seriouseats :  1.0
stayhomesafer :  1.0
yumYum :  1.0
Tasty :  1.0
kjmg :  1.0
growinsilence :  1.0
mindonamillion :  1.0
yourpage :  1.0
sequester :  1.0
abandoned :  1.0
w8time :  1.0
selfpublish :  1.0
angrybirdstransformers :  1.0
angrybirds :  1.0
decepticons :  1.0
mit :  1.0
financialprofessional :  1.0
supersinghs :  1.0
BeatThePandemic :  1.0
support81worldwide :  1.0
support81longisland :  1.0
support81newyorkcity :  1.0
classicnegative :  0.9848484848484849
qurantinelife :  0.978021978021978


In [44]:
def create_palette(partition):
    '''To create an appropiate palette containing the number of colors needed'''
    n = max(partition.values()) + 1
    palette = sns.color_palette("husl", n)
    palette.as_hex()
    i = 0
    color_palette = {}
    for n in set(partition.values()):
        color_palette[n] = palette[i]
        i += 1
    return color_palette
    
def find_best_girvan_newman(G):
    '''To find the best partition using Girvan and Newman algorith.
    It will cut the following level using betweeness. If the modularity is not
    higher, it will retrieve the best result so far (the previous one)'''
    i = 0
    m = 0
    communities_generator = community.girvan_newman(G)
    next_level_communities = next(communities_generator)
    while True:
        m_tmp = community.quality.modularity(G, next_level_communities)
        if (m_tmp < m) | (abs(m_tmp - m) < 0.0001) :
            return next_level_communities
        else:
            m = m_tmp
        #print(i, ': ',m)
        next_level_communities = next(communities_generator)
        i+= 1
        
def partition_dictionary(communities):
    '''Given a list of nodes forming communities, it will retrieve a dictionary
    mentioning every node and its respective number of community'''
    partition = {}
    i = 0
    for group in list(communities):
        for node in group:
            partition[node] = i
        i += 1
    return partition

def dic_to_partition(dic):
    '''Transform a dictionary containing nodes as keys and groups or communities as values
    into a partition'''
    partition = []
    groups= set(dic.values())
    for group in groups:
        set_tmp = set([node for node, g in dic.items() if g == group])
        partition.append(set_tmp)
    return partition

def best_partition(G, algorithm, k=3, prints = True):
    ''' Compute the best partition using algorithms available:
    algorithm = ['louvain','fluid','label','girvan_newman']
    Note that 'fluid' need to set a number of groups k.'''
    
    G_c = G.copy()
    if algorithm == 'louvain':
        partition = community_louvain.best_partition(G_c, weight = 'weight', random_state = 13)
        ######
    elif algorithm == 'fluid':
        #Needs connected graph
        fluid = community.asyn_fluidc(G_c, k, max_iter=100, seed=13)
        partition = partition_dictionary(fluid)
    
    elif algorithm == 'label':
        label = list(community.label_propagation.label_propagation_communities(G_c))
        partition = partition_dictionary(label)
    
    elif algorithm == 'girvan_newman':
        girvan_newman = list(find_best_girvan_newman(G_c))
        partition = partition_dictionary(girvan_newman)
        
    number_groups = len(set(partition.values()))
    color_palette = create_palette(partition)
    groups = {}
    colors = {}
    
    for n in range(number_groups):
        l = [name for name, group in partition.items() if group == n]
        for name in l:
            groups[name] = n
            colors[name] = color_palette[n]
    
    '''Set appropiate atributes for group and associated color '''
    nx.set_node_attributes(G_c, groups, 'group')
    nx.set_node_attributes(G_c, colors, 'color')
    
    if prints == True:
        pos = nx.spring_layout(G_c)
        nx.draw_networkx_nodes(G_c, pos, partition.keys(), node_size=40,
                               node_color = list(nx.get_node_attributes(G_c,'color').values()),
                               label=list(G_c.nodes()))
        nx.draw_networkx_edges(G_c, pos, alpha=0.5)
        plt.suptitle(algorithm.capitalize() +' Community detection')
        plt.title('Number of groups:' + str(number_groups))
        plt.savefig(algorithm.capitalize() +' Community detection', dpi=300)
    
    return G_c, partition

In [45]:
nets = {}
partitions = {}
algorithm = 'label'

''' It uses an iterative method (again just like k-means): the target label will be assigned 
with the most “vote” of the lables from its neighbors; until the current label is the most 
frequent label.'''

G_label, label_partition = best_partition(G, algorithm)
nets[algorithm] = G_label
partitions[algorithm] = label_partition
plt.show()

algorithm = 'girvan_newman'
G_gn, gn_partition = best_partition(G, algorithm)
nets[algorithm] = G_gn
partitions[algorithm] = gn_partition
plt.show()

algorithm = 'louvain'
G_louvain, louvain_partition = best_partition(G, algorithm)
nets[algorithm] = G_louvain
partitions[algorithm] = louvain_partition
plt.show()

<IPython.core.display.Javascript object>

In [46]:
'''To store the partitions'''
with open('Partitionsv3.pickle', 'wb') as handle:
    pickle.dump(partitions, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [47]:
len(partitions)

3

In [48]:
def create_df_partitions(partitions):
    algorithm = list(partitions.keys())
    for a in algorithm:
        if a == algorithm[0]:
            df = pd.DataFrame(set(partitions[a].items()), columns=['Hashtag', a])
        else:
            df_tmp = pd.DataFrame(set(partitions[a].items()), columns=['Hashtag', a])
            df = df.merge(df_tmp, how='inner', on='Hashtag', copy=False).reset_index(drop=True)
    return df

def group(df, algorithm, hashtag):
    group = df.loc[df['Hashtag'] == hashtag][algorithm].to_numpy()[0]
    return group

def edge_coincidences_community(G, df, algorithm):
    '''Given a network G, a dataframe df containing communities with different algorithms,
    and an algorithm, it will return a list n. The list will contain a 1 per edges that 
    are in the same community, and a 0 if they are at different communities '''
    edges = list(G.edges())

    n = []
    for u, v in edges:

        group_u = group(df, algorithm, u)
        group_v = group(df, algorithm, v)

        if group_u == group_v:
            n.append(1)
        else:
            n.append(0)
    return n

def generate_edge_coincidences_community(G, df, partitions):
    algorithm = list(partitions.keys())
    edge_coincidences = {}
    for a in tqdm(algorithm):
        n = edge_coincidences_community(G, df, a)
        edge_coincidences[a] = n
    return edge_coincidences

def generate_df_nmi_ars_partitions(G, partitions, measure= 'MI_score'):
    
    '''measure = ['MI_score', 'Adj_rand_score']'''
    df = create_df_partitions(partitions)
    edge_coincidences = generate_edge_coincidences_community(G, df, partitions)
    algorithm = list(partitions.keys())
    data = []
    i = 0
    j = 0

    while i < len(algorithm):
        nmi_tmp = []
        ars_tmp = []
        while j < len(algorithm):

            a1 = algorithm[i]
            a2 = algorithm[j]

            nmi_tmp = normalized_mutual_info_score(edge_coincidences[a1], 
                                                   edge_coincidences[a2])
            ars_tmp = adjusted_rand_score(edge_coincidences[a1], edge_coincidences[a2])

            data.append([algorithm[i], algorithm[j], nmi_tmp, ars_tmp])
            j += 1
        j = 0
        i += 1
        
    df_a = pd.DataFrame(data, columns = ['Algorithm1', 'Algorithm2', 'MI_score', 'Adj_rand_score'])
    
    df_a = df_a.pivot('Algorithm1', 'Algorithm2', measure)
    
    return df_a

In [49]:
df_a = generate_df_nmi_ars_partitions(G, partitions)
df_a.head()
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(df_a, annot=True, linewidths=.5, ax=ax)
plt.title('Mutual Information Score')
plt.show()

  0%|          | 0/3 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

# Quality of the partitions

In [50]:
modularity = {}
coverage = {}
performance = {}
n_communities = {}
#The performance of a partition is the number of intra-community edges plus inter-community 
#non-edges divided by the total number of potential edges.
#https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.quality.performance.html#networkx.algorithms.community.quality.performance
for algorithm, partition in partitions.items():
    p = dic_to_partition(partition)
    coverage[algorithm] = community.quality.coverage(G, p)
    performance[algorithm] = community.quality.partition_quality(G, p)[1]
    modularity[algorithm] = community.quality.modularity(G, p)
    n_communities[algorithm] = len(set(partitions[algorithm].values()))

In [51]:
plt.figure()

colors = sns.color_palette("magma", 3)
plt.plot(modularity.keys(), modularity.values(), label = 'modularity', color = colors[0])
x = 0
y = 0

plt.plot(coverage.keys(), coverage.values(), label = 'coverage', color = colors[1])
plt.plot(performance.keys(), performance.values(), label = 'performance', color = colors[2])
plt.plot()
plt.legend()
i = 0
l = []
for v in n_communities.values():
    plt.annotate('N:'+ str(v), (list(modularity.keys())[i], 0.0))
    l.append('N:'+ str(v))
    i += 1
plt.title('Comparison of Community detection algorithms')
plt.show()

<IPython.core.display.Javascript object>

# Different thresholds

In [52]:
thresholds = np.linspace(0.00002, 0.005, 50)

nets_t = {}
partitions_t = {}
algorithm = 'label'
for t in thresholds:
    g_tmp = nx.empty_graph()
    g_tmp = network_with_threshold(G, t, False)
    G_label, label_partition = best_partition(g_tmp, algorithm, False)
    nets_t[algorithm+str(t)] = G_label
    partitions_t[algorithm+str(t)] = label_partition

In [53]:
modularity = {}
coverage = {}
performance = {}
n_communities = {}
for algorithm, partition in partitions_t.items():
    p = dic_to_partition(partition)
    coverage[algorithm] = community.quality.coverage(nets_t[algorithm], p)
    performance[algorithm] = community.quality.partition_quality(nets_t[algorithm], p)[1]
    modularity[algorithm] = community.quality.modularity(nets_t[algorithm], p)
    n_communities[algorithm] = len(set(partitions_t[algorithm].values()))

In [54]:
plt.figure()
colors = sns.color_palette("magma", 3)
plt.plot(thresholds, modularity.values(), label = 'modularity', color = colors[0])
plt.plot(thresholds, coverage.values(), label = 'coverage', color = colors[1])
plt.plot(thresholds, performance.values(), label = 'performance', color = colors[2])
plt.legend()
i = 0
l = []
for v in n_communities.values():
    #plt.annotate('N:'+ str(v), (list(modularity.keys())[i], 0.0))
    l.append('N:'+ str(v))
    i += 1
print(l)
plt.title('Label propagation algorithm, different thresholds')
plt.savefig('Label propagation, different thresholds', dpi=300)
plt.show()

<IPython.core.display.Javascript object>

['N:6', 'N:64', 'N:54', 'N:51', 'N:44', 'N:43', 'N:41', 'N:39', 'N:38', 'N:33', 'N:30', 'N:32', 'N:32', 'N:32', 'N:32', 'N:32', 'N:31', 'N:31', 'N:30', 'N:30', 'N:29', 'N:30', 'N:29', 'N:29', 'N:30', 'N:27', 'N:27', 'N:27', 'N:29', 'N:29', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:26', 'N:25', 'N:25', 'N:25', 'N:25', 'N:25', 'N:25', 'N:24', 'N:24', 'N:24', 'N:23', 'N:23']


In [55]:
thresholds = np.linspace(0.00002, 0.005, 50)
nets_tt = {}
partitions_tt = {}
algorithm = 'louvain'
for t in thresholds:
    g_tmp = nx.empty_graph()
    g_tmp = network_with_threshold(G, t, False)
    G_louvain, louvain_partition = best_partition(g_tmp, algorithm, False)
    nets_tt[algorithm+str(t)] = G_louvain
    partitions_tt[algorithm+str(t)] = louvain_partition

In [58]:
modularity = {}
coverage = {}
performance = {}
n_communities = {}
t1 = 0.0003
t2 = 0.0009
t3 = 0.0012
#The performance of a partition is the number of intra-community edges plus inter-community 
#non-edges divided by the total number of potential edges.
#https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.quality.performance.html#networkx.algorithms.community.quality.performance
for algorithm, partition in partitions_tt.items():
    p = dic_to_partition(partition)
    coverage[algorithm] = community.quality.coverage(nets_tt[algorithm], p)
    performance[algorithm] = community.quality.partition_quality(nets_tt[algorithm], p)[1]
    modularity[algorithm] = community.quality.modularity(nets_tt[algorithm], p)
    n_communities[algorithm] = len(set(partitions_tt[algorithm].values()))
plt.figure()
colors = sns.color_palette("magma", 3)
plt.plot(thresholds, modularity.values(), label = 'modularity', color = colors[0])
plt.plot(thresholds, coverage.values(), label = 'coverage', color = colors[1])
plt.plot(thresholds, performance.values(), label = 'performance', color = colors[2])
plt.legend()
l = []
i = 0
for v in n_communities.values():
    #plt.annotate('N:'+ str(v), (list(modularity.keys())[i], 0.0))
    l.append('N:'+ str(v))
    i += 1
print(l)
plt.title('Louvain algorithm, different thresholds')
plt.savefig('Louvain, different thresholds1', dpi=600)
#plt.vlines(0.92, 0, 1, colors='red', linestyles='solid',)
plt.vlines(t1, 0, 1, colors='#EC7063', 
           label = 't={}'.format(t1), linestyles='solid')
plt.vlines(t2, 0, 1, colors='#F39C12', label = 't={}'.format(t2),
           linestyles='solid')
plt.vlines(t3, 0, 1, colors='#A569BD', label = 't={}'.format(t3),
           linestyles='solid')
plt.xlabel('Weight Threshold')
plt.ylabel('Measure')
plt.legend()#loc='right')
plt.show()

<IPython.core.display.Javascript object>

['N:12', 'N:27', 'N:33', 'N:38', 'N:36', 'N:33', 'N:34', 'N:34', 'N:34', 'N:30', 'N:27', 'N:29', 'N:29', 'N:29', 'N:27', 'N:28', 'N:26', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:27', 'N:28', 'N:28', 'N:26', 'N:26', 'N:26', 'N:26', 'N:26', 'N:26', 'N:26', 'N:26', 'N:25', 'N:24', 'N:24', 'N:24', 'N:24', 'N:24', 'N:24', 'N:23', 'N:23', 'N:23', 'N:22', 'N:23']


In [116]:
'''Selected Net, using threshold weight = 0.001 and Louvain algorithm'''
algorithm = 'louvain'
g = nx.empty_graph()
g = network_with_threshold(G, 0.001, False)
t_01, p_01 = best_partition(g, algorithm)
plt.show()
add_similarity_attr(t_01)
add_correlation_attr(t_01)
nx.write_gpickle(t_01, "Net_1v3.gpickle")

<IPython.core.display.Javascript object>

  0%|          | 0/591 [00:00<?, ?it/s]

In [115]:
'''Selected Net, using threshold weight = 0.002 and Louvain algorithm'''
algorithm = 'louvain'
g = nx.empty_graph()
g = network_with_threshold(G, 0.002, False)
t_02, p_02 = best_partition(g, algorithm)
plt.show()
add_similarity_attr(t_02)
add_correlation_attr(t_02)
nx.write_gpickle(t_02, "Net_2v3.gpickle")

<IPython.core.display.Javascript object>

  0%|          | 0/458 [00:00<?, ?it/s]

In [114]:
'''Selected Net, using threshold weight = 0.003 and Louvain algorithm'''
algorithm = 'louvain'
g = nx.empty_graph()
g = network_with_threshold(G, 0.003, False)
t_03, p_03 = best_partition(g, algorithm)
add_similarity_attr(t_03)
add_correlation_attr(t_03)
nx.write_gpickle(t_03, "Net_3v3.gpickle")

<IPython.core.display.Javascript object>

  0%|          | 0/392 [00:00<?, ?it/s]

# Compare algorithms with defined thresholds

In [62]:
'''threshold weight = 0.0003 all the algorithms. Label k=3 by default'''
algorithms = ['louvain','label','girvan_newman']
partitions_01 = {}
for a in algorithms:
    g = nx.empty_graph()
    g = network_with_threshold(G, 0.0003, False)
    t_01, p_01 = best_partition(g, a)
    partitions_01[a] = p_01

In [63]:
df_t01 = generate_df_nmi_ars_partitions(g, partitions_01)

  0%|          | 0/3 [00:00<?, ?it/s]

In [64]:
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(df_t01, annot=True, linewidths=.5, ax=ax)
plt.title('Mutual Information Score, t=0.0003')
plt.show()

<IPython.core.display.Javascript object>

In [65]:
'''threshold weight = 0.0009 all the algorithms. Label k=3 by default'''
algorithms = ['louvain','label','girvan_newman']
partitions_02 = {}
for a in algorithms:
    g = nx.empty_graph()
    g = network_with_threshold(G, 0.0009, False)
    t_02, p_02 = best_partition(g, a)
    partitions_02[a] = p_02

In [66]:
df_t02 = generate_df_nmi_ars_partitions(g, partitions_02)

  0%|          | 0/3 [00:00<?, ?it/s]

In [67]:
df_t02.head()

Algorithm2,girvan_newman,label,louvain
Algorithm1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
girvan_newman,1.0,0.257105,0.174573
label,0.257105,1.0,0.509949
louvain,0.174573,0.509949,1.0


In [68]:
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(df_t02, annot=True, linewidths=.5, ax=ax)
plt.title('Mutual Information Score, t=0.0009')
plt.show()

<IPython.core.display.Javascript object>

In [69]:
'''threshold weight = 0.0012 all the algorithms. Label k=3 by default'''
algorithms = ['louvain','label','girvan_newman']
partitions_03 = {}
for a in algorithms:
    g = nx.empty_graph()
    g = network_with_threshold(G, 0.0012, False)
    t_03, p_03 = best_partition(g, a)
    plt.show()
    partitions_03[a] = p_03

In [70]:
df_t03 = generate_df_nmi_ars_partitions(g, partitions_03)
f, ax = plt.subplots(figsize=(9, 6))
sns.heatmap(df_t03, annot=True, linewidths=.5, ax=ax)
plt.title('Mutual Information Score, t=0.0012')
plt.show()

  0%|          | 0/3 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [71]:
def find_hashtag_community(hashtag, algorithm):
    p = partitions[algorithm]
    #print(p)
    com = p[hashtag]
    return dic_to_partition(p)[com]

In [72]:
for algorithm, net in nets.items():
    add_similarity_attr(net)
    add_correlation_attr(net)
    nx.write_gpickle(G, "Net_"+algorithm+"v3.gpickle")

  0%|          | 0/19179 [00:00<?, ?it/s]

  0%|          | 0/19179 [00:00<?, ?it/s]

  0%|          | 0/19179 [00:00<?, ?it/s]

# Plot different thresholds

In [102]:
%matplotlib notebook
import matplotlib.pyplot as plt
algorithm = 'louvain'
g_01 = nx.empty_graph()
g_01 = network_with_threshold(G, 0.001, False)
g_01, g_01_partition = best_partition(g_01, algorithm, True)
plt.show()
add_similarity_attr(g_01)
add_correlation_attr(g_01)

<IPython.core.display.Javascript object>

  0%|          | 0/591 [00:00<?, ?it/s]

In [103]:
algorithm = 'louvain'
g_02 = nx.empty_graph()
g_02 = network_with_threshold(G, 0.002, False)
g_02, g_02_partition = best_partition(g_02, algorithm, True)
plt.show()
add_similarity_attr(g_02)
add_correlation_attr(g_02)

<IPython.core.display.Javascript object>

  0%|          | 0/458 [00:00<?, ?it/s]

In [104]:
algorithm = 'louvain'
g_03 = nx.empty_graph()
g_03 = network_with_threshold(G, 0.003, False)
g_03, g_03_partition = best_partition(g_03, algorithm, True)
plt.show()
add_similarity_attr(g_03)
add_correlation_attr(g_03)

<IPython.core.display.Javascript object>

  0%|          | 0/392 [00:00<?, ?it/s]

In [98]:
algorithm = 'louvain'
g_0 = nx.empty_graph()
g_0 = network_with_threshold(G, 0.005, False)
g_0, g_0_partition = best_partition(g_0, algorithm, True)
plt.show()
add_similarity_attr(g_0)
add_correlation_attr(g_0)

groups_dic =nx.get_node_attributes(g_0,'group')
groups = list(set(groups_dic.values()))
selected_nodes = []
for g in groups:
    list_nodes = [node for node, v in groups_dic.items() if v == g]
    sub_g = g_0.subgraph(list_nodes)
    sorted_nodes = sorted(sub_g.degree, key=lambda x: x[1], reverse=True)
    #print(sorted_nodes)
    i = 0
    for node, degree in sorted_nodes:
        if i == 5:
            break
        selected_nodes.append(node)
        i += 1

<IPython.core.display.Javascript object>

  0%|          | 0/311 [00:00<?, ?it/s]

In [105]:
print(len(g_0.edges()))
print(len(selected_nodes)) 

311
82


In [106]:
g_0.number_of_nodes() 

114

In [107]:
g_0.remove_nodes_from([n for n in g_0 if n not in set(selected_nodes)])
print(g_0.number_of_nodes())
print(g_0.number_of_edges()) #2601

82
121


In [108]:
pos = nx.spring_layout(g_0)
nx.draw_networkx_nodes(g_0, pos, node_size=40,
                       node_color = list(nx.get_node_attributes(g_0,'color').values()),
                       label=list(g_01.nodes()))
nx.draw_networkx_edges(g_0, pos, alpha=0.5)
plt.suptitle('Louvain Community detection')
plt.title('Number of groups:' + str(len(list(set(groups_dic.values())))))
plt.savefig('Louvain Community detection Selected nodes', dpi=300)
nx.write_gpickle(g_0, "Net_Louvain_top5v1.gpickle")

<IPython.core.display.Javascript object>

In [109]:
g_01_partition

{'coronavirus': 0,
 'sequester': 0,
 'abandoned': 0,
 'covid_19': 1,
 'bw': 1,
 'bnw': 1,
 'lensculture': 1,
 'streetphotographer': 1,
 'urbanstreetphotogallery': 1,
 'photodocumentary': 1,
 'ig_street': 1,
 'urbanstreetphotography': 1,
 'spicollective': 1,
 'classicnegative': 1,
 'w8time': 1,
 'lockdown': 2,
 'transformers': 2,
 'angrybirdstransformers': 2,
 'angrybirds': 2,
 'decepticons': 2,
 'iosgames': 2,
 'pandemic': 3,
 'lilbaby': 3,
 'lilwayne': 3,
 'kjmg': 3,
 'growinsilence': 3,
 'mindonamillion': 3,
 'yourpage': 3,
 'shelterinplace': 4,
 'qurantinelife': 4,
 'covid19': 1,
 'faceshield': 5,
 'SaveTheWorld': 5,
 'BillionShieldsChallenge': 5,
 'BillionShields': 5,
 'Bottles2Shields': 5,
 'cityscape': 1,
 'covid': 0,
 'Coronavirus': 5,
 'MayThe4thBeWithYou': 5,
 'ExOWorldNow': 5,
 'Masks4All': 5,
 'TogetherAtHome': 5,
 'earthday': 5,
 'mit': 5,
 'LOCKDOWN2020': 5,
 'COVID19': 6,
 'JamminJo': 7,
 'JoAnnBush': 7,
 'crackilton': 6,
 'crackiltonOMT': 6,
 'squeegeekid': 6,
 'carnysto

In [110]:
g_03_partition

{'covid_19': 0,
 'streetphotographer': 0,
 'urbanstreetphotogallery': 0,
 'photodocumentary': 0,
 'ig_street': 0,
 'urbanstreetphotography': 0,
 'spicollective': 0,
 'w8time': 0,
 'covid19': 0,
 'BillionShieldsChallenge': 1,
 'Coronavirus': 1,
 'MayThe4thBeWithYou': 1,
 'faceshield': 1,
 'SaveTheWorld': 1,
 'ExOWorldNow': 1,
 'BillionShields': 1,
 'Bottles2Shields': 1,
 'Masks4All': 1,
 'TogetherAtHome': 1,
 'earthday': 1,
 'mit': 1,
 'LOCKDOWN2020': 1,
 'hiphop': 2,
 'lilbaby': 2,
 'lilwayne': 2,
 'kjmg': 2,
 'growinsilence': 2,
 'mindonamillion': 2,
 'yourpage': 2,
 'trending': 2,
 'viralvideos': 2,
 'Virus': 3,
 'K': 3,
 'workfromhome': 4,
 'financialprofessional': 4,
 'supersinghs': 4,
 'Covid': 1,
 'Covid19': 5,
 'Google': 5,
 'Featured': 5,
 'NEWS': 5,
 'India': 5,
 'Entertainment': 5,
 'Bollywood': 5,
 'flattenthecurve': 7,
 'dailywalk': 7,
 'itsbetteroutside': 7,
 'motherdaughtertime': 7,
 'masks': 8,
 'sequester': 8,
 'abandoned': 8,
 'doingmypart': 9,
 'coronavirüsü': 9,
 'we

In [111]:
g_02.nodes(data=True)

NodeDataView({'covid_19': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'bw': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'lensculture': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'streetphotographer': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'urbanstreetphotogallery': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'photodocumentary': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'ig_street': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'urbanstreetphotography': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'spicollective': {'group': 0, 'color': (0.9677975592919913, 0.44127456009157356, 0.5358103155058701)}, 'w8time': {'group': 0, 'color': (0.9677975592919913, 0

In [112]:
g_01 = network_with_threshold(G, 0.01, False)
degree_sequence = sorted([(d, n) for n, d in g_01.degree()], reverse=True)
degree_sequence

[(13, 'SaveTheWorld'),
 (13, 'BillionShields'),
 (11, 'Bottles2Shields'),
 (11, 'BillionShieldsChallenge'),
 (10, 'urbanstreetphotography'),
 (10, 'urbanstreetphotogallery'),
 (10, 'streetphotographer'),
 (10, 'spicollective'),
 (10, 'photodocumentary'),
 (10, 'ig_street'),
 (10, 'Coronavirus'),
 (9, 'lensculture'),
 (9, 'cityscape'),
 (9, 'bw'),
 (9, 'bnw'),
 (8, 'faceshield'),
 (7, 'Masks4All'),
 (6, 'yourpage'),
 (6, 'viralvideos'),
 (6, 'squeegeekid'),
 (6, 'setomas'),
 (6, 'selfpublish'),
 (6, 'mindonamillion'),
 (6, 'lilwayne'),
 (6, 'lilbaby'),
 (6, 'kjmg'),
 (6, 'hamont'),
 (6, 'growinsilence'),
 (6, 'crackiltonOMT'),
 (6, 'crackilton'),
 (6, 'classicnegative'),
 (6, 'carnystories'),
 (6, 'ExOWorldNow'),
 (5, 'yumYum'),
 (5, 'stayhomesafer'),
 (5, 'seriouseats'),
 (5, 'lovefood'),
 (5, 'feastagrem'),
 (5, 'Tasty'),
 (5, 'MayThe4thBeWithYou'),
 (4, 'transformers'),
 (4, 'thisis51'),
 (4, 'sweatyselfie'),
 (4, 'iosgames'),
 (4, 'decepticons'),
 (4, 'db365'),
 (4, 'angrybirdstrans

In [95]:
g_02 = network_with_threshold(G, 0.02, False)
degree_sequence = sorted([(d, n) for n, d in g_01.degree()], reverse=True)
degree_sequence

[(13, 'SaveTheWorld'),
 (13, 'BillionShields'),
 (11, 'Bottles2Shields'),
 (11, 'BillionShieldsChallenge'),
 (10, 'urbanstreetphotography'),
 (10, 'urbanstreetphotogallery'),
 (10, 'streetphotographer'),
 (10, 'spicollective'),
 (10, 'photodocumentary'),
 (10, 'ig_street'),
 (10, 'Coronavirus'),
 (9, 'lensculture'),
 (9, 'cityscape'),
 (9, 'bw'),
 (9, 'bnw'),
 (8, 'faceshield'),
 (7, 'Masks4All'),
 (6, 'yourpage'),
 (6, 'viralvideos'),
 (6, 'squeegeekid'),
 (6, 'setomas'),
 (6, 'selfpublish'),
 (6, 'mindonamillion'),
 (6, 'lilwayne'),
 (6, 'lilbaby'),
 (6, 'kjmg'),
 (6, 'hamont'),
 (6, 'growinsilence'),
 (6, 'crackiltonOMT'),
 (6, 'crackilton'),
 (6, 'classicnegative'),
 (6, 'carnystories'),
 (6, 'ExOWorldNow'),
 (5, 'yumYum'),
 (5, 'stayhomesafer'),
 (5, 'seriouseats'),
 (5, 'lovefood'),
 (5, 'feastagrem'),
 (5, 'Tasty'),
 (5, 'MayThe4thBeWithYou'),
 (4, 'transformers'),
 (4, 'thisis51'),
 (4, 'sweatyselfie'),
 (4, 'iosgames'),
 (4, 'decepticons'),
 (4, 'db365'),
 (4, 'angrybirdstrans

In [96]:
g_03 = network_with_threshold(G, 0.03, False)
degree_sequence = sorted([(d, n) for n, d in g_01.degree()], reverse=True)
degree_sequence

[(13, 'SaveTheWorld'),
 (13, 'BillionShields'),
 (11, 'Bottles2Shields'),
 (11, 'BillionShieldsChallenge'),
 (10, 'urbanstreetphotography'),
 (10, 'urbanstreetphotogallery'),
 (10, 'streetphotographer'),
 (10, 'spicollective'),
 (10, 'photodocumentary'),
 (10, 'ig_street'),
 (10, 'Coronavirus'),
 (9, 'lensculture'),
 (9, 'cityscape'),
 (9, 'bw'),
 (9, 'bnw'),
 (8, 'faceshield'),
 (7, 'Masks4All'),
 (6, 'yourpage'),
 (6, 'viralvideos'),
 (6, 'squeegeekid'),
 (6, 'setomas'),
 (6, 'selfpublish'),
 (6, 'mindonamillion'),
 (6, 'lilwayne'),
 (6, 'lilbaby'),
 (6, 'kjmg'),
 (6, 'hamont'),
 (6, 'growinsilence'),
 (6, 'crackiltonOMT'),
 (6, 'crackilton'),
 (6, 'classicnegative'),
 (6, 'carnystories'),
 (6, 'ExOWorldNow'),
 (5, 'yumYum'),
 (5, 'stayhomesafer'),
 (5, 'seriouseats'),
 (5, 'lovefood'),
 (5, 'feastagrem'),
 (5, 'Tasty'),
 (5, 'MayThe4thBeWithYou'),
 (4, 'transformers'),
 (4, 'thisis51'),
 (4, 'sweatyselfie'),
 (4, 'iosgames'),
 (4, 'decepticons'),
 (4, 'db365'),
 (4, 'angrybirdstrans

In [113]:
algorithm = 'louvain'
g_f = nx.empty_graph()
g_f = network_with_threshold(G, 0, False)
g_f, g_f_partition = best_partition(g_f, algorithm, True)
plt.show()
add_similarity_attr(g_f)
add_correlation_attr(g_f)
nx.write_gpickle(g_f, "Net_Louvain_full.gpickle")

<IPython.core.display.Javascript object>

  0%|          | 0/19179 [00:00<?, ?it/s]