# Libraries

In [22]:
import sys
import os
directory_path = os.path.abspath(os.path.join('..'))
utils_path = os.path.abspath(os.path.join('../utils'))
if directory_path not in sys.path:
    sys.path.append(directory_path)
    sys.path.append(utils_path)

In [23]:
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from utils.Validator import *
from utils.Combinations import *
from utils.Network import *
pd.set_option('display.max_columns', None)  
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Digraph: User Followers

In [24]:
df_user_follower = pd.read_csv("../data/processed/DiGraph_Followers.csv")
df_user_follower.head()

Unnamed: 0,UserId,FollowerUserId
0,368,993
1,368,1902
2,368,1950
3,368,3258
4,368,3429


In [27]:
GUF = nx.from_pandas_edgelist(df_user_follower, source = "UserId", target = "FollowerUserId", create_using=nx.DiGraph())
print(nx.info(GUF))

Name: 
Type: DiGraph
Number of nodes: 253893
Number of edges: 765508
Average in degree:   3.0151
Average out degree:   3.0151


In [28]:
indeg=GUF.in_degree()
indeg = dict(indeg)
Network.get_top_nodes(indeg)

{4333519: 2756, 87114: 2136, 2357264: 1264, 4348347: 1212, 4956446: 894}

In [29]:
outdeg=GUF.out_degree()
outdeg = dict(indeg)
Network.get_top_nodes(indeg)

{4333519: 2756, 87114: 2136, 2357264: 1264, 4348347: 1212, 4956446: 894}

In [32]:
degree_centrality =nx.degree_centrality(GUF)
Network.get_top_nodes(degree_centrality)

{5309: 0.06633135348888504,
 71388: 0.054345942369196355,
 708283: 0.05205362910213792,
 1723677: 0.045716288815716925,
 54836: 0.04236446993209712}

In [33]:
pagerank_centrality =nx.pagerank(GUF)
Network.get_top_nodes(pagerank_centrality)

{4333519: 0.002126725378651298,
 87114: 0.00099114808114556,
 4956446: 0.0009757464270521383,
 3006723: 0.0006596276295753494,
 2357264: 0.00046691348480868794}

In [34]:
nx.set_node_attributes(GUF, name='in_degree_followers', values=indeg)
nx.set_node_attributes(GUF, name='out_degree_followers', values=outdeg)
nx.set_node_attributes(GUF, name='degree_cent_followers', values = degree_centrality)
nx.set_node_attributes(GUF, name='pagerank_cent_followers', values = pagerank_centrality)

# Graph: Team Members

In [35]:
df_user_team = pd.read_csv("../data/processed/Graph_Teams.csv")
df_user_team.head()

Unnamed: 0,UserId_1,UserId_2,weight
0,10000270,8243312,3
1,10000270,9829581,2
2,1000033,1003122,1
3,1000033,1214207,1
4,1000033,1581982,1


In [36]:
GUT = nx.from_pandas_edgelist(df_user_team, source = "UserId_1", target = "UserId_2",edge_attr=True, create_using=nx.Graph())
print(nx.info(GUT))

Name: 
Type: Graph
Number of nodes: 155730
Number of edges: 196736
Average degree:   2.5266


In [37]:
degrees=GUT.degree()
degrees = dict(degrees)
Network.get_top_nodes(degrees)

{54836: 133, 1192776: 110, 18463: 99, 111640: 92, 5309: 90}

In [38]:
degree_centrality =nx.degree_centrality(GUT)
Network.get_top_nodes(degree_centrality)

{54836: 0.0008540477367734975,
 1192776: 0.0007063552710156746,
 18463: 0.0006357197439141072,
 111640: 0.0005907698630312915,
 5309: 0.0005779270399219156}

In [39]:
eigenvector_centrality = nx.eigenvector_centrality(GUT,weight="weight")
Network.get_top_nodes(eigenvector_centrality)

{637722: 0.7050741444838826,
 2360956: 0.7050741444838826,
 43882: 0.017099873206441504,
 43896: 0.016979948853551405,
 43883: 0.0168582338863929}

In [40]:
nx.set_node_attributes(GUT, name='degree_teams', values=degrees)
nx.set_node_attributes(GUT, name='degree_cent_teams', values=degree_centrality)
nx.set_node_attributes(GUT, name='eigenvector_cent_teams', values = eigenvector_centrality)

# Graph: User Forum Messages

In [41]:
df_user_forum_messages = pd.read_csv("../data/processed/Graph_ForumMessages.csv")
df_user_forum_messages.head()

Unnamed: 0,UserId_1,UserId_2,weight
0,10000014,10030651,7
1,10000014,10135089,1
2,10000014,10275991,4
3,10000014,10352854,1
4,10000014,10454992,1


In [42]:
GUM = nx.from_pandas_edgelist(df_user_forum_messages, source = "UserId_1", target = "UserId_2",edge_attr=True, create_using=nx.Graph())
print(nx.info(GUM))

Name: 
Type: Graph
Number of nodes: 101284
Number of edges: 4306750
Average degree:  85.0430


In [43]:
degrees=GUM.degree()
degrees = dict(degrees)
Network.get_top_nodes(degrees)

{3012786: 15622, 75976: 9998, 1723677: 9822, 898111: 6756, 3399844: 6580}

In [44]:
degree_centrality =nx.degree_centrality(GUM)
Network.get_top_nodes(degree_centrality)

{3012786: 0.1542410868556421,
 75976: 0.09871350572159199,
 1723677: 0.09697580047984361,
 898111: 0.06670418530256805,
 3399844: 0.06496648006081969}

In [45]:
eigenvector_centrality = nx.eigenvector_centrality(GUM,weight="weight")
Network.get_top_nodes(eigenvector_centrality)

{3012786: 0.3803401654659919,
 3399844: 0.24816810564171174,
 3490494: 0.24719264760265058,
 5274255: 0.20270009941050648,
 4956446: 0.1942832969748979}

In [46]:
nx.set_node_attributes(GUM, name='degree_forums', values=degrees)
nx.set_node_attributes(GUM, name='degree_cent_forums', values=degree_centrality)
nx.set_node_attributes(GUM, name='eigenvector_cent_forums', values = eigenvector_centrality)

# Gephi Export

In [47]:
nx.write_gexf(GUF, r"../data/gephi/GUF.gexf")
nx.write_gexf(GUT, r"../data/gephi/GUT.gexf")
nx.write_gexf(GUM, r"../data/gephi/GUM.gexf")

# To Dataframe

In [48]:
df_GUF = pd.DataFrame.from_dict(dict(GUF.nodes(data=True)), orient='index').reset_index()
df_GUF = df_GUF.rename(columns = {'index':'UserId'})

df_GUT = pd.DataFrame.from_dict(dict(GUT.nodes(data=True)), orient='index').reset_index()
df_GUT = df_GUT.rename(columns = {'index':'UserId'})

df_GUM = pd.DataFrame.from_dict(dict(GUM.nodes(data=True)), orient='index').reset_index()
df_GUM = df_GUM.rename(columns = {'index':'UserId'})

df_GUF["UserId"] = df_GUF["UserId"].astype(str)
df_GUT["UserId"] = df_GUT["UserId"].astype(str)
df_GUM["UserId"] = df_GUM["UserId"].astype(str)

In [49]:
df_users = pd.read_csv(r"../data/interim/Users.csv",dtype=str)
df_users.head()

Unnamed: 0,UserId,UserName,DisplayName,PerformanceTier,Flg_Creacion
0,368,antgoldbloom,Anthony Goldbloom,2,5Y
1,381,iguyon,Isabelle,2,5Y
2,389,anandjeyahar,Anand Jeyahar,1,5Y
3,392,jmeynet,Julien Meynet,3,5Y
4,394,m4xl1n,m4xl1n,1,5Y


In [50]:
df_users_net = df_users.merge(df_GUF,how="left",on=["UserId"]).merge(df_GUT,how="left",on=["UserId"]).merge(df_GUM,how="left",on=["UserId"])
df_users_net.head()

Unnamed: 0,UserId,UserName,DisplayName,PerformanceTier,Flg_Creacion,in_degree_followers,out_degree_followers,degree_cent_followers,pagerank_cent_followers,degree_teams,degree_cent_teams,eigenvector_cent_teams,degree_forums,degree_cent_forums,eigenvector_cent_forums
0,368,antgoldbloom,Anthony Goldbloom,2,5Y,6.0,6.0,0.003895,3e-06,4.0,2.6e-05,9.451807999999999e-26,887.0,0.008758,0.001207
1,381,iguyon,Isabelle,2,5Y,0.0,0.0,2.4e-05,3e-06,,,,113.0,0.001116,5.2e-05
2,389,anandjeyahar,Anand Jeyahar,1,5Y,0.0,0.0,8e-06,3e-06,1.0,6e-06,1.171515e-26,59.0,0.000583,3.3e-05
3,392,jmeynet,Julien Meynet,3,5Y,0.0,0.0,4e-06,3e-06,,,,25.0,0.000247,1e-06
4,394,m4xl1n,m4xl1n,1,5Y,,,,,,,,,,


In [55]:
Validator.validar_numerico(df_users_net,df_users_net.columns[5:])

Unnamed: 0,Variable,n,Missing,%Missing,Negativo,%Negativo,Unico,Media,Min,Max,IQRMAX,3STD,P50,P75,P90,P95,P99,Outlier IQR,%Outlier IQR,Outlier 3SD,%Outlier 3SD,Outlier P90,%Outlier P90,Outlier P95,%Outlier P95,Outlier P99,%Outlier P99
0,in_degree_followers,194147,112354,57.87,0,0.0,248,5.13,0.0,2756.0,8.5,64.96,2.0,4.0,11.0,20.0,54.0,10991,5.66,607,0.31,7967,4.1,3856,1.99,813,0.42
0,out_degree_followers,194147,112354,57.87,0,0.0,248,5.13,0.0,2756.0,8.5,64.96,2.0,4.0,11.0,20.0,54.0,10991,5.66,607,0.31,7967,4.1,3856,1.99,813,0.42
0,degree_cent_followers,194147,112354,57.87,0,0.0,596,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9732,5.01,234,0.12,7905,4.07,3985,2.05,815,0.42
0,pagerank_cent_followers,194147,112354,57.87,0,0.0,37369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12103,6.23,181,0.09,7718,3.98,4090,2.11,818,0.42
0,degree_teams,194147,155035,79.85,0,0.0,70,3.33,1.0,133.0,8.5,16.57,2.0,4.0,6.0,10.0,23.0,2392,1.23,701,0.36,3841,1.98,1634,0.84,361,0.19
0,degree_cent_teams,194147,155035,79.85,0,0.0,70,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2392,1.23,701,0.36,3841,1.98,1634,0.84,361,0.19
0,eigenvector_cent_teams,194147,155035,79.85,0,0.0,13350,0.0,0.0,0.71,0.0,0.02,0.0,0.0,0.0,0.0,0.0,9671,4.98,7,0.0,3912,2.01,1956,1.01,392,0.2
0,degree_forums,194147,92863,47.83,0,0.0,1619,85.04,2.0,15622.0,200.0,733.41,30.0,86.0,189.0,282.0,920.17,8761,4.51,1498,0.77,10088,5.2,5061,2.61,1013,0.52
0,degree_cent_forums,194147,92863,47.83,0,0.0,1619,0.0,0.0,0.15,0.0,0.01,0.0,0.0,0.0,0.0,0.01,8761,4.51,1498,0.77,10138,5.22,5061,2.61,1013,0.52
0,eigenvector_cent_forums,194147,92863,47.83,0,0.0,63858,0.0,0.0,0.38,0.0,0.01,0.0,0.0,0.0,0.0,0.0,10522,5.42,481,0.25,10129,5.22,5065,2.61,1013,0.52


In [56]:
# Completamos con -1 todos los valores nulos para las 3 metricas
df_users_net.fillna(-1,inplace=True)

In [57]:
#Dataset Networks
df_users_net.to_csv(r"../data/processed/UserNetworksMetrics.csv",index=False)