# Libraries

In [1]:
import sys
import os
# Make the parent directory of the working directory and its `utils` folder
# importable from this notebook.
directory_path = os.path.abspath(os.path.join('..'))
utils_path = os.path.abspath(os.path.join('../utils'))
# Each path is checked independently, so re-running the cell never duplicates
# an entry and neither path is skipped when the other is already present.
for _path in (directory_path, utils_path):
    if _path not in sys.path:
        sys.path.append(_path)

In [2]:
# Third-party analysis and plotting stack.
import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation/dtype warnings — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Project helper modules from the local `utils` package.
# NOTE(review): wildcard imports hide where names come from, and a later module
# can silently shadow a name exported by an earlier one. `Validator` (used
# further down) presumably comes from utils.Validator — confirm and import
# names explicitly once the full list of used helpers is known.
from utils.Validator import *
from utils.Combinations import *
from utils.Network import *
# Do not truncate columns when displaying wide DataFrames.
pd.set_option('display.max_columns', None)
# Default figure size (width, height) for seaborn/matplotlib plots.
sns.set(rc={'figure.figsize':(11.7,8.27)})

# Data Consolidation

In [4]:
# Load the per-user network metrics (pagerank / eigenvector centrality columns)
# and preview the first rows.
df_net_metrics = pd.read_csv(
    r"../data/processed/UserNetworksMetrics.csv"
)
df_net_metrics.head()

Unnamed: 0,UserId,UserName,DisplayName,pagerank_cent_followers,eigenvector_cent_teams,eigenvector_cent_forums
0,368,antgoldbloom,Anthony Goldbloom,3e-06,9.451807999999999e-26,0.001207
1,381,iguyon,Isabelle,3e-06,-1.0,5.2e-05
2,389,anandjeyahar,Anand Jeyahar,3e-06,1.171515e-26,3.3e-05
3,392,jmeynet,Julien Meynet,3e-06,-1.0,1e-06
4,394,m4xl1n,m4xl1n,-1.0,-1.0,-1.0


In [6]:
# Load the per-user attribute table (the CTDCOMP* columns) and preview it.
df_user_att = pd.read_csv(
    r"../data/processed/UserAttributes.csv"
)
df_user_att.head()

Unnamed: 0,UserId,CTDCOMP,CTDCOMP_DS,CTDCOMP_Supervizado,CTDCOMP_NoSupervizado
0,1,1,1.0,0.0,1.0
1,62,1,0.0,0.0,0.0
2,368,101,63.0,44.0,62.0
3,381,64,2.0,1.0,2.0
4,387,1,0.0,0.0,0.0


In [5]:
# Load the per-user gold/silver/bronze totals and preview them.
# NOTE(review): this file is read from data/interim while the other two inputs
# come from data/processed — confirm this is the intended source.
df_user_achi = pd.read_csv(
    r"../data/interim/UserAchievements.csv"
)
df_user_achi.head()

Unnamed: 0,UserId,TotalGold_Comp,TotalSilver_Comp,TotalBronze_Comp,TotalGold_Scri,TotalSilver_Scri,TotalBronze_Scri,TotalGold_Disc,TotalSilver_Disc,TotalBronze_Disc,TotalGold_Data,TotalSilver_Data,TotalBronze_Data
0,368,0.0,0.0,0.0,0.0,3.0,13.0,14.0,13.0,139.0,1.0,3.0,6.0
1,381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,53.0,0.0,0.0,0.0
2,389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,392,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,394,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Left-join the attribute and achievement tables onto the network metrics by
# UserId, so every row of the network-metrics frame is kept even when the
# other tables have no matching user.
df_user_final = (
    df_net_metrics
    .merge(df_user_att, how="left", on=["UserId"])
    .merge(df_user_achi, how="left", on=["UserId"])
)
df_user_final.head()

Unnamed: 0,UserId,UserName,DisplayName,pagerank_cent_followers,eigenvector_cent_teams,eigenvector_cent_forums,CTDCOMP,CTDCOMP_DS,CTDCOMP_Supervizado,CTDCOMP_NoSupervizado,TotalGold_Comp,TotalSilver_Comp,TotalBronze_Comp,TotalGold_Scri,TotalSilver_Scri,TotalBronze_Scri,TotalGold_Disc,TotalSilver_Disc,TotalBronze_Disc,TotalGold_Data,TotalSilver_Data,TotalBronze_Data
0,368,antgoldbloom,Anthony Goldbloom,3e-06,9.451807999999999e-26,0.001207,101.0,63.0,44.0,62.0,0.0,0.0,0.0,0.0,3.0,13.0,14.0,13.0,139.0,1.0,3.0,6.0
1,381,iguyon,Isabelle,3e-06,-1.0,5.2e-05,64.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,53.0,0.0,0.0,0.0
2,389,anandjeyahar,Anand Jeyahar,3e-06,1.171515e-26,3.3e-05,17.0,13.0,12.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,392,jmeynet,Julien Meynet,3e-06,-1.0,1e-06,10.0,1.0,1.0,1.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,394,m4xl1n,m4xl1n,-1.0,-1.0,-1.0,8.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# List the columns after the first three (UserId, UserName, DisplayName in the
# merged preview); this same slice is what gets passed to the validator.
df_user_final.columns[3:]

Index(['pagerank_cent_followers', 'eigenvector_cent_teams',
       'eigenvector_cent_forums', 'CTDCOMP', 'CTDCOMP_DS',
       'CTDCOMP_Supervizado', 'CTDCOMP_NoSupervizado', 'TotalGold_Comp',
       'TotalSilver_Comp', 'TotalBronze_Comp', 'TotalGold_Scri',
       'TotalSilver_Scri', 'TotalBronze_Scri', 'TotalGold_Disc',
       'TotalSilver_Disc', 'TotalBronze_Disc', 'TotalGold_Data',
       'TotalSilver_Data', 'TotalBronze_Data'],
      dtype='object')

In [10]:
# Profile the metric/count columns (everything after the three leading
# identifier columns) before any null handling.
Validator.validar_numerico(
    df_user_final,
    df_user_final.columns[3:],
)

Unnamed: 0,Variable,n,Missing,%Missing,Negativo,%Negativo,Unico,Media,Min,Max,IQRMAX,3STD,P50,P75,P90,P95,P99,Outlier IQR,%Outlier IQR,Outlier 3SD,%Outlier 3SD,Outlier P90,%Outlier P90,Outlier P95,%Outlier P95,Outlier P99,%Outlier P99
0,pagerank_cent_followers,194147,0,0.0,112354,57.87,37295,-0.58,-1.0,0.0,1.5,0.9,-1.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,19412,10.0,7718,3.98,1942,1.0
0,eigenvector_cent_teams,194147,0,0.0,155035,79.85,13191,-0.8,-1.0,0.71,-1.0,0.4,-1.0,-1.0,0.0,0.0,0.0,39112,20.15,2,0.0,18209,9.38,9708,5.0,1942,1.0
0,eigenvector_cent_forums,194147,0,0.0,92863,47.83,63809,-0.48,-1.0,0.38,1.5,1.02,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,19408,10.0,9708,5.0,1942,1.0
0,CTDCOMP,194147,39320,20.25,0,0.0,225,8.49,1.0,994.0,22.0,50.32,4.0,10.0,20.0,30.0,65.0,12831,6.61,2771,1.43,14833,7.64,7736,3.98,1542,0.79
0,CTDCOMP_DS,194147,39320,20.25,0,0.0,179,6.33,0.0,277.0,16.0,38.24,3.0,7.0,15.0,24.0,51.0,14124,7.27,2982,1.54,15442,7.95,7414,3.82,1548,0.8
0,CTDCOMP_Supervizado,194147,39320,20.25,0,0.0,144,5.04,0.0,212.0,13.5,30.01,2.0,6.0,12.0,19.0,40.0,13650,7.03,2937,1.51,15278,7.87,7396,3.81,1502,0.77
0,CTDCOMP_NoSupervizado,194147,39320,20.25,0,0.0,172,6.07,0.0,267.0,16.0,36.69,3.0,7.0,15.0,23.0,50.0,13215,6.81,3088,1.59,14559,7.5,7420,3.82,1466,0.76
0,TotalGold_Comp,194147,0,0.0,0,0.0,30,0.04,0.0,60.0,0.0,1.37,0.0,0.0,0.0,0.0,1.0,3492,1.8,1108,0.57,3492,1.8,3492,1.8,1108,0.57
0,TotalSilver_Comp,194147,0,0.0,0,0.0,39,0.13,0.0,59.0,0.0,2.59,0.0,0.0,0.0,1.0,3.0,12656,6.52,2321,1.2,12656,6.52,4624,2.38,1408,0.73
0,TotalBronze_Comp,194147,0,0.0,0,0.0,33,0.13,0.0,47.0,0.0,2.18,0.0,0.0,0.0,1.0,3.0,14740,7.59,1995,1.03,14740,7.59,4616,2.38,1117,0.58


In [11]:
# Fill every null value with -1, then re-run the numeric profile on the
# metric/count columns to see the effect.
# The centrality columns already contain -1.0 values in the preview,
# presumably as the same "no data" marker — TODO confirm.
# Reassign rather than use inplace=True so the step is a plain transformation
# of the frame instead of a hidden in-place mutation.
df_user_final = df_user_final.fillna(-1)
Validator.validar_numerico(df_user_final,df_user_final.columns[3:])

Unnamed: 0,Variable,n,Missing,%Missing,Negativo,%Negativo,Unico,Media,Min,Max,IQRMAX,3STD,P50,P75,P90,P95,P99,Outlier IQR,%Outlier IQR,Outlier 3SD,%Outlier 3SD,Outlier P90,%Outlier P90,Outlier P95,%Outlier P95,Outlier P99,%Outlier P99
0,pagerank_cent_followers,194147,0,0.0,112354,57.87,37295,-0.58,-1.0,0.0,1.5,0.9,-1.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,19412,10.0,7718,3.98,1942,1.0
0,eigenvector_cent_teams,194147,0,0.0,155035,79.85,13191,-0.8,-1.0,0.71,-1.0,0.4,-1.0,-1.0,0.0,0.0,0.0,39112,20.15,2,0.0,18209,9.38,9708,5.0,1942,1.0
0,eigenvector_cent_forums,194147,0,0.0,92863,47.83,63809,-0.48,-1.0,0.38,1.5,1.02,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,19408,10.0,9708,5.0,1942,1.0
0,CTDCOMP,194147,0,0.0,39320,20.25,226,6.57,-1.0,994.0,18.5,45.64,3.0,8.0,17.0,27.0,59.0,17265,8.89,3473,1.79,18695,9.63,9307,4.79,1941,1.0
0,CTDCOMP_DS,194147,0,0.0,39320,20.25,180,4.85,-1.0,277.0,15.0,34.68,2.0,6.0,13.0,21.0,47.0,15442,7.95,3781,1.95,18705,9.63,9291,4.79,1884,0.97
0,CTDCOMP_Supervizado,194147,0,0.0,39320,20.25,145,3.81,-1.0,212.0,12.5,27.27,1.0,5.0,11.0,17.0,37.0,15278,7.87,3668,1.89,17240,8.88,8928,4.6,1821,0.94
0,CTDCOMP_NoSupervizado,194147,0,0.0,39320,20.25,173,4.64,-1.0,267.0,12.5,33.28,2.0,5.0,13.0,20.0,45.0,19656,10.12,3687,1.9,17676,9.1,9377,4.83,1900,0.98
0,TotalGold_Comp,194147,0,0.0,0,0.0,30,0.04,0.0,60.0,0.0,1.37,0.0,0.0,0.0,0.0,1.0,3492,1.8,1108,0.57,3492,1.8,3492,1.8,1108,0.57
0,TotalSilver_Comp,194147,0,0.0,0,0.0,39,0.13,0.0,59.0,0.0,2.59,0.0,0.0,0.0,1.0,3.0,12656,6.52,2321,1.2,12656,6.52,4624,2.38,1408,0.73
0,TotalBronze_Comp,194147,0,0.0,0,0.0,33,0.13,0.0,47.0,0.0,2.18,0.0,0.0,0.0,1.0,3.0,14740,7.59,1995,1.03,14740,7.59,4616,2.38,1117,0.58
