# Parte 1: Dados - Users_Followers

In [19]:
import pandas as pd
import numpy as np
import re

def fs_to_dataframe(fs):
    return pd.DataFrame({'User': re.split('\W+', str(fs))})


In [20]:
# load dataset

gh_users_followers = pd.read_csv('../data/users__followers.csv')
gh_users_followers.head()

Unnamed: 0,User,All_Followers,nF
0,tarruda,"Sannis,danielmahon,csjaba,FergusRedican,Victor...",570
1,mairatma,"brunocoelho,henvic,eduardolundgren,aperrelli,a...",363
2,joselitojunior1,"renatooliveira,jeffesonmaia,jotaefe,duartefq,J...",350
3,marcelcaraciolo,"thiagoarrais,brunojm,henriquebastos,macndesign...",330
4,luanfonceca,"brunohenrique,luizvarela,gladson,lucasbibiano,...",301


In [21]:
# normalizing

gh_users_followers['All_Followers'] = gh_users_followers['All_Followers'].fillna('')
gh_users_followers['All_Followers'] = gh_users_followers['All_Followers'].apply(fs_to_dataframe) # apenas para simplificar operacoes 

gh_users_followers.head()

Unnamed: 0,User,All_Followers,nF
0,tarruda,User 0 S...,570
1,mairatma,User 0 brunocoelho ...,363
2,joselitojunior1,User 0 renatooliveira 1 ...,350
3,marcelcaraciolo,User 0 thiagoarrais 1 ...,330
4,luanfonceca,User 0 brunohenrique 1 ...,301


In [22]:
# Dataframe incluindo colunas contendo apenas seguidores recifenses

gh_users_followers['Followers'] = gh_users_followers['All_Followers'].apply(pd.DataFrame({'User':gh_users_followers['User']}).merge)

gh_users_followers['nFs'] = gh_users_followers['Followers'].apply(len)

gh_users_followers.head()

Unnamed: 0,User,All_Followers,nF,Followers,nFs
0,tarruda,User 0 S...,570,User 0 henriquemenezes 1 paul...,5
1,mairatma,User 0 brunocoelho ...,363,User 0 simoneas02 1 ...,19
2,joselitojunior1,User 0 renatooliveira 1 ...,350,User 0 luanfonceca 1 rena...,23
3,marcelcaraciolo,User 0 thiagoarrais 1 ...,330,User 0 luanfonceca 1 ...,28
4,luanfonceca,User 0 brunohenrique 1 ...,301,User 0 deividazevedo2 1 ...,9


In [23]:
# foco na visualização dos Followers recifenses 
# (mantendo no DataFrame todos os seguidores(recifenses e não recifenses))

gh_users_followers = gh_users_followers.loc[:, ['User', 'Followers', 'nFs', 'All_Followers', 'nF']]
gh_users_followers.sort_values('nFs', axis=0, ascending=False, inplace=True)

gh_users_followers.head()

Unnamed: 0,User,Followers,nFs,All_Followers,nF
12,fernandocastor,User 0 joselitojunior1 1 ...,48,User 0 guiocavalcanti ...,129
8,filipeximenes,User 0 luanfonceca ...,40,User 0 luisgabr...,177
7,simoneas02,User 0 talitaolive...,37,User 0 ElsonBarcelos ...,250
9,renatooliveira,User 0 joselitojunior1 1 ...,35,User 0 adrianomelo 1 ...,166
26,lailsonbm,User 0 luanfonceca 1 ...,29,User 0 kraudio 1 ...,73


In [24]:
# todos os usuarios recifenses (nem todos serao considerados como nos do graph - ver Parte 2) 
nodes_preview = gh_users_followers.loc[:,['User','nFs', 'nF']]
nodes_preview.columns = ['Id', 'In-Degree', 'All-In-Degree']

nodes_preview.head()

Unnamed: 0,Id,In-Degree,All-In-Degree
12,fernandocastor,48,129
8,filipeximenes,40,177
7,simoneas02,37,250
9,renatooliveira,35,166
26,lailsonbm,29,73


In [25]:
# criacao das arestas

def create_edges_df(batches):
    return pd.concat(batches,
        ignore_index=True)

pairs = zip(gh_users_followers.User, gh_users_followers.Followers)

edges = [pd.DataFrame([{'Source':str(f), 'Target':str(u)} for f in fs['User']]) for (u,fs) in pairs]

edges_df = create_edges_df(edges)
edges_df.head()

Unnamed: 0,Source,Target
0,joselitojunior1,fernandocastor
1,filipeximenes,fernandocastor
2,renatooliveira,fernandocastor
3,henvic,fernandocastor
4,fjsj,fernandocastor


In [26]:
# formatando

gh_users_followers['Followers'] = gh_users_followers['Followers'].apply(lambda x: x.apply(','.join))
gh_users_followers['All_Followers'] = gh_users_followers['All_Followers'].apply(lambda x: x.apply(','.join))

gh_users_followers.head()

Unnamed: 0,User,Followers,nFs,All_Followers,nF
12,fernandocastor,"joselitojunior1,filipeximenes,renatooliveira,h...",48,"guiocavalcanti,adrianomelo,tacsio,x8lucas8x,fj...",129
8,filipeximenes,"luanfonceca,gileno,renatooliveira,luiztiago,fe...",40,"luisgabriel,renatooliveira,marciobarbosa,anton...",177
7,simoneas02,"talitaoliveira,ktquez,pcstl,karlafalcao,kessia...",37,"ElsonBarcelos,IsabelaDePaula,sergiockd,douglas...",250
9,renatooliveira,"joselitojunior1,luanfonceca,gileno,filipeximen...",35,"adrianomelo,x8lucas8x,luisgabriel,joselitojuni...",166
26,lailsonbm,"luanfonceca,gileno,filipeximenes,renatooliveir...",29,"kraudio,lucasmncastro,chillicoder,lmarinho,gvc...",73


In [27]:
# gh_users_followers.to_csv('../data/recife/users__followers.csv', index=False)
# edges_df.to_csv('../data/recife/edges.csv', index=False)


# Parte 2: Análise dos Dados - Users_Followers

In [28]:
# numero de usuarios recifenses 
gh_users_followers['User'].count()

1992

In [29]:
# numero de usuarios recifenses com seguidores recifenses 
users_0_fs = [str(n) for n in gh_users_followers.User[gh_users_followers['nFs'] != 0]]
len(users_0_fs)

692

In [30]:
# numero de usuarios recifenses sem seguidores recifenses
users_0_fs = [str(n) for n in gh_users_followers.User[gh_users_followers['nFs'] == 0]]
len(users_0_fs)

1300

In [31]:
# numero de usuarios recifenses com seguidores (recifenses ou nao)
users_0_fs = [str(n) for n in gh_users_followers.User[gh_users_followers['nF'] != 0]]
len(users_0_fs)

1103

In [32]:
# numero de usuarios recifenses sem seguidores (recifenses ou nao)
users_0_fs = [str(n) for n in gh_users_followers.User[gh_users_followers['nF'] == 0]]
len(users_0_fs)

889

**- Número de nós considerados**

Note que como estes nós pertencem a uma aresta, eles representam os usuários que tem seguidores e/ou seguem alguém.

In [33]:
# (usuarios recifenses, excluidos os que não tem seguidores se e somente se não seguem ninguem)
len(pd.unique(edges_df[['Source','Target']].values.ravel()))

828

In [34]:
# numero de arestas (somatorio do total de seguidores por usuario)
gh_users_followers.nFs.sum()

2320

In [36]:
# revendo o numero de arestas
edges = pd.read_csv('../data/recife/edges.csv')
edges.count()

source    2320
target    2320
dtype: int64

In [37]:
# distribuicao do Grau
gh_users_followers.nFs.value_counts()

0     1300
1      310
2      142
3       67
4       42
5       27
7       19
6       18
8       14
9       14
11       6
14       5
12       3
23       3
20       2
13       2
18       2
10       2
15       2
24       2
25       2
29       1
37       1
35       1
16       1
28       1
19       1
40       1
48       1
Name: nFs, dtype: int64

In [47]:
# distribuicao do grau - usuarios por ocorrencias da quantidade de seguidores 
groupby_nfs = gh_users_followers.groupby('nFs')
groupby_nfs['User'].apply( ','.join)

nFs
0     eliene-mb,anniewalker,lucasvsr,nataliaalves,th...
1     filipe-torres,ericksantana,Kassio-Ferreira,koo...
2     sudorafa,gilesv,andreneto,marcoshmendes,tonnyv...
3     luhanlacerda,walber,mauLeal,tcostam,bebetocf,v...
4     vmms16,wwcoderecife,andreldm,lhew,brunovpl,cit...
5     favasconcelos,alinedoleron,djalmaafilho,Master...
6     vanessa,gcaraciolo,luizlago,omailson,rvlb-19,L...
7     pedroqueiroz,t0rr3sp3dr0,leticiamachado,felipe...
8     iagobelo,dvro,nielsonsantana,tomersimis,JoaoGF...
9     patrickrbc,luanfonceca,jeftarmascarenhas,ovict...
10                              miguelarauj1o,Cisneiros
11    lmarinho,leopoldomt,karlafalcao,irgmedeiros,br...
12           brunnogomes,jordanamorais,marcellustavares
13                            brunofarache,alexpessoajr
14    vinicius3w,thiagodiniz,diegonvs,victorlaerte,a...
15                                interaminense,dakerfp
16                                           pauloborba
18                                        gv

# Parte 3: Métricas obtidas no Gephi

In [87]:
nodes_data = pd.read_csv('../data/recife/nodes_data.csv')

# nodes_data.columns = ['User', 'In_Degree', 'Out_Degree', 'Degree', 'Modularity_Class', 'Page_Rank', 'Eccentricity_Centrality', 'Closness_Centrality', 'Betweeness_Centrality', 'EigenVector_Centrality']
# nodes_data.to_csv('../data/recife/nodes_data.csv', index=False)

nodes_data.head()

Unnamed: 0,User,In_Degree,Out_Degree,Degree,Modularity_Class,Page_Rank,Eccentricity_Centrality,Closness_Centrality,Betweeness_Centrality,EigenVector_Centrality
0,joselitojunior1,23,2,25,6,0.005161,9,0.219025,7591.265971,0.30229
1,fernandocastor,48,7,55,6,0.01252,8,0.253469,37787.361925,0.751562
2,filipeximenes,40,32,72,0,0.014185,9,0.267709,40297.561868,1.0
3,renatooliveira,35,28,63,0,0.01055,9,0.258613,26763.575801,0.994672
4,henvic,25,15,40,5,0.011471,8,0.248752,34297.005176,0.585929


In [88]:
nodes_data.describe()

Unnamed: 0,In_Degree,Out_Degree,Degree,Modularity_Class,Page_Rank,Eccentricity_Centrality,Closness_Centrality,Betweeness_Centrality,EigenVector_Centrality
count,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0,828.0
mean,2.793478,2.793478,5.586957,7.602657,0.001208,6.55314,0.256711,1419.676329,0.042428
std,4.567072,4.667115,8.360292,6.902523,0.001698,4.938395,0.295489,4458.272707,0.107496
min,0.0,0.0,1.0,-3.0,0.000329,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,1.75,0.000444,1.0,0.127562,0.0,0.00129
50%,1.0,1.0,3.0,6.0,0.000682,9.0,0.191944,0.0,0.006063
75%,3.0,3.0,6.0,12.0,0.001235,11.0,0.224246,692.926692,0.029582
max,48.0,46.0,72.0,33.0,0.020541,14.0,1.0,40894.607104,1.0


**Legenda Modularity_Class**. Colocamos:  
- \-2 para as "comunidades" de tamanho 2
- \-3 para as "comunidades" de tamanho 3
