# Detección de Comunidades en Twitter

## Cargar el archivo
Se extrajeron, mediante la solución GNIP, tweets del año 2016 en los que se menciona al BBVA.

In [None]:
# openpyxl is installed so the .xlsx file can be read later in the notebook.
# The explicit %pip magic is used because a bare `pip ...` line only works
# through IPython automagic (single-line cells, automagic enabled).
%pip install openpyxl

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import networkx as nx
from networkx.algorithms import community
import time

In [None]:
# Load the tweet export (.xlsx) into a DataFrame and preview its first rows.
df = pd.read_excel('../data/Tweets_BBVA.xlsx')
df.head()

Revisamos la información del DataFrame

In [None]:
# Print column names, non-null counts and dtypes of the loaded DataFrame.
df.info()

In [None]:
# Edge list #1: keep only the rows where 'actor.preferredUsername' is present,
# restricted to the two username columns.
has_actor = df['actor.preferredUsername'].notnull()
g1 = df.loc[has_actor, ['preferredUsername', 'actor.preferredUsername']]
g1.head()

In [None]:
# Edge list #2 (raw): rows that carry at least one user mention, keeping the
# username and the still comma-separated mentions column.
has_mentions = df['user_mention_screen_names'].notnull()
g2 = df.loc[has_mentions, ['preferredUsername', 'user_mention_screen_names']]
g2.head()

In [None]:
# Split the comma-separated mentions and emit one row per mentioned account.
# explode() repeats the original index label for every element, so the result
# can be joined back onto g2 by index. dropna() discards rows whose value
# could not be split (non-string cells).
# This replaces `.apply(pd.Series, 1).stack()`: the stray positional `1` was
# bound to Series.apply's `convert_dtype` argument, and building one Series
# per row is slow.
s = g2['user_mention_screen_names'].str.split(',').explode().dropna()
# Name the Series after the target column used by the g1 edge list so both
# edge lists share the same schema.
s.name = 'actor.preferredUsername'

In [None]:
# Remove the raw comma-separated column from g2 (in place); the individual
# mentions are already held in s.
del g2['user_mention_screen_names']

In [None]:
# Index-aligned join: each g2 row is repeated once for every entry of s that
# shares its index label, yielding one (user, mentioned user) pair per row.
g2=g2.join(s)

In [None]:
# Stack both edge lists into one table, then discard repeated
# (source, target) pairs so each edge appears once.
h = pd.concat([g1, g2])
h = h.drop_duplicates()
h.info()

Transformamos el DataFrame en un Grafo Dirigido

In [None]:
# Build a directed graph: every row of h becomes an edge going from the
# 'preferredUsername' column to the 'actor.preferredUsername' column.
G = nx.from_pandas_edgelist(
    h,
    source='preferredUsername',
    target='actor.preferredUsername',
    create_using=nx.DiGraph(),
)
# nx.info() was removed in NetworkX 3.0, so the summary is printed explicitly;
# this works on both old and new NetworkX versions.
print(f"DiGraph with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")

Crear la función `get_top_nodes`, que devolverá los `num` valores más altos de un diccionario

In [None]:
def get_top_nodes(cdict, num=5):
    """Return the ``num`` highest-valued entries of ``cdict``.

    Args:
        cdict: Mapping of node -> comparable score (e.g. a centrality dict).
        num: Maximum number of entries to return. Values <= 0 yield an
            empty dict.

    Returns:
        A new dict with at most ``num`` items, ordered from the highest to
        the lowest value. Entries with equal values keep their original
        relative order because the sort is stable.
    """
    # The original looped `num` times but returned on the first pass, and
    # returned None when the loop never ran; always return a dict instead.
    if num <= 0:
        return {}
    ranked = sorted(cdict.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:num])

#### Grado

Guardar el grado de cada nodo en un diccionario

In [None]:
# Degree of every node in G (a view, converted to a dict where it is used).
gdeg=G.degree()

In [None]:
# Show the five nodes with the highest total degree (default num=5).
get_top_nodes(dict(gdeg))

#### In-Degree

In [None]:
# In-degree: number of incoming edges per node; show the five highest.
indeg=G.in_degree()
get_top_nodes(dict(indeg))

#### Out-Degree

In [None]:
# Out-degree: number of outgoing edges per node; show the five highest.
outdeg=G.out_degree()
get_top_nodes(dict(outdeg))

#### Degree Centrality

In [None]:
# Degree centrality per node; the value is also stored on each node under
# the 'dc' attribute before showing the five highest.
degree_centrality =nx.degree_centrality(G)
nx.set_node_attributes(G,degree_centrality, 'dc')
get_top_nodes(degree_centrality)

#### Betweenness

In [None]:
# time.process_time() reports CPU time of the current process (not
# wall-clock time), so the printed figure excludes time spent sleeping.
t0= time.process_time()

# Timed region: betweenness computation plus storing the result on each
# node under the 'bc' attribute.
betweenness_centrality = nx.betweenness_centrality(G)
nx.set_node_attributes(G,betweenness_centrality, 'bc')

t1 = time.process_time() - t0
print("Time elapsed: ", t1)

In [None]:
# Show the five nodes with the highest betweenness centrality.
get_top_nodes(betweenness_centrality)

#### Closeness

In [None]:
# Closeness centrality per node, also stored on each node as attribute 'cc'.
closeness_centrality =nx.closeness_centrality(G)
nx.set_node_attributes(G,closeness_centrality, 'cc')

In [None]:
# Show the five nodes with the highest closeness centrality.
get_top_nodes(closeness_centrality)

#### Eigenvector Centrality

In [None]:
# Eigenvector centrality per node (NetworkX default iteration limit and
# tolerance), also stored on each node as attribute 'ec'.
eigenvector_centrality = nx.eigenvector_centrality(G)
nx.set_node_attributes(G, eigenvector_centrality,'ec')

In [None]:
# Show the five nodes with the highest eigenvector centrality.
get_top_nodes(eigenvector_centrality)

#### PageRank Centrality

In [None]:
# PageRank score per node (NetworkX defaults), also stored on each node as
# attribute 'pr'.
pagerank_centrality =nx.pagerank(G)
nx.set_node_attributes(G, pagerank_centrality, 'pr')

In [None]:
# Show the five nodes with the highest PageRank score.
get_top_nodes(pagerank_centrality)

## Métricas de Grafo

#### Densidad

In [None]:
# Graph density: ratio of existing edges to the maximum possible number.
nx.density(G)

#### Average Local Clustering Coefficient

In [None]:
# Mean of the per-node clustering coefficients over the whole graph.
nx.average_clustering(G)

In [None]:
# Girvan-Newman community detection. The call returns a generator, so no
# partition is computed until it is iterated (e.g. with next()).
communities = community.centrality.girvan_newman(G)

In [None]:
# Pull the next partition from the Girvan-Newman generator and store each
# community as a plain list of nodes.
node_groups = list(map(list, next(communities)))

In [None]:
# Number of communities in the partition stored in node_groups.
len(node_groups)

## Pregunta
¿Cuáles son las principales diferencias en el cálculo de métricas entre grafos dirigidos y no dirigidos? ¿Cuáles se ejecutan más rápido? ¿Hay alguna que no se pueda calcular para uno de los tipos? (Explique en no más de 300 palabras)

Elaborado por Luis Cajachahua bajo licencia MIT (2021)