In [6]:
# Mathematical and Data Management
import numpy as np
import pandas as pd

# Graph Management
import graph_tool.all as gt
from utils import *

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Miscellaneous
import os
from glob import glob
from tqdm import tqdm
from datetime import datetime, timedelta
import concurrent.futures

In [7]:
# Paths of every 3-day GraphML file, sorted lexicographically
files = np.sort(glob('/mnt/disk2/Data/3_Day_Graphs/Graphs/*.graphml'))

# Political affiliation labels used throughout the analysis
categories = [
    'Izquierda',
    'Derecha',
    'Centro',
    'Sin Clasificar',
]

### Index of Proximity Between Groups

$$Prox_{j\rightarrow k}=\frac{W_{jk}}{(T_k/\sum_{m\in G} T_m)}$$

#### Measure $W_{jk}$

In [8]:
# Build W_jk: one row per (graph, node) holding the total outgoing edge weight
# from that node towards each political group.
#
# Rows are collected in a plain list and converted to a DataFrame once at the
# end. Growing the frame with pd.concat inside the loop re-copies every
# previously stored row on each iteration, which makes the cell quadratic in
# the total number of nodes.

# Column order of the resulting table. The spelling "Political Afilliation" is
# kept on purpose: it is the stored column name.
w_jk_columns = [
    "Ind", "Date", "Political Afilliation",
    "W_j_izquierda", "W_j_derecha", "W_j_centro",
    "W_j_sc", "W_j_mismo", "W_j_otros",
]
w_jk_records = []

for file in tqdm(files):
    # Load the graph; the date is the last "_"-separated token of the file name
    # (extension removed).
    g = gt.load_graph(file)
    graph_date = file.split('/')[-1].split('.')[0].split('_')[-1]

    n_individuos = g.num_vertices()
    # Look the property maps up once per graph instead of once per vertex/edge.
    label_map = g.vp["Political Label"]
    weight_map = g.ep["Normal Weight"]
    political_labeling = np.array([label_map[j] for j in range(n_individuos)])

    # Dense weighted adjacency matrix: adj[s, t] holds the "Normal Weight" of
    # the edge s -> t (these are weights, not 0/1 indicators). If the same
    # (s, t) pair appears more than once, the last edge visited wins.
    # NOTE(review): this allocates n_individuos**2 float64 values per graph;
    # consider a sparse representation if memory becomes a problem.
    adj = np.zeros((n_individuos, n_individuos))
    for e in g.edges():
        adj[int(e.source()), int(e.target())] = weight_map[e]

    # Nothing to record for a graph without vertices.
    if n_individuos == 0:
        continue

    # The group masks depend only on the labelling, so compute them once per
    # graph rather than once per node.
    mask_izquierda = political_labeling == "Izquierda"
    mask_derecha = political_labeling == "Derecha"
    mask_centro = political_labeling == "Centro"
    mask_sc = political_labeling == "Sin Clasificar"
    # Masks for "same label as node k", built lazily and reused across nodes.
    same_label_masks = {}

    for k in range(n_individuos):
        # Weights of all edges leaving node k (row k of the adjacency matrix).
        enlaces_jk = adj[k, :]
        # Political affiliation of node k.
        label_k = political_labeling[k]

        mask_mismo = same_label_masks.get(label_k)
        if mask_mismo is None:
            mask_mismo = political_labeling == label_k
            same_label_masks[label_k] = mask_mismo

        # Total weight from k to left, right, centre and unclassified nodes,
        # to its own group ("mismo") and to every other group ("otros").
        w_jk_records.append({
            "Ind": k,
            "Date": graph_date,
            "Political Afilliation": label_k,
            "W_j_izquierda": enlaces_jk[mask_izquierda].sum(),
            "W_j_derecha": enlaces_jk[mask_derecha].sum(),
            "W_j_centro": enlaces_jk[mask_centro].sum(),
            "W_j_sc": enlaces_jk[mask_sc].sum(),
            "W_j_mismo": enlaces_jk[mask_mismo].sum(),
            "W_j_otros": enlaces_jk[~mask_mismo].sum(),
        })

# A single construction yields a fresh 0..N-1 index, so no reset_index is needed.
W_jk = pd.DataFrame(w_jk_records, columns=w_jk_columns)

 11%|█▏        | 7/61 [31:00<5:09:37, 344.03s/it]

In [None]:
# List the entries of the shared data directory; the relative path is resolved
# against the current working directory.
os.listdir("../../../Data")

['Daily_graphs',
 'ACLED Colombia (2018-01-01-2023-10-31).csv',
 'nodes.csv',
 'RawData',
 'Pickle',
 'retweeted_text.gzip',
 'W_jk.gzip',
 'count_tweets.gzip',
 'Tweets_DataFrames',
 '3_Day_Graphs']

In [None]:
# Persist the W_jk table as a gzip-compressed pickle so it can be reloaded
# later without recomputing it.
W_jk.to_pickle(path = "../../../Data/W_jk.gzip", compression = "gzip")

In [2]:
# Reload the gzip-compressed pickle of the W_jk table from disk.
# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
XD = pd.read_pickle("../../../Data/W_jk.gzip", compression='gzip')
# Show the loaded table as the cell output.
XD

Unnamed: 0,Ind,Date,Political Afilliation,W_j_izquierda,W_j_derecha,W_j_centro,W_j_sc,W_j_mismo,W_j_otros
0,0,2021-04-28,Sin Clasificar,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,1,2021-04-28,Izquierda,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,2,2021-04-28,Centro,0.000000,0.0,0.000000,1.000000,0.000000,1.000000
3,3,2021-04-28,Izquierda,0.772727,0.0,0.136364,0.090909,0.772727,0.227273
4,4,2021-04-28,Izquierda,0.862069,0.0,0.068966,0.068966,0.862069,0.137931
...,...,...,...,...,...,...,...,...,...
36959,36959,2021-04-28,Izquierda,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
36960,36960,2021-04-28,Derecha,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
36961,36961,2021-04-28,Sin Clasificar,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
36962,36962,2021-04-28,Izquierda,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
