# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [1]:
# Mathematical and Data Managment
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Graph Managment
import graph_tool.all as gt
from utils.subutils.Functions import *

# Miscellaneous
import pickle
import os
from tqdm import tqdm
from glob import glob



## Retweet network with a 3-day rolling window

In [2]:
save_path = "../../../Data/3_Day_Graphs"

In [3]:
# Load Tweets
tweets = pd.read_pickle('../../../Data/Tweets_DataFrames/tweets_lite.gzip', compression='gzip')
print(f'Cantidad de filas (Tweets): {tweets.shape[0]:,}. Cantidad de Columnas: {tweets.shape[1]:,}')
print('Total usuarios: ' + f"{len(set(tweets['Author ID']).union(set(tweets['Referenced Tweet Author ID']))):,}")
tweets = tweets.astype({
    'Author ID': float,
    'Referenced Tweet Author ID': float
})
tweets.head()

Cantidad de filas (Tweets): 45,330,718. Cantidad de Columnas: 5
Total usuarios: 6,004,544


Unnamed: 0,Author ID,Author Name,Date,Reference Type,Referenced Tweet Author ID
0,138377765.0,hmauriciojg,2021/06/28 16:05:23,replied_to,134855300.0
1,138377765.0,hmauriciojg,2021/06/28 13:10:41,replied_to,1131821000.0
2,138377765.0,hmauriciojg,2021/06/27 19:06:38,replied_to,62337500.0
3,138377765.0,hmauriciojg,2021/06/22 16:12:03,replied_to,576647400.0
4,138377765.0,hmauriciojg,2021/06/21 22:18:04,replied_to,126832600.0


In [4]:
# We will just get the Retweets
retweets_total = tweets[tweets['Reference Type'] == 'retweeted']
retweets_total = retweets_total.drop(columns=['Reference Type'])
retweets_total["Date"] = pd.to_datetime(retweets_total["Date"]).dt.date
#retweets_total = retweets_total.sort_values('Referenced Tweet Author ID').reset_index(drop = True)

users = set(retweets_total['Author ID']).union(set(retweets_total['Referenced Tweet Author ID']))
print(f'Tenemos: {retweets_total.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')
del tweets, users

# This will be our Sorce-Target List. Will include the weights of each tweets
retweets_total.head()

Tenemos: 30,918,011 Retweets y 624,358 Usario con Rts o Retwiteados


Unnamed: 0,Author ID,Author Name,Date,Referenced Tweet Author ID
92,788250746.0,Laura_Milena98,2021-06-28,142491200.0
95,788250746.0,Laura_Milena98,2021-06-26,261704700.0
99,788250746.0,Laura_Milena98,2021-06-24,8.305394e+17
100,788250746.0,Laura_Milena98,2021-06-24,56713270.0
102,788250746.0,Laura_Milena98,2021-06-23,1.140705e+18


In [5]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets_total["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets_total["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes)):,} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

affilliation_df = pd.DataFrame(list(user_to_party_paro.items()), columns=['User ID', 'Political Affiliation'])
affilliation_df['Political Affiliation'].value_counts(dropna=False)

Faltan clasificar 587,246 usuarios


Political Affiliation
NaN            587246
Izquierda       23138
Derecha          6812
No Retweets      3844
Centro           3543
Name: count, dtype: int64

In [6]:
# Create Master Id
#del affilliation_df,ids_faltantes, ids_faltantes1, ids_faltantes2, usuario
users_information = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/users_information.gzip', compression='gzip')
master_id = users_information.reset_index()[['Author ID', 'Author Name']]
master_id = master_id.rename(columns={
    'Author ID': 'User ID',
    'Author Name': 'Label'
})

master_id['Political Affiliation'] = master_id['User ID'].apply(lambda x: user_to_party_paro[x])
print(f"Duplicated IDs: {sum(master_id.duplicated(subset = 'User ID'))}")
master_id = master_id.drop_duplicates(subset = 'User ID')
master_id

Duplicated IDs: 7


Unnamed: 0,User ID,Label,Political Affiliation
0,0.000000e+00,0,No Retweets
1,1.000000e+00,0,No Retweets
2,2.000000e+00,0,No Retweets
3,3.000000e+00,0,No Retweets
5,4.000000e+00,0,No Retweets
...,...,...,...
37339,1.389722e+18,Neoplasticista,Izquierda
37340,1.389737e+18,JC13177979,Derecha
37341,1.389741e+18,JhonatanVRojo,No Retweets
37342,1.389769e+18,VaneLen18,Izquierda


In [7]:
# Adding Source Labels
temp = retweets_total.merge(master_id, how = 'inner', left_on = 'Author ID', right_on='User ID', validate = 'm:1')
temp = temp.rename(columns = {'Label': 'Source Label', 'Political Affiliation': 'Source PA'}).drop(columns='User ID')

# Adding Target Labels
retweets = temp.merge(master_id, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='User ID', validate = 'm:1')
retweets = retweets.rename(columns = {'Label': 'Target Label', 'Political Affiliation': 'Target PA'})

retweets = retweets.drop(columns= ['User ID'])

# Count how many tweets and users we have now
users_rts = list(set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID'])))
non_singletons = master_id[master_id['User ID'].isin(users_rts)] # Nodes conected by a Rt
singletons = master_id[~master_id['User ID'].isin(users_rts)] # Nodes that aren't in Rts during Paro (Perhaps only Twitted or Retwitted with replies) 
print(f'Tenemos: {retweets.shape[0]:,} Retweets y {len(non_singletons):,} Usuarios con Rts o Retwiteados. Usuarios sin Retweets {len(singletons)}')
retweets.head()

Tenemos: 13,784,585 Retweets y 36,964 Usuarios con Rts o Retwiteados. Usuarios sin Retweets 373


Unnamed: 0,Author ID,Author Name,Date,Referenced Tweet Author ID,Source Label,Source PA,Target Label,Target PA
0,788250700.0,Laura_Milena98,2021-06-22,270668814.0,Laura_Milena98,Izquierda,gabocifuentes,Centro
1,126523600.0,sandrapulga,2021-05-15,270668814.0,sandrapulga,Izquierda,gabocifuentes,Centro
2,8.996701e+17,rchicave,2021-06-15,270668814.0,rchicave,Izquierda,gabocifuentes,Centro
3,8.996701e+17,rchicave,2021-06-07,270668814.0,rchicave,Izquierda,gabocifuentes,Centro
4,8.996701e+17,rchicave,2021-05-08,270668814.0,rchicave,Izquierda,gabocifuentes,Centro


In [8]:
# Save this Dataframe for Nodes List
master_id = non_singletons # Aislamos a los singletons del Master ID
master_id = master_id.astype({
    'User ID': int,
    'Label': str,
    'Political Affiliation': str
}).reset_index(drop=True)

id_fecha_counts = retweets.groupby('Author ID')['Date'].nunique().reset_index(name='Fechas Source').rename(columns={'Author ID': 'User ID'})
master_id = master_id.merge(id_fecha_counts, on = 'User ID', how='left')
id_fecha_counts = retweets.groupby('Referenced Tweet Author ID')['Date'].nunique().reset_index(name='Fechas Target').rename(columns={'Referenced Tweet Author ID': 'User ID'})
master_id = master_id.merge(id_fecha_counts, on = 'User ID', how='left')

master_id['Fechas Source'] = master_id['Fechas Source'].fillna(0).astype(int)
master_id['Fechas Target'] = master_id['Fechas Target'].fillna(0).astype(int)

master_id = master_id.reset_index().rename(columns = {'index': 'ID'})
id_to_label = master_id.set_index('User ID').to_dict()['Label']

master_id = master_id[['ID', 'Label', 'Political Affiliation', 'User ID', 'Fechas Source', 'Fechas Target']]

# Save
master_id.to_csv(os.path.join(save_path, 'Nodes' + ".csv"), index = False, sep = ";")
master_id

Unnamed: 0,ID,Label,Political Affiliation,User ID,Fechas Source,Fechas Target
0,0,0,No Retweets,12,0,9
1,1,alerios,Izquierda,12996,57,6
2,2,ranaberden,Centro,777978,17,4
3,3,jsanti,Izquierda,784125,57,2
4,4,thisgoblin,Izquierda,1061601,44,7
...,...,...,...,...,...,...
36959,36959,Neoplasticista,Izquierda,1389721694961651712,16,1
36960,36960,JC13177979,Derecha,1389737202742071296,34,18
36961,36961,JhonatanVRojo,No Retweets,1389741234370064384,5,0
36962,36962,VaneLen18,Izquierda,1389769251704147968,30,2


In [9]:
# Insights
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')

print(f'primer nodo: {min(users_rts)} último nodo: {max(users_rts):,}')
print(f'Total de nodos en conectados: {len(users_rts):,}')
print(f'Total de nodos Singletons: {len(singletons):,}')
print(f'Total de usuarios que TWITTEARON Durante el paro: {len(singletons) + len(non_singletons):,}')

# del nodes_no_receipt, nodes_no_send, self_loops, users_rts

Nodos Retweeteados por alguien pero que no Retweetean: 6,510
Nodos que Retweetean a alguien pero no son Retweeteados: 834
Nodos que tienen un self loop: 8
primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en conectados: 36,964
Total de nodos Singletons: 373
Total de usuarios que TWITTEARON Durante el paro: 37,337


In [10]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Nodes.csv', sep = ';')

# Useful Dicts
idx_to_id = master_id['User ID'].reset_index().to_dict()['User ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Dict for lookup in Graph
id_to_idx = {id:idx for idx, id in idx_to_id.items()}

# dict for color
color = {
    'Izquierda': 'blue',
    'Derecha': 'red',
    'Centro': 'green',
    'No Retweets': 'gray'
}

In [78]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-05-01 23:59:59'
v2_end = '2021-06-30 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

# date_start = date_start[:1]
# date_end = date_end[:1]

# Runtime 5 minutes
for d_s, d_e in tqdm(zip(date_start, date_end)):
    # Get 3 days Retweets
    window = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] < d_e.date())] 

    # Get total of Tweets during that period
    rts_per_user= window.groupby('Author ID').size().reset_index(name = 'total')

    # Create Normalize Weight
    temp = window.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number of rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal weight'] = temp['number of rts']/temp['total']
    temp['normal weight'] = temp['normal weight'].round(3)
    temp.drop(columns='total', inplace=True)

    temp.columns = ["source_user_id", "target_user_id", "Number of rts", "Normal weight"]

    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: id_to_idx[x])
    temp['Target'] = temp['target_user_id'].apply(lambda x: id_to_idx[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'Number of rts': int,
        'Normal weight': float,
        'source_user_id': int,
        'target_user_id': int
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    non_singletons_temp = set(temp['Source']).union(set(temp['Target']))
    if len(non_singletons_temp) > len(master_id):
        print(d_e)
        break
    elif len(non_singletons_temp) == len(master_id):
        print(f"Entre {d_s} y {d_e} Todos los nodos no singletons Retwittearon")
    else:
        pass
    
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source_Target", 'starting_' + str(d_s.date()) + ".csv"), index = False, sep = ";")

#del d_s, d_e

61it [06:16,  6.17s/it]


In [11]:
# Example of our lists
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-06-16.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(set(ej_csv['Source'])):,}. Nodos Target {len(set(ej_csv['Target'])):,}. Total de Nodos {len(nodes):,}")
ej_csv.head(10)

 Numero de Arcos: 234,657. Nodos Source: 24,254. Nodos Target 8,981. Total de Nodos 26,015


Unnamed: 0,Source,Target,source_user_id,target_user_id,Number of rts,Normal weight,source_political_afilliation,target_political_afilliation,source_label,target_label
0,1,1140,12996,53855557,1,0.028,Izquierda,Centro,alerios,CathyJuvinao
1,1,1712,12996,62528273,1,0.028,Izquierda,Centro,alerios,AntonioSanguino
2,1,1939,12996,66740100,1,0.028,Izquierda,No Retweets,alerios,ZuluagaCamila
3,1,3504,12996,98781946,2,0.056,Izquierda,Izquierda,alerios,IvanCepedaCast
4,1,3782,12996,104622520,1,0.028,Izquierda,Izquierda,alerios,valentinabz
5,1,6170,12996,142448269,1,0.028,Izquierda,Izquierda,alerios,maryluzherran
6,1,7882,12996,165748292,1,0.028,Izquierda,Centro,alerios,angelamrobledo
7,1,9727,12996,201256928,1,0.028,Izquierda,Izquierda,alerios,marthaperaltae
8,1,10570,12996,221466950,4,0.111,Izquierda,Izquierda,alerios,wilsonariasc
9,1,11057,12996,233001872,1,0.028,Izquierda,Izquierda,alerios,jorgerojas2022


In [12]:

files = glob('../../../Data/3_Day_Graphs/Source_Target/starting_*.csv')
for file in tqdm(files):
    starting_date_str = file.split('.')[-2].split('/')[-1].split('_')[-1]
    starting_date = datetime.strptime(starting_date_str, '%Y-%m-%d')
    ending_date = starting_date + timedelta(days=3)
    ending_date_str = ending_date.strftime('%Y-%m-%d')
    
    csv = pd.read_csv(file, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weights'] = normal_weight
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    # # # Initialice Directed Graph
    # # g = gt.load_graph_from_csv(file,
    # #                            directed=True,
    # #                            eprop_types=['float',"float"],
    # #                            eprop_names=['Number of rts', 'Normal Weights'],
    # #                            skip_first=True,
    # #                            hashed=True,
    # #                            hash_type='int',
    # #                            csv_options={'delimiter': ';'})
    
    # # Rename VertexPropertyMap with Master Id Index
    # g.vertex_properties['Master Index'] = g.vertex_properties['name']
    # del g.vertex_properties['name']

    # Create a Vertex property map for labels
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('int64_t')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v in g.vertices():
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_starting_date = g.new_graph_property('string')
    graph_ending_date = g.new_graph_property('string')
    graph_starting_date[g] = starting_date_str
    graph_ending_date[g] = ending_date_str
    g.gp['Starting Date'] = graph_starting_date
    g.gp['Ending Date'] = graph_ending_date

    # Save Graphs
    filename = os.path.join(save_path, 'starting_' + starting_date_str + ".graphml")
    g.save(filename)


100%|██████████| 61/61 [07:30<00:00,  7.38s/it]


In [29]:
ej_csv.tail(10)

Unnamed: 0,Source,Target,source_user_id,target_user_id,Number of rts,Normal weight,source_political_afilliation,target_political_afilliation,source_label,target_label
718218,36963,13153,1389784145417678848,286774097,1,0.029,Izquierda,Izquierda,kars0518,Fergicienta
718219,36963,13557,1389784145417678848,296607146,1,0.029,Izquierda,Derecha,kars0518,Caromunozb
718220,36963,18510,1389784145417678848,537428771,1,0.029,Izquierda,Izquierda,kars0518,angustiadiurna
718221,36963,19969,1389784145417678848,715002912,2,0.059,Izquierda,Izquierda,kars0518,adolforivas115
718222,36963,21808,1389784145417678848,1131820958,1,0.029,Izquierda,Izquierda,kars0518,alejarojas_g
718223,36963,23944,1389784145417678848,2221941088,1,0.029,Izquierda,Izquierda,kars0518,Juanaafanador1
718224,36963,27033,1389784145417678848,3883905634,1,0.029,Izquierda,No Retweets,kars0518,EspectadorVideo
718225,36963,29307,1389784145417678848,847118149262741504,1,0.029,Izquierda,Izquierda,kars0518,LaJulietaMM
718226,36963,30226,1389784145417678848,915615485147910144,1,0.029,Izquierda,Izquierda,kars0518,cuestion_p
718227,36963,31708,1389784145417678848,1010113508233699328,3,0.088,Izquierda,Izquierda,kars0518,LevyRincon


In [17]:
print('Edge List')
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(save_path + f'/starting_2021-05-04.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

Edge List
Non Isolate Vertices 32,886 and 718,228 edges
Graph
Non Isolate Vertices 32,886 and 718,228 edges


In [24]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23639
1920140406
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23639
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                     1920140406
Fechas Source                       40
Fechas Target                       62
Name: 23639, dtype: object

In [32]:
# Checking properties of Vertex
test_id_source = 1389784145417678848
test_id_target = 1010113508233699328

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]


test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weights'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (36963, 31708)
Edge In graph: (36963, 31708)
Normal Weights 0.088
Number of rts 3
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,Number of rts,Normal weight,source_political_afilliation,target_political_afilliation,source_label,target_label
718227,36963,31708,1389784145417678848,1010113508233699328,3,0.088,Izquierda,Izquierda,kars0518,LevyRincon


## Daily retweet network (with no rolling window)

In [3]:
save_path = "../../../Data/Daily_graphs"

In [4]:
tweets_paro = glob('/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_*')
tweets_paro

['/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_4.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_5.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_3.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_2.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_1.gzip']

In [5]:
# Import the tweets from Paro but select only the retweets
retweets = pd.DataFrame()
for file in tqdm(tweets_paro):
    tweets = pd.read_pickle(file, compression = "gzip")

    # Select only retweets
    rts = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
    rts = rts.drop(columns = 'Reference Type')
    retweets = pd.concat([retweets, rts], axis = 0)
retweets = retweets.reset_index(drop = True)
del rts, tweets 
print('Shape:', retweets.shape)

100%|██████████| 5/5 [03:45<00:00, 45.03s/it]


Shape: (30918011, 24)


In [6]:
users = set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID']))
print(f'Tenemos: {retweets.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')

Tenemos: 30,918,011 Retweets y 624,358 Usario con Rts o Retwiteados


In [7]:
# Fix data types
retweets["ID"] = retweets["ID"].astype(int)
retweets["Author ID"] = retweets["Author ID"].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)
retweets['Referenced Tweet'] = retweets['Referenced Tweet'].astype(int)

# Remove time from retweets date
retweets["Date"] = pd.to_datetime(retweets["Date"]).dt.date

### Create Source-Target DataFrame

In [8]:
save_path = "../../Data/Daily_graphs"

In [9]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have retweets from 2021-04-28 to 2021-06-29


In [10]:
users_information = pd.read_pickle("../../../Data/Tweets_DataFrames/users_information.gzip", compression = "gzip")

In [11]:
# Label Referenced Tweet Author
retweets = users_information.reset_index()[["Author ID", "Author Name"]] \
    .rename(columns = {"Author ID": "Referenced Tweet Author ID", 
                       "Author Name": "Referenced Tweet Author Name"}) \
                       .merge(retweets, how = "right", on = "Referenced Tweet Author ID")

In [12]:
# Users without name
retweets.iloc[:, 0:2].drop_duplicates().isna().sum()

Referenced Tweet Author ID           0
Referenced Tweet Author Name    587246
dtype: int64

In [None]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    temp = retweets[retweets['Date'] == d.date()]
    temp = temp.groupby(["Author ID", "Author Name", "Date", "Referenced Tweet Author ID", "Referenced Tweet Author Name"]).size().reset_index(name = "w")
    temp.columns = ["source", "source_label", "date", "target", "target_label", "w"]
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d.date()) + ".csv"), index = False, sep = ";")

### Create networks

In [13]:
# Node list
nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))
len(nodes_id)

624358

In [14]:
users_information.shape

(37344, 6)

In [15]:
users_information = users_information.reset_index()

In [16]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

In [17]:
# Convert users_information DataFrame to a dictionary for fast lookup
users_info_dict = users_information.set_index('Author ID')['Author Name'].to_dict()

nodes_label = []
nodes_political_affilitation = []
for i in tqdm(nodes_id):
    # Assign label
    label = users_info_dict.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_label.append(label)

    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'label': nodes_label,
    'pa': nodes_political_affilitation
}

# Save as pickle
with open("../../../Data/Daily_graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

100%|██████████| 624358/624358 [00:00<00:00, 1256998.88it/s]


In [18]:
len(nodes_dict["id"])

624358

In [19]:
# Identify indices where "pa" is not nan
indices_to_keep = [i for i, pa_value in enumerate(nodes_dict["pa"]) if str(pa_value) != "nan"]

# Create a new dictionary with filtered values
nodes_dict_filtered = {key: [value[i] for i in indices_to_keep] for key, value in nodes_dict.items()}

# Fix idx
nodes_dict_filtered["idx"] = [i for i in range(num_nodes)]
# Fix id
nodes_dict_filtered["id"] = [int(i) for i in nodes_dict_filtered["id"]]

In [20]:
num_nodes = len(nodes_dict_filtered["id"])
num_nodes

37112

In [21]:
len(nodes_dict_filtered["id"])

37112

In [22]:
retweets['Author ID'] = retweets['Author ID'].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)

In [25]:
nodes_dict_filtered["id"][0]

12

In [28]:
retweets_filtered = retweets.copy()
retweets_filtered = retweets_filtered.loc[retweets['Author ID'].isin(nodes_dict_filtered["id"]),:]
retweets_filtered = retweets_filtered.loc[retweets['Referenced Tweet Author ID'].isin(nodes_dict_filtered["id"]),:].reset_index(drop = True)

In [34]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict_filtered['id'])}

for d in tqdm(daily_grid):
    # Filter retweets first
    retweets_filtered2 = retweets_filtered[retweets_filtered['Date'] == d.date()]
    
    # Create weights
    test = retweets_filtered2.groupby(['Author ID', 'Referenced Tweet Author ID']).size().reset_index(name="w")
    
    # Use dictionary lookups for index mappings
    test['idx_s'] = [id_to_idx.get(x, np.nan) for x in test['Author ID']]
    test['idx_t'] = [id_to_idx.get(x, np.nan) for x in test['Referenced Tweet Author ID']]
    
    # The edge list should be based on the idx of the nodes and include weights
    edges_list_with_weights = list(zip(test['idx_s'].dropna().astype(int), test['idx_t'].dropna().astype(int), test['w']))
    
    # Create graph
    g = gt.Graph(directed=True)
    g.add_vertex(num_nodes)
    
    # Node attributes
    node_id = g.new_vertex_property("string", vals=nodes_dict_filtered['id'])
    node_label = g.new_vertex_property("string", vals=nodes_dict_filtered['label'])
    node_affiliation = g.new_vertex_property("string", vals=nodes_dict_filtered['pa'])
    
    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Label"] = node_label
    g.vertex_properties["Political Affiliation"] = node_affiliation
    
    # Edge weight attribute
    edge_weights = g.new_edge_property("int")
    
    # Add edges and assign weights
    for source, target, weight in edges_list_with_weights:
        e = g.add_edge(source, target)
        edge_weights[e] = weight

    g.edge_properties["weight"] = edge_weights
    
    g.save(os.path.join("../../../Data/Daily_graphs/Full network", str(d.date()) + ".graphml"))

  3%|▎         | 2/63 [00:07<03:56,  3.88s/it]

In [None]:
print(g)
print(g.vp["ID"][624353])
print(g.vp["Label"][624353])
print(g.vp["Political Affiliation"][624353])