# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
from tqdm import tqdm
import graph_tool.all as gt
import glob

In [10]:
save_path = "../../../Data/3_Day_Graphs"

## Retweet network with a 3-day rolling window

In [None]:
tweets = pd.read_pickle('../../../Data/Tweets_DataFrames/tweets_lite.gzip', compression='gzip')
print(f'Cantidad de filas (Tweets): {tweets.shape[0]:,}. Cantidad de Columns: {tweets.shape[1]:,}')
print('Total usuarios: ' + f"{len(set(tweets['Author ID']).union(set(tweets['Referenced Tweet Author ID']))):,}")
tweets.head()

In [8]:
# We will just get the Retweets
retweets_total = tweets[tweets['Reference Type'] == 'retweeted']
retweets_total = retweets_total.drop(columns=['Reference Type'])
retweets_total["Date"] = pd.to_datetime(retweets_total["Date"]).dt.date
retweets_total = retweets_total.sort_values('Referenced Tweet Author ID').reset_index(drop = True)

users = set(retweets_total['Author ID']).union(set(retweets_total['Referenced Tweet Author ID']))
print(f'Tenemos: {retweets_total.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')
del tweets, users
# This will be our Sorce-Target List. Will include the weights of each tweets
retweets_total.head()

Tenemos: 30,918,011 Retweets y 624,358 Usario con Rts o Retwiteados


Unnamed: 0,Author ID,Date,Referenced Tweet Author ID
0,373097280.0,2021-06-09,12.0
1,411495598.0,2021-06-10,12.0
2,411495598.0,2021-06-10,12.0
3,94589151.0,2021-06-04,12.0
4,36158466.0,2021-06-04,12.0


In [9]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets_total["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets_total["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes)):,} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

df = pd.DataFrame(list(user_to_party_paro.items()), columns=['User ID', 'Political Affiliation'])
df['Political Affiliation'].value_counts(dropna=False)
del df, ids_faltantes, ids_faltantes1, ids_faltantes2, usuario

Faltan clasificar 587,246 usuarios


In [10]:
# Importamos master ids para filtrar
# Bring Master ID file
users_information = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/users_information.gzip', compression='gzip')
master_id = pd.DataFrame(users_information.index.get_level_values('Author ID'))

# Filter by master Id
temp = retweets_total.merge(master_id, how = 'inner', on = 'Author ID')
retweets = temp.merge(master_id, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='Author ID')
print(f'Cantidad de filas (Tweets): {retweets.shape[0]:,}. Cantidad de Columns: {retweets.shape[1]:,}')
retweets = retweets.drop(columns= 'Author ID_y')
retweets = retweets.rename(columns={'Author ID_x': 'Author ID'})

# Count how many tweets and users we have now
users = set(list(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID'])))
print(f'Tenemos: {retweets.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')
retweets.head()

Cantidad de filas (Tweets): 13,784,630. Cantidad de Columns: 4
Tenemos: 13,784,630 Retweets y 36,964 Usario con Rts o Retwiteados


Unnamed: 0,Author ID,Date,Referenced Tweet Author ID
0,373097280.0,2021-06-09,12.0
1,373097280.0,2021-06-09,12.0
2,373097280.0,2021-06-10,12.0
3,373097280.0,2021-06-10,12.0
4,411495598.0,2021-06-10,12.0


In [11]:
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')

nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))
nodes_id.sort()
print(f'primer nodo: {nodes_id[0]} último nodo: {nodes_id[-1]}')
print(f'Total de nodos en la muestra: {len(nodes_id):,}')

del nodes_no_receipt, nodes_no_send, self_loops

Nodos Retweeteados por alguien pero que no Retweetean: 6,510
Nodos que Retweetean a alguien pero no son Retweeteados: 833
Nodos que tienen un self loop: 3
primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en la muestra: 36,964


In [12]:
nodes_political_affilitation = []

for i in tqdm(nodes_id):
    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'pa': nodes_political_affilitation,
}

# Save as pickle
with open("../../../Data/3_Day_Graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

del i, num_nodes, pa, f

100%|██████████| 36964/36964 [00:00<00:00, 1589760.91it/s]


In [13]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-04-30 23:59:59'
v2_end = '2021-06-29 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

In [14]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict['id'])}
id_to_idx

# Runtime 1 Hour!!!!!
for d_s, d_e in tqdm(zip(date_start, date_end)):
    # Get 3 days Retweets
    window = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] <= d_e.date())] 

    # Get total of Tweets during that period
    rts_per_user= window.groupby('Author ID').size().reset_index(name = 'total')

    # Create Normalize Weight
    temp = window.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number of rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal weight'] = temp['number of rts']/temp['total']
    temp.columns = ["source", "target", "number of rts", "total", "normal weight"]

    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
        
    # Save results as csv (1 min. each aprox)
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d_e.date()) + ".csv"), index = False, sep = ";")

del d_s, d_e

61it [06:09,  6.06s/it]


In [5]:
# Example of our lists
save_path = "../../../Data/3_Day_Graphs"
ej_csv = pd.read_csv(save_path + '/Source-Target/2021-04-30.csv', sep = ';')
ej_csv.dropna(inplace=True)
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(ej_csv.source.unique()):,}. Nodos Target {len(ej_csv.target.unique()):,}")
ej_csv.head()

 Numero de Arcos: 466,447. Nodos Source: 28,403. Nodos Target 12,584


Unnamed: 0,source,target,number of rts,total,normal weight,source_political_afilliation,target_political_afilliation
0,777978.0,135629634.0,1,1,1.0,Retweets Centro,No Retweets
1,784125.0,58956408.0,1,22,0.045455,Retweets Izquierda,Retweets Centro
2,784125.0,61028380.0,1,22,0.045455,Retweets Izquierda,Retweets Izquierda
3,784125.0,82531058.0,1,22,0.045455,Retweets Izquierda,Retweets Izquierda
4,784125.0,108371496.0,1,22,0.045455,Retweets Izquierda,Retweets Izquierda


In [17]:
def partir_y_consultar(lista,k):
    import math
    concat = []

    N = len(lista)
    parte = math.floor(N/k)
    slices = [i for i in range(0,N+1,parte)]
    for i in range(len(slices)-1):
        if slices[i+1] == slices[-1]:
            sub = lista[slices[i]:]
        else:
            sub = lista[slices[i]:slices[i+1]]
        concat.append(sub)
        
    return concat

In [8]:
# Create Graph
files = glob.glob('../../../Data/3_Day_Graphs/Source-Target/*.csv')

# Open dict for any case
with open('/mnt/disk2/Data/3_Day_Graphs/nodes_dict.pkl', 'rb') as file:
    nodes_dict = pickle.load(file)

for file in tqdm(files):
    # Read Graph
    date = file.split('/')[-1].split('.')[0]
    temp = pd.read_csv(file, sep = ';')
    print('FASE 1 CHECK')
    # Initialice Directed Graph
    g = gt.Graph(directed = True)
    print('FASE 2 CHECK')
    # Create Source Target Partition
    source_target = [row for row in temp[['source','target']].itertuples(name = None, index=False)]
    print('FASE 3.1 CHECK')
    g.add_edge_list(source_target)
    # partition = partir_y_consultar(source_target, 200000)
    # print('FASE 3.2 CHECK')
    # del source_target
    
    # x = 1
    # n = 0
    # # Add source Target by peaces
    # for i in partition:
    #     if x < 10:
    #         g.add_edge_list(i)
    #         print(f"Termina iteración {x}")
    #         print()
    #         x+=1
    #     else:
    #         g.add_edge_list(i)
    # del i,x,partition
    
    print('FASE 3.3 CHECK')
    # Create an edge property map for weights
    edge_weight_map = g.new_edge_property("double")

    # Create a Vertex property map for labels
    vertex_label_map = g.new_vertex_property('string')

    # Assign weights to the edges using a for loop
    edge_list = g.get_edges()
    weights = list(temp['normal weight'])

    for i, e in enumerate(edge_list):
        #print(f'Property of {e} is {weights[i]}. Type of Edge: {type(e)}')
        edge_weight_map[e] = weights[i]
    
    del edge_list, weights

    # Assign Labels to vertices using loop
    vertex_list = g.get_vertices()
    labels = nodes_dict['pa']
    for i,v in enumerate(vertex_list):
        print(i,v)
        print(f'Property of {v} is {labels[i]}. Type of Edge: {type(v)}')
        label_of_v = nodes_dict['id'].index[v]
        vertex_label_map[v] = labels[label_of_v]
    
    del vertex_list, labels

    # Add properties
    g.vp['Political Label'] = vertex_label_map
    g.ep['Normal Weights'] = edge_weight_map

    # Save Graphs
    filename = os.path.join(save_path, date + ".graphml")
    g.save(filename)
    print(f"Successfully Saved file {filename} SIIIIIUUUUU")

  0%|          | 0/61 [00:00<?, ?it/s]

FASE 1 CHECK
FASE 2 CHECK
FASE 3.1 CHECK


  0%|          | 0/61 [01:07<?, ?it/s]


MemoryError: 

In [None]:
# Example of our graphs
ej_g = gt.load_graph(save_path + f'/{str(date_end[0].date())}.graphml')
ej_g

In [None]:
import graph_tool.all as gt

# Create a graph
g = gt.Graph(directed=False)
v1 = g.add_vertex()
v2 = g.add_vertex()
v3 = g.add_vertex()
v4 = g.add_vertex()

# Add edges
e1 = g.add_edge(v1, v2)
e2 = g.add_edge(v2, v3)
e3 = g.add_edge(v3, v4)

# Create an edge property map for weights
edge_weight_map = g.new_edge_property("double")

# Create a Vertex property map for labels
vertex_label_map = g.new_vertex_property('string')

# Assign weights to the edges using a for loop
edge_list = g.get_edges()
weights = [5.0, 3.2, 7.1]  # Example weights
for i, e in enumerate(edge_list):
    print(f'Property of {e} is {weights[i]}. Type of Edge: {type(e)}')
    edge_weight_map[e] = weights[i]

# Assign Labels to vertices using loop
vertex_list = g.get_vertices()
labels = ['par', 'impar', 'par', 'impar']
for i,v in enumerate(vertex_list):
    print(i,v)
    print(f'Property of {v} is {labels[i]}. Type of Edge: {type(v)}')
    vertex_label_map[v] = labels[i]

# Access and print the weights for each edge
for v in g.vertices():
    print(f"Label of Vertex {v}: {vertex_label_map[v]}")

g.vp['Label'] = vertex_label_map
g.vp['Label']

## Daily retweet network (with no rolling window)

In [None]:
tweets_paro = glob.glob('/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_*')
tweets_paro

In [None]:
# Import the tweets from Paro but select only the retweets
retweets = pd.DataFrame()
for file in tqdm(tweets_paro):
    tweets = pd.read_pickle(file, compression = "gzip")

    # Select only retweets
    rts = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
    rts = rts.drop(columns = 'Reference Type')
    retweets = pd.concat([retweets, rts], axis = 0)
retweets = retweets.reset_index(drop = True)
del rts, tweets 
print('Shape:', retweets.shape)

In [None]:
retweets.head()

In [None]:
# Fix data types
retweets["ID"] = retweets["ID"].astype(int)
retweets["Author ID"] = retweets["Author ID"].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)
retweets['Referenced Tweet'] = retweets['Referenced Tweet'].astype(int)

# Remove time from retweets date
retweets["Date"] = pd.to_datetime(retweets["Date"]).dt.date

In [None]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("/mnt/disk2/Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes))} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

### Create Source-Target DataFrame

In [None]:
save_path = "../../Data/Daily_graphs"

In [None]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

In [None]:
users_information = pd.read_pickle("../../../Data/Tweets_DataFrames/users_information.gzip", compression = "gzip")

In [None]:
users_information

In [None]:
# Label Referenced Tweet Author
retweets = users_information.reset_index()[["Author ID", "Author Name"]] \
    .rename(columns = {"Author ID": "Referenced Tweet Author ID", 
                       "Author Name": "Referenced Tweet Author Name"}) \
                       .merge(retweets, how = "right", on = "Referenced Tweet Author ID")

In [None]:
retweets.head()

In [None]:
# Users without name
retweets.iloc[:, 0:2].drop_duplicates().isna().sum()

In [None]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    temp = retweets[retweets['Date'] == d.date()]
    temp = temp.groupby(["Author ID", "Author Name", "Date", "Referenced Tweet Author ID", "Referenced Tweet Author Name"]).size().reset_index(name = "w")
    temp.columns = ["source", "source_label", "date", "target", "target_label", "w"]
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d.date()) + ".csv"), index = False, sep = ";")

### Create networks

In [None]:
# Node list
nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))
len(nodes_id)

In [None]:
users_information.shape

In [None]:
users_information = users_information.reset_index()

In [None]:
# Convert users_information DataFrame to a dictionary for fast lookup
users_info_dict = users_information.set_index('Author ID')['Author Name'].to_dict()

nodes_label = []
nodes_political_affilitation = []
for i in tqdm(nodes_id):
    # Assign label
    label = users_info_dict.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_label.append(label)

    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'label': nodes_label,
    'pa': nodes_political_affilitation
}

# Save as pickle
with open("../../../Data/Daily_graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

In [None]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict['id'])}

for d in tqdm(daily_grid):
    # Filter retweets first
    filtered_retweets = retweets[retweets['Date'] == d.date()]
    
    # Create weights
    test = filtered_retweets.groupby(['Author ID', 'Referenced Tweet Author ID']).size().reset_index(name="w")
    
    # Use dictionary lookups for index mappings
    test['idx_s'] = [id_to_idx.get(x, np.nan) for x in test['Author ID']]
    test['idx_t'] = [id_to_idx.get(x, np.nan) for x in test['Referenced Tweet Author ID']]
    
    # The edge list should be based on the idx of the nodes and include weights
    edges_list_with_weights = list(zip(test['idx_s'].dropna().astype(int), test['idx_t'].dropna().astype(int), test['w']))
    
    # Create graph
    g = gt.Graph(directed=True)
    g.add_vertex(num_nodes)
    
    # Node attributes
    node_id = g.new_vertex_property("string", vals=nodes_dict['id'])
    node_label = g.new_vertex_property("string", vals=nodes_dict['label'])
    node_affiliation = g.new_vertex_property("string", vals=nodes_dict['pa'])
    
    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Label"] = node_label
    g.vertex_properties["Political Affiliation"] = node_affiliation
    
    # Edge weight attribute
    edge_weights = g.new_edge_property("int")
    
    # Add edges and assign weights
    for source, target, weight in edges_list_with_weights:
        e = g.add_edge(source, target)
        edge_weights[e] = weight

    g.edge_properties["weight"] = edge_weights
    
    g.save(os.path.join(save_path, str(d.date()) + ".graphml"))

In [None]:
print(g)
print(g.vp["ID"][624353])
print(g.vp["Label"][624353])
print(g.vp["Political Affiliation"][624353])