# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from tqdm import tqdm
import graph_tool.all as gt
import glob



In [72]:
save_path = "../../../Data/3_Day_Graphs"

## Retweet network with a 3-day rolling window

In [5]:
tweets = pd.read_pickle('../../../Data/Tweets_DataFrames/tweets_lite.gzip', compression='gzip')
print(f'Cantidad de filas (Tweets): {tweets.shape[0]:,}. Cantidad de Columans: {tweets.shape[1]:,}')
print('Total usuarios: ' + f"{len(set(tweets['Author ID']).union(set(tweets['Referenced Tweet Author ID']))):,}")
tweets.head()

Cantidad de filas (Tweets): 45,330,718. Cantidad de Columans: 4
Total usuarios: 5,759,089


Unnamed: 0,Author ID,Date,Reference Type,Referenced Tweet Author ID
0,9.180596e+17,2021/06/29 23:37:59,retweeted,141943900.0
1,9.180596e+17,2021/06/29 23:37:16,retweeted,8.628063e+17
2,9.180596e+17,2021/06/29 23:31:36,retweeted,1.402301e+18
3,9.180596e+17,2021/06/29 23:30:41,retweeted,382419800.0
4,9.180596e+17,2021/06/29 23:29:39,retweeted,14834300.0


In [6]:
# We will just get the Retweets
retweets_total = tweets[tweets['Reference Type'] == 'retweeted']
retweets_total = retweets_total.drop(columns=['Reference Type'])
retweets_total["Date"] = pd.to_datetime(retweets_total["Date"]).dt.date
retweets_total = retweets_total.sort_values('Referenced Tweet Author ID').reset_index(drop = True)

users = set(retweets_total['Author ID']).union(set(retweets_total['Referenced Tweet Author ID']))
print(f'Tenemos: {retweets_total.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')
del tweets, users

# This will be our Sorce-Target List. Will include the weights of each tweets
retweets_total.head()

Tenemos: 30,918,011 Retweets y 624,358 Usario con Rts o Retwiteados


Unnamed: 0,Author ID,Date,Referenced Tweet Author ID
0,373097280.0,2021-06-09,12.0
1,411495598.0,2021-06-10,12.0
2,411495598.0,2021-06-10,12.0
3,94589151.0,2021-06-04,12.0
4,36158466.0,2021-06-04,12.0


In [7]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets_total["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets_total["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes)):,} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

affilliation_df = pd.DataFrame(list(user_to_party_paro.items()), columns=['User ID', 'Political Affiliation'])
affilliation_df['Political Affiliation'].value_counts(dropna=False)

Faltan clasificar 587,246 usuarios


Political Affiliation
NaN                   587246
Retweets Izquierda     23138
Retweets Derecha        6812
No Retweets             3844
Retweets Centro         3543
Name: count, dtype: int64

In [39]:
#del affilliation_df,ids_faltantes, ids_faltantes1, ids_faltantes2, usuario
# Importamos master ids para filtrar
# Bring Master ID file
users_information = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/users_information.gzip', compression='gzip')
master_id = users_information.reset_index()[['Author ID', 'Author Name']]
master_id = master_id.rename(columns={
    'Author ID': 'ID',
    'Author Name': 'Label'
})

# dict for color
color = {
    'Retweets Izquierda': (0,0,1),
    'Retweets Derecha': (1,0,0),
    'Retweets Centro': (0,1,0),
    'No Retweets': (0,0,0)
}

master_id['Political Affiliation'] = master_id['ID'].apply(lambda x: user_to_party_paro[x])
master_id['Color'] = master_id['Political Affiliation'].apply(lambda x: color[x])
master_id = master_id.drop_duplicates(subset = 'ID')
master_id

Unnamed: 0,ID,Label,Political Affiliation,Color
0,0.000000e+00,0,No Retweets,"(0, 0, 0)"
1,1.000000e+00,0,No Retweets,"(0, 0, 0)"
2,2.000000e+00,0,No Retweets,"(0, 0, 0)"
3,3.000000e+00,0,No Retweets,"(0, 0, 0)"
5,4.000000e+00,0,No Retweets,"(0, 0, 0)"
...,...,...,...,...
37339,1.389722e+18,Neoplasticista,Retweets Izquierda,"(0, 0, 1)"
37340,1.389737e+18,JC13177979,Retweets Derecha,"(1, 0, 0)"
37341,1.389741e+18,JhonatanVRojo,No Retweets,"(0, 0, 0)"
37342,1.389769e+18,VaneLen18,Retweets Izquierda,"(0, 0, 1)"


In [11]:
# Adding Source Labels
temp = retweets_total.merge(master_id, how = 'inner', left_on = 'Author ID', right_on='ID')
temp = temp.rename(columns = {'Label': 'Source Label', 'Political Affiliation': 'Source PA'}).drop(columns='ID')

# Adding Target Labels
retweets = temp.merge(master_id, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='ID')
retweets = retweets.rename(columns = {'Label': 'Target Label', 'Political Affiliation': 'Target PA'})
retweets = retweets.drop(columns= 'ID')

# Count how many tweets and users we have now
users = list(set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID'])))
print(f'Cantidad de filas (Tweets): {retweets.shape[0]:,}. Cantidad de Columns: {retweets.shape[1]:,}')
print(f'Tenemos: {retweets.shape[0]:,} Retweets y {len(users):,} Usuarios con Rts o Retwiteados')

# Save this Dataframe for Nodes List
master_id = master_id[master_id['ID'].isin(users)] # Filter in Retweets DataFrame
master_id = master_id.astype({
    'ID': int,
    'Label': str,
    'Political Affiliation': str
}).reset_index(drop=True)

master_id.to_csv(os.path.join(save_path, 'Nodes' + ".csv"), index = False, sep = ";")
id_to_label = master_id.set_index('ID').to_dict()['Label']
#del temp, retweets_total, users
retweets

Cantidad de filas (Tweets): 13,784,608. Cantidad de Columns: 7
Tenemos: 13,784,608 Retweets y 36,964 Usuarios con Rts o Retwiteados


Unnamed: 0,Author ID,Date,Referenced Tweet Author ID,Source Label,Source PA,Target Label,Target PA
0,3.730973e+08,2021-06-09,1.200000e+01,Keevinmg,Retweets Izquierda,0,No Retweets
1,3.730973e+08,2021-06-10,1.200000e+01,Keevinmg,Retweets Izquierda,0,No Retweets
2,4.114956e+08,2021-06-10,1.200000e+01,CamiloGuerreroB,Retweets Derecha,0,No Retweets
3,4.114956e+08,2021-06-10,1.200000e+01,CamiloGuerreroB,Retweets Derecha,0,No Retweets
4,4.114956e+08,2021-06-16,1.200000e+01,CamiloGuerreroB,Retweets Derecha,0,No Retweets
...,...,...,...,...,...,...,...
13784603,1.248691e+18,2021-05-25,1.248691e+18,DobleMoralCol,No Retweets,DobleMoralCol,No Retweets
13784604,1.342872e+18,2021-05-03,1.342872e+18,DiegoARamirezH8,No Retweets,DiegoARamirezH8,No Retweets
13784605,1.342872e+18,2021-06-03,1.342872e+18,DiegoARamirezH8,No Retweets,DiegoARamirezH8,No Retweets
13784606,1.342872e+18,2021-06-08,1.342872e+18,DiegoARamirezH8,No Retweets,DiegoARamirezH8,No Retweets


In [40]:
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')

print(f'primer nodo: {min(users)} último nodo: {max(users)}')
print(f'Total de nodos en la muestra: {len(users):,}')

del nodes_no_receipt, nodes_no_send, self_loops, users

Nodos Retweeteados por alguien pero que no Retweetean: 6,510
Nodos que Retweetean a alguien pero no son Retweeteados: 833
Nodos que tienen un self loop: 7
primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en la muestra: 36,964


In [32]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-05-01 23:59:59'
v2_end = '2021-06-30 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

In [42]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Nodes.csv', sep = ';')
index = master_id['ID'].reset_index()
# Useful Dicts
idx_to_id = master_id['ID'].reset_index().to_dict()['ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Dict for lookup in Graph
id_to_idx = {id:idx for idx, id in idx_to_id.items()}

# dict for color
color = {
    'Retweets Izquierda': (0,0,1),
    'Retweets Derecha': (1,0,0),
    'Retweets Centro': (0,1,0),
    'No Retweets': (0,0,0)
}
index

Unnamed: 0,index,ID
0,0,12
1,1,12996
2,2,777978
3,3,784125
4,4,1061601
...,...,...
36959,36959,1389721694961651712
36960,36960,1389737202742071296
36961,36961,1389741234370064384
36962,36962,1389769251704147968


In [34]:
# Runtime 5 minutes
for d_s, d_e in tqdm(zip(date_start, date_end)):
    # Get 3 days Retweets
    window = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] < d_e.date())] 

    # Get total of Tweets during that period
    rts_per_user= window.groupby('Author ID').size().reset_index(name = 'total')

    # Create Normalize Weight
    temp = window.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number of rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal weight'] = temp['number of rts']/temp['total']
    temp['normal weight'] = temp['normal weight'].round(3)
    temp.columns = ["Source ID", "Target ID", "Number of rts", "Total of rts", "Normal weight"]

    # Assign political affiliation
    temp["source_political_afilliation"] = temp["Source ID"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["Target ID"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['Source ID'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['Target ID'].apply(lambda x: id_to_label[x])

    # Add ending date
    temp['Ending date'] = d_e.date()
    
    # Replace Source IDs for Indexes
    temp = temp.merge(index, how = 'left', left_on='Source ID', right_on = 'ID')
    temp = temp.rename(columns = {'index': 'Source Index'}).drop(columns=['ID'])
    
    # Replace Target IDs for Indexes
    temp = temp.merge(index, how = 'left', left_on='Target ID', right_on = 'ID')
    temp = temp.rename(columns = {'index': 'Target Index'}).drop(columns=['ID'])

    #Adding dtypes
    temp = temp.astype({
        'Source ID': int,
        'Target ID': int,
        'Number of rts': int,
        'Total of rts': int,
        'Normal weight': float,
        'Source Index': int,
        'Target Index': int
    })
    temp = temp[['Source Index', 'Target Index', 'Number of rts', 'Total of rts', 'Normal weight', 'Ending date']] 
    
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source_Target", str(d_e.date()) + ".csv"), index = False, sep = ";")

del d_s, d_e

61it [05:43,  5.62s/it]


In [51]:
# Example of our lists
ej_csv = pd.read_csv(save_path + '/Source_Target/2021-05-16.csv', sep = ';')
nodes = set(set(ej_csv['Source Index'].unique()).union(set(ej_csv['Target Index'].unique())))
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(ej_csv['Source Index'].unique()):,}. Nodos Target {len(ej_csv['Target Index'].unique()):,}. Total de Nodos {len(nodes):,}")
ej_csv.head()

 Numero de Arcos: 441,858. Nodos Source: 28,348. Nodos Target 11,445. Total de Nodos 29,985


Unnamed: 0,Source Index,Target Index,Number of rts,Total of rts,Normal weight,Ending date
0,1,27,8,214,0.037,2021-05-16
1,1,105,13,214,0.061,2021-05-16
2,1,191,1,214,0.005,2021-05-16
3,1,227,1,214,0.005,2021-05-16
4,1,255,6,214,0.028,2021-05-16


In [None]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Nodes.csv', sep = ';')
index = master_id['ID'].reset_index()
# Useful Dicts
idx_to_id = master_id['ID'].reset_index().to_dict()['ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Dict for lookup in Graph
id_to_idx = {id:idx for idx, id in idx_to_id.items()}

In [186]:
# Create Graph Runtime 5 minutes
files = glob.glob('../../../Data/3_Day_Graphs/Source_Target/*.csv')

for file in tqdm(files):
    date = file.split('.')[-2].split('/')[-1]

    # Initialice Directed Graph
    g = gt.load_graph_from_csv(file,
                               directed=True,
                               eprop_types=['int','int',"float",'string'],
                               eprop_names=['Number fo rts','Total of rts', 'Normal Weights', 'Ending date'],
                               skip_first=True,
                               hashed=False,
                               hash_type='int',
                               csv_options={'delimiter': ';'})

    # Create a Vertex property map for labels
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('int64_t')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('vector<double>')

    # Assign Labels to vertices using loop
    vertex_list = g.get_vertices()
    for v in vertex_list:
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
    del vertex_list

    # # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map

    # Save Graphs
    filename = os.path.join(save_path, date + ".graphml")
    g.save(filename)

100%|██████████| 61/61 [04:33<00:00,  4.48s/it]


In [187]:
# Example of our graphs
ej_g = gt.load_graph(save_path + f'/2021-05-28.graphml')
ej_g

<Graph object, directed, with 36964 vertices and 470286 edges, 4 internal vertex properties, 4 internal edge properties, at 0x7f313eb41e50>

In [189]:
# Checking properties of Vertex
test_id = 20560294
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
master_id.iloc[test_idx]

Idx in Graph: 255
20560294
WRadioColombia
No Retweets


ID                             20560294
Label                    WRadioColombia
Political Affiliation       No Retweets
Name: 255, dtype: object

In [211]:
ej_csv = pd.read_csv(save_path + '/Source_Target/2021-05-28.csv', sep = ';')

# Checking properties of Vertex
test_id_source = 784125
test_id_target = 110010608

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]

test_edge_idx = (source_idx, target_idx)

print(f'Edge in Graph: {test_edge_idx}')
print(f"normal Weight {ej_g.ep['Normal Weights'][test_edge_idx]}")
print('-----------------')
print('Value in Edge List')
ej_csv['Normal weight'][(ej_csv['Source Index'] == source_idx) & (ej_csv['Target Index'] == target_idx)]

Edge in Graph: (3, 4040)
normal Weight 0.1
-----------------
Value in Edge List


54    0.1
Name: Normal weight, dtype: float64

## Daily retweet network (with no rolling window)

In [None]:
tweets_paro = glob.glob('/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_*')
tweets_paro

In [None]:
# Import the tweets from Paro but select only the retweets
retweets = pd.DataFrame()
for file in tqdm(tweets_paro):
    tweets = pd.read_pickle(file, compression = "gzip")

    # Select only retweets
    rts = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
    rts = rts.drop(columns = 'Reference Type')
    retweets = pd.concat([retweets, rts], axis = 0)
retweets = retweets.reset_index(drop = True)
del rts, tweets 
print('Shape:', retweets.shape)

In [None]:
retweets.head()

In [None]:
# Fix data types
retweets["ID"] = retweets["ID"].astype(int)
retweets["Author ID"] = retweets["Author ID"].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)
retweets['Referenced Tweet'] = retweets['Referenced Tweet'].astype(int)

# Remove time from retweets date
retweets["Date"] = pd.to_datetime(retweets["Date"]).dt.date

### Create Source-Target DataFrame

In [None]:
save_path = "../../Data/Daily_graphs"

In [None]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

In [None]:
users_information = pd.read_pickle("../../../Data/Tweets_DataFrames/users_information.gzip", compression = "gzip")

In [None]:
users_information

In [None]:
# Label Referenced Tweet Author
retweets = users_information.reset_index()[["Author ID", "Author Name"]] \
    .rename(columns = {"Author ID": "Referenced Tweet Author ID", 
                       "Author Name": "Referenced Tweet Author Name"}) \
                       .merge(retweets, how = "right", on = "Referenced Tweet Author ID")

In [None]:
retweets.head()

In [None]:
# Users without name
retweets.iloc[:, 0:2].drop_duplicates().isna().sum()

In [None]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    temp = retweets[retweets['Date'] == d.date()]
    temp = temp.groupby(["Author ID", "Author Name", "Date", "Referenced Tweet Author ID", "Referenced Tweet Author Name"]).size().reset_index(name = "w")
    temp.columns = ["source", "source_label", "date", "target", "target_label", "w"]
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d.date()) + ".csv"), index = False, sep = ";")

### Create networks

In [None]:
# Node list
nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))
len(nodes_id)

In [None]:
users_information.shape

In [None]:
users_information = users_information.reset_index()

In [None]:
# Convert users_information DataFrame to a dictionary for fast lookup
users_info_dict = users_information.set_index('Author ID')['Author Name'].to_dict()

nodes_label = []
nodes_political_affilitation = []
for i in tqdm(nodes_id):
    # Assign label
    label = users_info_dict.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_label.append(label)

    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'label': nodes_label,
    'pa': nodes_political_affilitation
}

# Save as pickle
with open("../../../Data/Daily_graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

In [None]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict['id'])}

for d in tqdm(daily_grid):
    # Filter retweets first
    filtered_retweets = retweets[retweets['Date'] == d.date()]
    
    # Create weights
    test = filtered_retweets.groupby(['Author ID', 'Referenced Tweet Author ID']).size().reset_index(name="w")
    
    # Use dictionary lookups for index mappings
    test['idx_s'] = [id_to_idx.get(x, np.nan) for x in test['Author ID']]
    test['idx_t'] = [id_to_idx.get(x, np.nan) for x in test['Referenced Tweet Author ID']]
    
    # The edge list should be based on the idx of the nodes and include weights
    edges_list_with_weights = list(zip(test['idx_s'].dropna().astype(int), test['idx_t'].dropna().astype(int), test['w']))
    
    # Create graph
    g = gt.Graph(directed=True)
    g.add_vertex(num_nodes)
    
    # Node attributes
    node_id = g.new_vertex_property("string", vals=nodes_dict['id'])
    node_label = g.new_vertex_property("string", vals=nodes_dict['label'])
    node_affiliation = g.new_vertex_property("string", vals=nodes_dict['pa'])
    
    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Label"] = node_label
    g.vertex_properties["Political Affiliation"] = node_affiliation
    
    # Edge weight attribute
    edge_weights = g.new_edge_property("int")
    
    # Add edges and assign weights
    for source, target, weight in edges_list_with_weights:
        e = g.add_edge(source, target)
        edge_weights[e] = weight

    g.edge_properties["weight"] = edge_weights
    
    g.save(os.path.join(save_path, str(d.date()) + ".graphml"))

In [None]:
print(g)
print(g.vp["ID"][624353])
print(g.vp["Label"][624353])
print(g.vp["Political Affiliation"][624353])