# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

In [29]:
import pandas as pd
import numpy as np
import pickle
import os
from tqdm import tqdm
import scipy.sparse as sp
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import find
import scipy.sparse
import graph_tool.all as gt
import glob

## Retweet network with a 3-day rolling window

In [3]:
tweets = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/tweets_lite.gzip', compression='gzip')
print('Shape:', tweets.shape)
tweets.head()

Shape: (45330718, 4)


Unnamed: 0,Author ID,Date,Reference Type,Referenced Tweet Author ID
0,9.180596e+17,2021/06/29 23:37:59,retweeted,141943900.0
1,9.180596e+17,2021/06/29 23:37:16,retweeted,8.628063e+17
2,9.180596e+17,2021/06/29 23:31:36,retweeted,1.402301e+18
3,9.180596e+17,2021/06/29 23:30:41,retweeted,382419800.0
4,9.180596e+17,2021/06/29 23:29:39,retweeted,14834300.0


In [4]:
# We define a function which returns a Boolean specifying if matrix is Non Zero
def is_matrix_nonzero(matrix):
    return len(matrix.nonzero()[0]) > 0

We create a Dictionary with all the Author IDs and their indexes in the Dataframe. This will help us query the Dataframe for the Tweets and ReTweets of each user

In [7]:
tweets['Date'] = pd.to_datetime(tweets['Date'], errors = 'coerce')

# List of Twitter users
users = np.unique(tweets[['Author ID']].values)
print(f"there are some nan's values. Proof: {users[-1]} at -1" )
users = users[:-1]
users = [ int(x) for x in users ]

# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date1 = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

v2_start = '2021-04-30 23:59:59'
v2_end = '2021-06-29 23:59:59'
date2 = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

user_indices = {user: idx for idx, user in enumerate(users)}
datestr = list(date2.strftime("%d-%m-%Y"))

there are some nan's values. Proof: nan at -1


In [11]:
# We save this file for further usage
with open('../../../Data/Pickle/user_indices.pkl', 'wb') as file:
    pickle.dump(user_indices, file)

In [30]:
with open('../../../Data/Pickle/user_to_party_paro.pkl', 'rb') as file:
    user_to_party_paro = pickle.load(file)

Each matrix contains the Retweet Network in windows of 3 days between April 28 and June 27 of 2023

In this _for loop_ we create the adjacency matrix for constructing the graph.

Each cell _RT<sub>i,j</sub>_ is the amount of Tweets the _i_ user Retweeted from the _j_ user.

This process is done for all the tweets done in intervals of 3 days during the Paro Nacional.

The Adjacency Matrix will be stored in the Matrices folder of Data

In [None]:
k = 0
os.chdir('../../../Data/Matrices/')
for start_date, end_date in tqdm(zip(date1, date2)):
    # get tweets by current day between start_date and end_date
    test = tweets[(tweets['Date'] >= start_date) & (tweets['Date'] <= end_date)]

    # 'rts' dataframe contains the Author ID and the Referenced Author ID in the 
    # timeframe we are interested.
    rts = test.loc[(test["Reference Type"] == "retweeted") & (test["Referenced Tweet Author ID"].isin(users)),
                                                    ["Author ID", "Referenced Tweet Author ID"]]

    # We rename the 'rts' dataframe columns for code easyness now.
    new_column_names = {'Author ID':'user1', 'Referenced Tweet Author ID':'user2'}
    rts = rts.rename(columns = new_column_names)
    
    # Because of the data structure, we use a sparse matrix.
    A = sp.csr_matrix((len(users), len(users)), dtype = int)
    lil = lil_matrix(A.shape)

    for row in rts.itertuples(index = False):
        user1, user2 = row.user1, row.user2
    
        idx_user1 = user_indices[user1]
        idx_user2 = user_indices[user2]

        lil[idx_user1, idx_user2] += 1
        lil[idx_user2, idx_user1] += 1

    if is_matrix_nonzero(lil):
        pass
    else:
        print("Matrix is zero")
    
    # This matrices are sparse. Therefore, we save it as such.
    A = lil.tocsr()
    filename = f'adj_end_of_{datestr[k]}.csr'
    sp.save_npz(filename, A, compressed = False)
    k += 1

## Daily retweet network (with no rolling window)

In [12]:
tweets_paro = glob.glob('/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_*')
tweets_paro

['/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_4.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_5.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_3.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_2.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_1.gzip']

In [13]:
# Import the tweets from Paro but select only the retweets
retweets = pd.DataFrame()
for file in tqdm(tweets_paro):
    tweets = pd.read_pickle(file, compression = "gzip")
    # Select only retweets
    rts = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
    rts = rts.drop(columns = 'Reference Type')
    retweets = pd.concat([retweets, rts], axis = 0)
retweets = retweets.reset_index(drop = True)
del rts, tweets 
print('Shape:', retweets.shape)

100%|██████████| 5/5 [02:44<00:00, 32.85s/it]


Shape: (30918011, 14)


In [14]:
retweets.head()

Unnamed: 0,ID,Author ID,Author Name,Date,Text,Replies,Retweets,Favorites,Quotes,is Retweet?,Reply To User Name,Mentions,Referenced Tweet,Referenced Tweet Author ID
0,1.409944e+18,325932532.0,wilman_86,2021/06/29 13:38:42,RT @udsnoexisten: Feliz almuerzo para todos ht...,0.0,131.0,0.0,0.0,True,,udsnoexisten,1.409943e+18,21952960.0
1,1.409726e+18,325932532.0,wilman_86,2021/06/28 23:09:28,RT @charlasamenas63: El Profesionalismo de jug...,0.0,1.0,0.0,0.0,True,,charlasamenas63,1.409726e+18,9.273397e+17
2,1.409498e+18,325932532.0,wilman_86,2021/06/28 08:05:01,RT @elyrxc: What's up bro... 🙊 😍🤣 https://t.co...,0.0,1.0,0.0,0.0,True,,elyrxc,1.409352e+18,1.402825e+18
3,1.409266e+18,325932532.0,wilman_86,2021/06/27 16:41:47,RT @Lidio_Dominante: Aquí la derecha debe ente...,0.0,84.0,0.0,0.0,True,,Lidio_Dominante,1.409202e+18,1.391738e+18
4,1.408904e+18,325932532.0,wilman_86,2021/06/26 16:46:05,RT @Lidio_Dominante: No hay mérito en que teng...,0.0,17.0,0.0,0.0,True,,Lidio_Dominante,1.408853e+18,1.391738e+18


In [15]:
# Fix data types
retweets["ID"] = retweets["ID"].astype(int)
retweets["Author ID"] = retweets["Author ID"].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)
retweets['Referenced Tweet'] = retweets['Referenced Tweet'].astype(int)
# Remove time from retweets date
retweets["Date"] = pd.to_datetime(retweets["Date"]).dt.date

In [18]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("/mnt/disk2/Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(ids_faltantes)} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

Faltan clasificar 598057 usuarios


### Create Source-Target DataFrame

In [21]:
save_path = "../../../Data/Daily_graphs"

In [17]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have retweets from 2021-04-28 to 2021-06-29


In [21]:
users_information = pd.read_pickle("../../../Data/Tweets_DataFrames/users_information.gzip", compression = "gzip")

In [41]:
# Label Referenced Tweet Author
retweets = users_information.reset_index()[["Author ID", "Author Name"]] \
    .rename(columns = {"Author ID": "Referenced Tweet Author ID", 
                       "Author Name": "Referenced Tweet Author Name"}) \
                       .merge(retweets, how = "right", on = "Referenced Tweet Author ID")

In [45]:
# Users without name
retweets.iloc[:, 0:2].drop_duplicates().isna().sum()

Referenced Tweet Author ID           0
Referenced Tweet Author Name    587246
dtype: int64

In [46]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    temp = retweets[retweets['Date'] == d.date()]
    temp = temp.groupby(["Author ID", "Author Name", "Date", "Referenced Tweet Author ID", "Referenced Tweet Author Name"]).size().reset_index(name = "w")
    temp.columns = ["source", "source_label", "date", "target", "target_label", "w"]
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d.date()) + ".csv"), index = False, sep = ";")

100%|██████████| 63/63 [03:11<00:00,  3.04s/it]


### Create networks

In [189]:
# Node list
nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))

In [72]:
users_information = users_information.reset_index()

In [207]:
# Convert users_information DataFrame to a dictionary for fast lookup
users_info_dict = users_information.set_index('Author ID')['Author Name'].to_dict()

nodes_label = []
nodes_political_affilitation = []
for i in tqdm(nodes_id):
    # Assign label
    label = users_info_dict.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_label.append(label)

    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'label': nodes_label,
    'pa': nodes_political_affilitation
}

# Save as pickle
with open("../../../Data/Daily_graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

  0%|          | 0/624358 [00:00<?, ?it/s]

100%|██████████| 624358/624358 [00:01<00:00, 429591.22it/s]


In [215]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict['id'])}

for d in tqdm(daily_grid):
    # Filter retweets first
    filtered_retweets = retweets[retweets['Date'] == d.date()]
    
    # Create weights
    test = filtered_retweets.groupby(['Author ID', 'Referenced Tweet Author ID']).size().reset_index(name="w")
    
    # Use dictionary lookups for index mappings
    test['idx_s'] = [id_to_idx.get(x, np.nan) for x in test['Author ID']]
    test['idx_t'] = [id_to_idx.get(x, np.nan) for x in test['Referenced Tweet Author ID']]
    
    # The edge list should be based on the idx of the nodes and include weights
    edges_list_with_weights = list(zip(test['idx_s'].dropna().astype(int), test['idx_t'].dropna().astype(int), test['w']))
    
    # Create graph
    g = gt.Graph(directed=True)
    g.add_vertex(num_nodes)
    
    # Node attributes
    node_id = g.new_vertex_property("string", vals=nodes_dict['id'])
    node_label = g.new_vertex_property("string", vals=nodes_dict['label'])
    node_affiliation = g.new_vertex_property("string", vals=nodes_dict['pa'])
    
    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Label"] = node_label
    g.vertex_properties["Political Affiliation"] = node_affiliation
    
    # Edge weight attribute
    edge_weights = g.new_edge_property("int")
    
    # Add edges and assign weights
    for source, target, weight in edges_list_with_weights:
        e = g.add_edge(source, target)
        edge_weights[e] = weight

    g.edge_properties["weight"] = edge_weights
    
    g.save(os.path.join(save_path, str(d.date()) + ".graphml"))

 33%|███▎      | 21/63 [19:38<36:55, 52.75s/it]

In [None]:
print(g)
print(g.vp["ID"][624353])
print(g.vp["Label"][624353])
print(g.vp["Political Affiliation"][624353])

<Graph object, directed, with 624358 vertices and 460845 edges, 3 internal vertex properties, at 0x7f08b17f8150>
113246190
renlopezs
Retweets Centro
