# Adjacency Matrices

In this Notebook, we create the adjacency matrices for the creation of the graph.

In [1]:
import pandas as pd
import numpy as np
import pickle
import os
from tqdm import tqdm
import scipy.sparse as sp
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import find
import scipy.sparse
import graph_tool.all as gt

Here We import the necessary data and then look it up

In [2]:
tweets = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/tweets_lite.pkl')

In [3]:
print('Shape:', tweets.shape)
tweets.head()

Shape: (45330426, 4)


Unnamed: 0,Author ID,Date,Reference Type,Referenced Tweet Author ID
0,1.000014e+18,2021/06/28 08:17:49,retweeted,352373166.0
1,1.000014e+18,2021/06/25 12:00:06,retweeted,14834302.0
2,1.000014e+18,2021/06/25 11:52:30,retweeted,528290945.0
3,1.000014e+18,2021/06/24 17:49:16,retweeted,753376280.0
4,1.000014e+18,2021/06/24 15:21:04,retweeted,132102878.0


## Retweet network with a 3-day rolling window

In [4]:
# We define a function which returns a Boolean specifying if matrix is Non Zero
def is_matrix_nonzero(matrix):
    return len(matrix.nonzero()[0]) > 0

We create a Dictionary with all the Author IDs and their indexes in the Dataframe. This will help us query the Dataframe for the Tweets and ReTweets of each user

In [5]:
tweets['Date'] = pd.to_datetime(tweets['Date'], errors = 'coerce')

# List of Twitter users
users = np.unique(tweets[['Author ID']].values)
users = [ int(x) for x in users ]

# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date1 = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

v2_start = '2021-04-30 23:59:59'
v2_end = '2021-06-29 23:59:59'
date2 = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

user_indices = {user: idx for idx, user in enumerate(users)}
datestr = list(date2.strftime("%d-%m-%Y"))

In [None]:
# We save this file for further usage
with open('../fcastrillon/Data/user_indices.pkl', 'wb') as file:
    pickle.dump(user_indices, file)

Each matrix contains the Retweet Network in windows of 3 days between April 28 and June 27 of 2023

In this _for loop_ we create the adjacency matrix for constructing the graph.

Each cell _RT<sub>i,j</sub>_ is the amount of Tweets the _i_ user Retweeted from the _j_ user.

This process is done for all the tweets done in intervals of 3 days during the Paro Nacional.

The Adjacency Matrix will be stored in the Matrices folder of Data

In [None]:
k = 0
os.chdir('../Matrices/')
for start_date, end_date in tqdm(zip(date1, date2)):
    # get tweets by current day between start_date and end_date
    test = tweets[(tweets['Date'] >= start_date) & (tweets['Date'] <= end_date)]

    # 'rts' dataframe contains the Author ID and the Referenced Author ID in the 
    # timeframe we are interested.
    rts = test.loc[(test["Reference Type"] == "retweeted") & (test["Referenced Tweet Author ID"].isin(users)),
                                                    ["Author ID", "Referenced Tweet Author ID"]]

    # We rename the 'rts' dataframe columns for code easyness now.
    new_column_names = {'Author ID':'user1', 'Referenced Tweet Author ID':'user2'}
    rts = rts.rename(columns = new_column_names)
    
    # Because of the data structure, we use a sparse matrix.
    A = sp.csr_matrix((len(users), len(users)), dtype = int)
    lil = lil_matrix(A.shape)

    for row in rts.itertuples(index = False):
        user1, user2 = row.user1, row.user2
    
        idx_user1 = user_indices[user1]
        idx_user2 = user_indices[user2]

        lil[idx_user1, idx_user2] += 1
        lil[idx_user2, idx_user1] += 1

    if is_matrix_nonzero(lil):
        pass
    else:
        print("Matrix is zero")
    
    # This matrices are sparse. Therefore, we save it as such.
    A = lil.tocsr()
    filename = f'adj_end_of_{datestr[k]}.csr'
    sp.save_npz(filename, A, compressed = False)
    k += 1

## Daily retweet network (with no rolling window)

In [5]:
os.listdir('/mnt/disk2/Data/Pickle')

['user_to_party_jan_oct.pkl',
 'mapa.pkl',
 'user_to_party.pkl',
 'rts_usuario.pkl',
 'users_to_date.pkl',
 'rts_usuario_paro.pkl',
 'user_to_party_paro.pkl',
 'rts_usuario_jan_oct.pkl',
 'user_indices.pkl',
 'tweets_jan21.gzip']

In [6]:
# Import tweets from Paro
# tweets = pd.read_pickle(r'/mnt/disk2/Data/Tweets_DataFrames/tweets_Usuarios_V1.gzip')
user_to_party_paro = pd.read_pickle("/mnt/disk2/Data/Pickle/user_to_party_paro.pkl")
# Select only retweets
rtweets = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
rtweets = rtweets.drop(columns = 'Reference Type')
rtweets = rtweets.reset_index(drop = True)
print('Shape:', rtweets.shape)

Shape: (30918011, 3)


In [7]:
# Fix data types
rtweets["Author ID"] = rtweets["Author ID"].astype(int)
rtweets['Referenced Tweet Author ID'] = rtweets['Referenced Tweet Author ID'].astype(int)
# Remove time from rtweets date
rtweets["Date"] = pd.to_datetime(rtweets["Date"]).dt.date

In [8]:
rtweets.head()

Unnamed: 0,Author ID,Date,Referenced Tweet Author ID
0,1000013974610567168,2021-06-28,352373166
1,1000013974610567168,2021-06-25,14834302
2,1000013974610567168,2021-06-25,528290945
3,1000013974610567168,2021-06-24,753376280
4,1000013974610567168,2021-06-24,132102878


In [9]:
ids_faltantes1 = set(rtweets["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(rtweets["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(ids_faltantes)} usuarios")

Faltan clasificar 598057 usuarios


In [10]:
# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

In [11]:
fecha_min = rtweets["Date"].min()
fecha_max = rtweets["Date"].max()
print(f"We have rtweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have rtweets from 2021-04-28 to 2021-06-29


In [12]:
save_path = "../../../Data/Daily_graphs"

In [47]:
os.listdir(save_path)

['2021-04-28.gt', '2021-04-29.gt']

In [53]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    test = rtweets[rtweets['Date'] == d.date()]

    nodes_ids = np.unique(np.concatenate((test["Author ID"].unique(), test['Referenced Tweet Author ID'].unique())))
    num_nodes = len(nodes_ids)
    nodes_idx = [i for i in range(num_nodes)]
    nodes_dict = dict(zip(nodes_ids, nodes_idx))
    political_affiliations = [user_to_party_paro[i] for i in nodes_ids]
    # The edge list should be based on the idx of the nodes, nor in their ids
    edges_list = [(nodes_dict[row['Author ID']], nodes_dict[row['Referenced Tweet Author ID']]) for index, row in test.iterrows()]

    g = gt.Graph(directed=False)

    # Add nodes
    g.add_vertex(num_nodes)

    # Create attributes
    node_id = g.new_vertex_property("string")
    node_affiliation = g.new_vertex_property("string")

    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Political Affiliation"] = node_affiliation

    # Assign attributes to each node
    for i, z in enumerate(zip(nodes_ids, political_affiliations)):
        node_id[g.vertex(i)] = z[0]
        node_affiliation[g.vertex(i)] = z[1]

    g.add_edge_list(edges_list)

    g.save(os.path.join(save_path, str(d.date()) + ".gt"))

100%|██████████| 63/63 [26:48<00:00, 25.53s/it]
