In [26]:
import pandas as pd
import numpy as np
import pickle
import os
from tqdm import tqdm
import scipy.sparse as sp
from scipy.sparse import lil_matrix
from scipy.sparse import csr_matrix
from scipy.sparse import find
import scipy.sparse

In [5]:
# Load the two pickled tables: the tweet-level table and the per-user retweet summary.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
tweets, rts_usuario = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '../fcastrillon/Data/tweets_lite.pkl',
        '../fcastrillon/Data/rts_usuario.pkl',
    )
)

In [8]:
# Report the dimensions of the tweets table, then display its first rows.
print('Shape:', tweets.shape)
tweets.head()

Shape: (45330426, 4)


Unnamed: 0,Author ID,Date,Reference Type,Referenced Tweet Author ID
0,1.000014e+18,2021/06/28 08:17:49,retweeted,352373166.0
1,1.000014e+18,2021/06/25 12:00:06,retweeted,14834302.0
2,1.000014e+18,2021/06/25 11:52:30,retweeted,528290945.0
3,1.000014e+18,2021/06/24 17:49:16,retweeted,753376280.0
4,1.000014e+18,2021/06/24 15:21:04,retweeted,132102878.0


In [9]:

# Report the dimensions of the per-user retweet table, then display its first rows.
print('Shape: ', rts_usuario.shape)
rts_usuario.head()

Shape:  (37308, 9)


Unnamed: 0_level_0,Retweets Derecha,Retweets Izquierda,Retweets Centro,Retweets Totales,Dummy Derecha,Dummy Izquierda,Dummy Centro,No Retweets,Afiliacion
Author ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
12996.0,1,321,109,431,1,1,1,0,Retweets Izquierda
777978.0,1,1,1,3,1,1,1,0,Retweets Centro
784125.0,0,46,10,56,0,1,1,0,Retweets Izquierda
1061601.0,0,223,3,226,0,1,1,0,Retweets Izquierda
1488031.0,0,0,0,0,0,0,0,1,No Retweets


In [10]:
# Helper that tells whether a matrix holds at least one nonzero entry.
def is_matrix_nonzero(matrix):
    """Return True when `matrix` contains at least one nonzero element.

    Parameters
    ----------
    matrix : object exposing ``nonzero()``
        Anything whose ``nonzero()`` returns a sequence whose first item
        holds the row positions of the nonzero entries (NumPy arrays and
        SciPy sparse matrices both do).

    Returns
    -------
    bool
        True if any nonzero entry exists, False otherwise.
    """
    nonzero_rows = matrix.nonzero()[0]
    return len(nonzero_rows) != 0

In [13]:
# Parse the timestamps; entries that cannot be parsed are coerced to NaT.
tweets['Date'] = pd.to_datetime(tweets['Date'], errors = 'coerce')

# Sorted, de-duplicated author IDs, converted to plain Python ints.
users = [int(author_id) for author_id in np.unique(tweets['Author ID'].values)]

# Date boundaries for the Paro Nacional period. `date1` and `date2` are
# daily sequences of equal length; they are paired element-wise by the loop
# that builds the adjacency matrices.
v1_start, v1_end = '2021-04-28 00:00:00', '2021-06-27 00:00:00'
v2_start, v2_end = '2021-04-30 23:59:59', '2021-06-29 23:59:59'
date1 = pd.date_range(v1_start, v1_end, freq = 'D')
date2 = pd.date_range(v2_start, v2_end, freq = 'D')

# Map each user ID to its position in `users`.
user_indices = dict(zip(users, range(len(users))))


In [24]:
# Serialize the user ID -> index mapping to disk with pickle.
with open('../fcastrillon/Data/user_indices', 'wb') as out_file:
    pickle.dump(user_indices, out_file)

In the _for loop_ below we build the adjacency matrices used to construct the graphs.

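Each cell _RT<sub>i,j</sub>_ is the number of tweets that user _i_ retweeted from user _j_. Note that the code below also increments _RT<sub>j,i</sub>_ for every such retweet, so the stored matrix is symmetric.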

This process is repeated for the tweets posted in each 3-day window during the Paro Nacional; the window slides forward one day at a time.

Each adjacency matrix is stored in the `Matrices` folder inside `Data`.

In [62]:
# Build one retweet adjacency matrix per date window and save each one to disk.
#
# For every (start_date, end_date) pair, retweets between users present in
# `user_indices` are counted into a square sparse matrix. A retweet of user j
# by user i adds 1 to both (i, j) and (j, i), so every matrix is symmetric.
n_users = len(users)
# zip() stops at the shorter of the two ranges; giving tqdm that length lets it
# show progress against a known total instead of an open-ended counter.
n_windows = min(len(date1), len(date2))

k = 1  # 1-based suffix used in the output file name of the current window
for start_date, end_date in tqdm(zip(date1, date2), total = n_windows):
    # Tweets whose timestamp lies in the closed interval [start_date, end_date].
    test = tweets[(tweets['Date'] >= start_date) & (tweets['Date'] <= end_date)]

    # Keep only retweets whose original author is also one of `users`, and
    # only the two ID columns needed to place an edge.
    rts = test.loc[(test["Reference Type"] == "retweeted") & (test["Referenced Tweet Author ID"].isin(users)),
                   ["Author ID", "Referenced Tweet Author ID"]]

    # Shorter column names for the rest of the loop body.
    new_column_names = {'Author ID':'user1', 'Referenced Tweet Author ID':'user2'}
    rts = rts.rename(columns = new_column_names)

    # Translate user IDs into matrix positions. The lookup goes through the
    # dict directly so that an ID missing from `user_indices` raises KeyError
    # instead of being silently dropped.
    n_rts = len(rts)
    idx_user1 = np.fromiter((user_indices[u] for u in rts['user1']), dtype = np.int64, count = n_rts)
    idx_user2 = np.fromiter((user_indices[u] for u in rts['user2']), dtype = np.int64, count = n_rts)

    # Assemble every entry at once instead of incrementing a LIL matrix cell
    # by cell: each retweet contributes (i, j) and (j, i), and repeated
    # coordinates are summed when the COO matrix is converted to CSR.
    rows = np.concatenate([idx_user1, idx_user2])
    cols = np.concatenate([idx_user2, idx_user1])
    # float64 is the default dtype of scipy's lil_matrix, which this loop used
    # before; keeping it leaves the dtype of the saved matrices unchanged.
    ones = np.ones(rows.shape[0], dtype = np.float64)
    A = sp.coo_matrix((ones, (rows, cols)), shape = (n_users, n_users)).tocsr()

    if is_matrix_nonzero(A):
        print("Matrix is nonzero")
    else:
        print("Matrix is zero")

    # The matrix is sparse, so it is saved in scipy's sparse .npz format
    # (save_npz appends '.npz' to a file name that does not already end in it).
    filename = f'../fcastrillon/Data/Matrices/adj_{k}.csr'
    sp.save_npz(filename, A, compressed = False)
    k += 1

1it [00:08,  8.70s/it]

Matrix is nonzero


2it [00:18,  9.45s/it]

Matrix is nonzero


3it [00:30, 10.43s/it]

Matrix is nonzero


4it [00:42, 11.09s/it]

Matrix is nonzero


5it [00:55, 11.82s/it]

Matrix is nonzero


6it [01:09, 12.56s/it]

Matrix is nonzero


7it [01:23, 12.90s/it]

Matrix is nonzero


8it [01:34, 12.49s/it]

Matrix is nonzero


9it [01:44, 11.75s/it]

Matrix is nonzero


10it [01:54, 11.24s/it]

Matrix is nonzero


11it [02:04, 10.86s/it]

Matrix is nonzero


12it [02:14, 10.44s/it]

Matrix is nonzero


13it [02:22,  9.80s/it]

Matrix is nonzero


14it [02:30,  9.21s/it]

Matrix is nonzero


15it [02:38,  8.84s/it]

Matrix is nonzero


16it [02:46,  8.53s/it]

Matrix is nonzero


17it [02:53,  8.13s/it]

Matrix is nonzero


18it [03:00,  7.89s/it]

Matrix is nonzero


19it [03:08,  7.70s/it]

Matrix is nonzero


20it [03:16,  7.79s/it]

Matrix is nonzero


21it [03:23,  7.79s/it]

Matrix is nonzero


22it [03:31,  7.77s/it]

Matrix is nonzero


23it [03:39,  7.68s/it]

Matrix is nonzero


24it [03:46,  7.62s/it]

Matrix is nonzero


25it [03:54,  7.76s/it]

Matrix is nonzero


26it [04:03,  7.93s/it]

Matrix is nonzero


27it [04:11,  8.21s/it]

Matrix is nonzero


28it [04:20,  8.33s/it]

Matrix is nonzero


29it [04:29,  8.55s/it]

Matrix is nonzero


30it [04:38,  8.67s/it]

Matrix is nonzero


31it [04:47,  8.73s/it]

Matrix is nonzero


32it [04:55,  8.57s/it]

Matrix is nonzero


33it [05:03,  8.32s/it]

Matrix is nonzero


34it [05:10,  7.97s/it]

Matrix is nonzero


35it [05:16,  7.48s/it]

Matrix is nonzero


36it [05:22,  7.03s/it]

Matrix is nonzero


37it [05:28,  6.58s/it]

Matrix is nonzero


38it [05:33,  6.21s/it]

Matrix is nonzero


39it [05:38,  5.87s/it]

Matrix is nonzero


40it [05:44,  5.70s/it]

Matrix is nonzero


41it [05:49,  5.66s/it]

Matrix is nonzero


42it [05:55,  5.70s/it]

Matrix is nonzero


43it [06:01,  5.69s/it]

Matrix is nonzero


44it [06:06,  5.48s/it]

Matrix is nonzero


45it [06:10,  5.20s/it]

Matrix is nonzero


46it [06:14,  4.95s/it]

Matrix is nonzero


47it [06:19,  4.76s/it]

Matrix is nonzero


48it [06:23,  4.66s/it]

Matrix is nonzero


49it [06:28,  4.58s/it]

Matrix is nonzero


50it [06:32,  4.49s/it]

Matrix is nonzero


51it [06:36,  4.36s/it]

Matrix is nonzero


52it [06:40,  4.21s/it]

Matrix is nonzero


53it [06:44,  4.19s/it]

Matrix is nonzero


54it [06:49,  4.31s/it]

Matrix is nonzero


55it [06:53,  4.43s/it]

Matrix is nonzero


56it [06:57,  4.35s/it]

Matrix is nonzero


57it [07:01,  4.17s/it]

Matrix is nonzero


58it [07:05,  4.08s/it]

Matrix is nonzero


59it [07:09,  3.98s/it]

Matrix is nonzero


60it [07:13,  3.92s/it]

Matrix is nonzero


61it [07:17,  7.17s/it]

Matrix is nonzero



