# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [1]:
# Mathematical and Data Managment
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Graph Managment
import graph_tool.all as gt
from utils.Functions import *

# Miscellaneous
import pickle
import os
from tqdm import tqdm
from glob import glob
from concurrent.futures import ProcessPoolExecutor



## Retweet network with a 3-day rolling window

In [2]:
save_path = "../../../Data/3_Day_Graphs"

In [9]:
# Load Tweets
tweets = pd.read_pickle('../../../Data/Tweets_DataFrames/tweets_lite.gzip', compression='gzip')
tweets = tweets.astype({
    'Author ID': float,
    'Referenced Tweet Author ID': float
})
# Fix date
tweets["Date"] = pd.to_datetime(tweets["Date"], errors='coerce').dt.date

# get retweets
retweets = tweets[tweets['Reference Type'] == 'retweeted']

# Count Users and tweets
users_that_retweet = set(retweets['Author ID']).union()
users_retweeted = set(retweets['Referenced Tweet Author ID'])
retweets_users = users_retweeted.union(users_that_retweet)
tweets_users = set(tweets['Author ID']).union(set(tweets['Referenced Tweet Author ID']))

print(f"Tenemos: {len(retweets):,} Retweets, {len(tweets):,} Tweets, {len(retweets_users):,} Rewitteros y {len(tweets_users):,} Twitteros")
del tweets_users, retweets_users, users_retweeted, users_that_retweet
tweets.head()

Tenemos: 30,918,011 Retweets, 45,330,718 Tweets, 624,358 Rewitteros y 5,759,089 Twitteros


Unnamed: 0,Author ID,Author Name,Referenced Tweet Author ID,Date,Reference Type,Referenced Tweet
0,138377765.0,hmauriciojg,134855300.0,2021-06-28,replied_to,1.409586e+18
1,138377765.0,hmauriciojg,1131821000.0,2021-06-28,replied_to,1.409192e+18
2,138377765.0,hmauriciojg,62337500.0,2021-06-27,replied_to,1.409298e+18
3,138377765.0,hmauriciojg,576647400.0,2021-06-22,replied_to,1.407171e+18
4,138377765.0,hmauriciojg,126832600.0,2021-06-21,replied_to,1.40675e+18


In [10]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes)):,} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

affilliation_df = pd.DataFrame(list(user_to_party_paro.items()), columns=['User ID', 'Political Affiliation'])
affilliation_df['Political Affiliation'].value_counts(dropna=False)

Faltan clasificar 587,246 usuarios


Political Affiliation
NaN               587246
Izquierda          23138
Derecha             6812
Sin Clasificar      3844
Centro              3543
Name: count, dtype: int64

In [11]:
# Create Master Id
del affilliation_df,ids_faltantes, ids_faltantes1, ids_faltantes2, usuario
users_information = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/users_information.gzip', compression='gzip')
master_id = users_information.reset_index()[['Author ID', 'Author Name']]
master_id = master_id.rename(columns={
    'Author ID': 'User ID',
    'Author Name': 'Label'
})
del users_information
master_id['Political Affiliation'] = master_id['User ID'].apply(lambda x: user_to_party_paro[x])
print(f"Duplicated IDs: {sum(master_id.duplicated(subset = 'User ID'))}")
master_id = master_id.drop_duplicates(subset = 'User ID')
master_id

Duplicated IDs: 7


Unnamed: 0,User ID,Label,Political Affiliation
0,0.000000e+00,0,Sin Clasificar
1,1.000000e+00,0,Sin Clasificar
2,2.000000e+00,0,Sin Clasificar
3,3.000000e+00,0,Sin Clasificar
5,4.000000e+00,0,Sin Clasificar
...,...,...,...
37339,1.389722e+18,Neoplasticista,Izquierda
37340,1.389737e+18,JC13177979,Derecha
37341,1.389741e+18,JhonatanVRojo,Sin Clasificar
37342,1.389769e+18,VaneLen18,Izquierda


In [12]:
# Adding Source Labels
temp = tweets.merge(master_id, how = 'inner', left_on = 'Author ID', right_on='User ID', validate = 'm:1')
temp = temp.rename(columns = {'Label': 'Source Label', 'Political Affiliation': 'Source PA'}).drop(columns='User ID')

# Adding Target Labels
tweets = temp.merge(master_id, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='User ID', validate = 'm:1')
tweets = tweets.rename(columns = {'Label': 'Target Label', 'Political Affiliation': 'Target PA'})

tweets = tweets.drop(columns= ['User ID'])

# Count how many tweets and users we have now
users_rts = set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID']))
non_singletons = master_id[master_id['User ID'].isin(users_rts)] # Nodes conected by a Rt
singletons = master_id[~master_id['User ID'].isin(users_rts)] # Nodes that aren't in Rts during Paro (Perhaps only Twitted or Retwitted with replies) 

# Insights
print(f"Tenemos: {tweets.shape[0]:,} tweets, {len(tweets[tweets['Reference Type'] == 'retweeted']):,} retweets")
print(f"Tenemos: {len(non_singletons):,} Usuarios con Rts o Retwiteados. Usuarios sin tweets {len(singletons)}")
print(f"Tenemos: {len(tweets['Referenced Tweet'].unique()):,} Tweets referenciados (Originales)")
tweets.head()

Tenemos: 17,761,847 tweets, 13,784,608 retweets
Tenemos: 37,112 Usuarios con Rts o Retwiteados. Usuarios sin tweets 225
Tenemos: 2,309,812 Tweets referenciados (Originales)


Unnamed: 0,Author ID,Author Name,Referenced Tweet Author ID,Date,Reference Type,Referenced Tweet,Source Label,Source PA,Target Label,Target PA
0,138377765.0,hmauriciojg,134855279.0,2021-06-28,replied_to,1.409586e+18,hmauriciojg,Sin Clasificar,DanielSamperO,Centro
1,138377765.0,hmauriciojg,134855279.0,2021-05-27,replied_to,1.397903e+18,hmauriciojg,Sin Clasificar,DanielSamperO,Centro
2,138377765.0,hmauriciojg,134855279.0,2021-05-07,replied_to,1.390699e+18,hmauriciojg,Sin Clasificar,DanielSamperO,Centro
3,138377765.0,hmauriciojg,134855279.0,2021-05-04,replied_to,1.389664e+18,hmauriciojg,Sin Clasificar,DanielSamperO,Centro
4,138377765.0,hmauriciojg,134855279.0,2021-05-04,replied_to,1.389664e+18,hmauriciojg,Sin Clasificar,DanielSamperO,Centro


In [21]:
# Save this Dataframe for Nodes List
master_id = non_singletons # Aislamos a los singletons del Master ID
master_id = master_id.astype({
    'User ID': int,
    'Label': str,
    'Political Affiliation': str
}).reset_index(drop=True)

master_id = master_id.reset_index().rename(columns = {'index': 'ID'})
id_to_label = master_id.set_index('User ID').to_dict()['Label']

master_id = master_id[['ID', 'Label', 'Political Affiliation', 'User ID']]

# Save
master_id.to_csv(os.path.join(save_path, 'Master_Index' + ".csv"), index = False, sep = ";")
master_id

Unnamed: 0,ID,Label,Political Affiliation,User ID
0,0,0,Sin Clasificar,12
1,1,alerios,Izquierda,12996
2,2,ranaberden,Centro,777978
3,3,jsanti,Izquierda,784125
4,4,thisgoblin,Izquierda,1061601
...,...,...,...,...
36959,36959,Neoplasticista,Izquierda,1389721694961651712
36960,36960,JC13177979,Derecha,1389737202742071296
36961,36961,JhonatanVRojo,Sin Clasificar,1389741234370064384
36962,36962,VaneLen18,Izquierda,1389769251704147968


In [22]:
# Insights using only retweets
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')

print(f'primer nodo: {min(users_rts)} último nodo: {max(users_rts):,}')
print(f'Total de nodos en conectados: {len(users_rts):,}')
print(f'Total de nodos Singletons: {len(singletons):,}')
print(f'Total de usuarios que TWITTEARON Durante el paro: {len(singletons) + len(non_singletons):,}')

del nodes_no_receipt, nodes_no_send, self_loops, users_rts

Nodos Retweeteados por alguien pero que no Retweetean: 6,510
Nodos que Retweetean a alguien pero no son Retweeteados: 833
Nodos que tienen un self loop: 10
primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en conectados: 36,964
Total de nodos Singletons: 373
Total de usuarios que TWITTEARON Durante el paro: 37,337


In [36]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Master_Index.csv', sep = ';')

# Useful Dicts
idx_to_id = master_id['User ID'].reset_index().to_dict()['User ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Dict for lookup in Graph
id_to_idx = {id:idx for idx, id in idx_to_id.items()}

# dict for color
color = {
    'Izquierda': 'blue',
    'Derecha': 'red',
    'Centro': 'green',
    'Sin Clasificar': 'gray'
}

In [24]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-05-01 23:59:59'
v2_end = '2021-06-30 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

# date_start = date_start[7:8]
# date_end = date_end[7:8]

In [35]:
def put_idx(x):
    try:
        return int(id_to_idx[x])
    except:
        return -1

# Get the amount of tweets for each day
def create_tweets_per_day(date_tuple):
    d_s, d_e = date_tuple
    window = tweets[(tweets['Date'] >= d_s.date()) & (tweets['Date'] < d_e.date())]
    window_rts = window[window['Reference Type'] == 'retweeted']
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)
    temp = tweets_og_per_user.reset_index(name = 'tweet').rename(columns={'Referenced Tweet Author ID': 'User ID'})

    # Pivot table
    tweets_per_user = window.pivot_table(index='Author ID', columns='Reference Type', aggfunc='size', fill_value=0).reset_index()
    tweets_per_user = tweets_per_user.rename(columns = {'Author ID': 'User ID'})
    tweets_per_user = tweets_per_user.astype({'User ID': int})
    
    # Add Master Id Index and remove people that aren't there
    tweets_per_user['ID'] = tweets_per_user['User ID'].apply(lambda x: put_idx(x))
    tweets_per_user['ID'] = tweets_per_user['ID'].fillna(-1).astype(int)
    tweets_per_user = tweets_per_user[tweets_per_user['ID'] != -1]
    tweets_per_user.columns.name = None
    
    # Add other nodes from master Index
    tweets_per_user = tweets_per_user.merge(master_id[['ID', 'User ID']], on='ID', how = 'outer').drop(columns = 'User ID_x')
    tweets_per_user = tweets_per_user.rename(columns = {'User ID_y': 'User ID'})
    tweets_per_user = tweets_per_user.merge(temp, on='User ID', how = 'outer')
    tweets_per_user.fillna(0, inplace=True)
    tweets_per_user.set_index('ID', inplace=True)
    
    tweets_per_user = tweets_per_user.astype({
        'quoted': int,
        'replied_to': int,
        'retweeted': int,
        'tweet': int,
        'User ID': int
    })
    tweets_per_user = tweets_per_user[['User ID', 'replied_to', 'quoted', 'retweeted', 'tweet']]
 
    # Save file
    filename = os.path.join(save_path, 'Tweets_Per_Day', f'starting_{str(d_s.date())}' + ".csv")
    tweets_per_user.to_csv(filename, sep = ";")
    

def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        executor.map(create_tweets_per_day, dates)

if __name__ == '__main__':
    main()

In [26]:
def create_source_target(date_tuple):
    d_s, d_e = date_tuple
    # Get 3 days Retweets
    window_rts = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] < d_e.date())]
    
    # Get total of Tweets during that period
    rts_per_user= window_rts.groupby('Author ID').size().reset_index(name = 'total')
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)

    # Create Normalize Weight
    temp = window_rts.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number_of_rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal_weight'] = temp['number_of_rts']/temp['total']
    temp['normal_weight'] = temp['normal_weight']
    temp.drop(columns='total', inplace=True)
    temp.rename(columns = {'Author ID': "source_user_id", "Referenced Tweet Author ID": "target_user_id"}, inplace=True)
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: int(id_to_idx[x]))
    temp['Target'] = temp['target_user_id'].apply(lambda x: int(id_to_idx[x]))
    
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'number_of_rts': int,
        'normal_weight': float,
        'source_user_id': int,
        'target_user_id': int
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source_Target", 'starting_' + str(d_s.date()) + ".csv"), index = False, sep = ";")
    
def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        executor.map(create_source_target, dates)

if __name__ == '__main__':
    main()

In [27]:
# Example of our lists
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-06-16.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(set(ej_csv['Source'])):,}. Nodos Target {len(set(ej_csv['Target'])):,}. Total de Nodos {len(nodes):,}")
ej_csv.head(10)

 Numero de Arcos: 234,657. Nodos Source: 24,254. Nodos Target 8,981. Total de Nodos 26,015


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
0,1,1140,12996,53855557,1,0.027778,Izquierda,Centro,alerios,CathyJuvinao
1,1,1712,12996,62528273,1,0.027778,Izquierda,Centro,alerios,AntonioSanguino
2,1,1939,12996,66740100,1,0.027778,Izquierda,Sin Clasificar,alerios,ZuluagaCamila
3,1,3504,12996,98781946,2,0.055556,Izquierda,Izquierda,alerios,IvanCepedaCast
4,1,3782,12996,104622520,1,0.027778,Izquierda,Izquierda,alerios,valentinabz
5,1,6170,12996,142448269,1,0.027778,Izquierda,Izquierda,alerios,maryluzherran
6,1,7882,12996,165748292,1,0.027778,Izquierda,Centro,alerios,angelamrobledo
7,1,9727,12996,201256928,1,0.027778,Izquierda,Izquierda,alerios,marthaperaltae
8,1,10570,12996,221466950,4,0.111111,Izquierda,Izquierda,alerios,wilsonariasc
9,1,11057,12996,233001872,1,0.027778,Izquierda,Izquierda,alerios,jorgerojas2022


In [38]:
def create_graph(file_tuple):
    file1, file2 = file_tuple
    starting_date_str = file1.split('.')[-2].split('/')[-1].split('_')[-1]
    starting_date = datetime.strptime(starting_date_str, '%Y-%m-%d')
    ending_date = starting_date + timedelta(days=3)
    ending_date_str = ending_date.strftime('%Y-%m-%d')
    
    csv = pd.read_csv(file1, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weights'] = normal_weight
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')

    # Create a Vertex property maps
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('int64_t')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    vertex_quoted_map = g.new_vertex_property('int64_t')
    vertex_replies_map = g.new_vertex_property('int64_t')
    vertex_retweeted_map = g.new_vertex_property('int64_t')
    vertex_tweets_map = g.new_vertex_property('int64_t')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map
    
    tweets_per_day = pd.read_csv(file2, delimiter=';', index_col='ID')

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v_index in g.iter_vertices():
        v = g.vertex(v_index)
        # Add tweets information of users
        vertex_quoted_map[v] = tweets_per_day['quoted'].loc[v_index]
        vertex_retweeted_map[v] = tweets_per_day['retweeted'].loc[v_index]
        vertex_replies_map[v] = tweets_per_day['replied_to'].loc[v_index]
        vertex_tweets_map[v] = tweets_per_day['tweet'].loc[v_index]
        
        # Add Master Index Information
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
        # Add categories Information
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        
        # Add isolated
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    g.vp['Replies'] = vertex_replies_map
    g.vp['Retweets'] = vertex_retweeted_map
    g.vp['Quoted'] = vertex_quoted_map
    g.vp['Tweets'] = vertex_tweets_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_starting_date = g.new_graph_property('string')
    graph_ending_date = g.new_graph_property('string')
    graph_starting_date[g] = starting_date_str
    graph_ending_date[g] = ending_date_str
    g.gp['Starting Date'] = graph_starting_date
    g.gp['Ending Date'] = graph_ending_date

    # Save Graphs
    filename = os.path.join(save_path, 'Graphs' ,'starting_' + starting_date_str + ".graphml")
    g.save(filename)


def main():
    source_target_files = glob('../../../Data/3_Day_Graphs/Source_Target/starting_*.csv')
    tweets_files = glob('../../../Data/3_Day_Graphs/Tweets_Per_Day/starting_*.csv')
    source_target_files.sort()
    tweets_files.sort()
    files = list(zip(source_target_files, tweets_files))

    with ProcessPoolExecutor() as executor:
        executor.map(create_graph, files)

if __name__ == '__main__':
    main()

In [39]:
print('Edge List')
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-05-07.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(save_path + f'/Graphs/starting_2021-05-07.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

# Example of Tweets df
ej_t = pd.read_csv(save_path + f'/Tweets_Per_Day/starting_2021-05-07.csv', sep = ';', index_col='ID')
source_nodes_count = sum(1 for v in ej_g.vertices() if v.out_degree() > 0)
print(f"{source_nodes_count:,} Active Nodes on Graph and {len(ej_t[ej_t['retweeted']>0]):,} Active Nodes on df")
ej_t.head()

Edge List
Non Isolate Vertices 31,078 and 547,260 edges
Graph
Non Isolate Vertices 31,078 and 547,260 edges
29,504 Active Nodes on Graph and 29,504 Active Nodes on df


Unnamed: 0_level_0,User ID,replied_to,quoted,retweeted,tweet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,12996,0,0,31,0
2,777978,9,0,1,2
3,784125,0,0,12,0
4,1061601,0,0,21,0
5,1488031,6,1,2,1


In [43]:
dict(ej_g.vp)

{'Centro': <VertexPropertyMap object with value type 'bool', for Graph 0x7fecd07aec10, at 0x7fecd07af050>,
 'Color': <VertexPropertyMap object with value type 'string', for Graph 0x7fecd07aec10, at 0x7fecd07ac850>,
 'Derecha': <VertexPropertyMap object with value type 'bool', for Graph 0x7fecd07aec10, at 0x7fecc7a57a50>,
 'Isolate': <VertexPropertyMap object with value type 'bool', for Graph 0x7fecd07aec10, at 0x7fecc7a57950>,
 'Izquierda': <VertexPropertyMap object with value type 'bool', for Graph 0x7fecd07aec10, at 0x7fecc7a57850>,
 'Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fecd07aec10, at 0x7fecc7a57750>,
 'Political Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fecd07aec10, at 0x7fecc7a57650>,
 'Quoted': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7fecd07aec10, at 0x7fecc7a57550>,
 'Replies': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7fecd07aec10, at 0x7fecc7a57450>,
 'Retweets':

In [9]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23639
1920140406
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23639
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                     1920140406
Name: 23639, dtype: object

In [10]:
# Checking properties of Vertex
test_id_source = 1389784145417678848
test_id_target = 1010113508233699328

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]
test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weights'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (36963, 31708)
Edge In graph: (36963, 31708)
Normal Weights 0.0454545454545454
Number of rts 1
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
547258,36963,31708,1389784145417678848,1010113508233699328,1,0.045455,Izquierda,Izquierda,kars0518,LevyRincon


In [14]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f"Quoted Tweets: {ej_g.vp['Quoted'][test_idx]}")
print(f"Replies Tweets: {ej_g.vp['Replies'][test_idx]}")
print(f"Retweets Tweets: {ej_g.vp['Retweets'][test_idx]}")
print(f"Original Tweets: {ej_g.vp['Tweets'][test_idx]}")
print(f'Idx in Graph: {test_idx}')
print(f"User ID: {ej_g.vp['User ID'][test_idx]}")
print('---------Checking on Tweets Df----------')
ej_t.loc[test_idx]

Quoted Tweets: 3
Replies Tweets: 0
Retweets Tweets: 6
Original Tweets: 21
Idx in Graph: 23639
User ID: 1920140406
---------Checking on Tweets Df----------


User ID       1920140406
replied_to             0
quoted                 3
retweeted              6
tweet                 21
Name: 23639, dtype: int64

## Daily retweet network (with no rolling window)

In [3]:
save_path = "../../../Data/Daily_graphs"

In [4]:
tweets_paro = glob('/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_*')
tweets_paro

['/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_4.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_5.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_3.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_2.gzip',
 '/mnt/disk2/Data/Tweets_DataFrames/tweets_paro_1.gzip']

In [5]:
# Import the tweets from Paro but select only the retweets
retweets = pd.DataFrame()
for file in tqdm(tweets_paro):
    tweets = pd.read_pickle(file, compression = "gzip")

    # Select only retweets
    rts = tweets.loc[tweets["Reference Type"] == "retweeted",:].reset_index(drop = True)
    rts = rts.drop(columns = 'Reference Type')
    retweets = pd.concat([retweets, rts], axis = 0)
retweets = retweets.reset_index(drop = True)
del rts, tweets 
print('Shape:', retweets.shape)

100%|██████████| 5/5 [03:45<00:00, 45.03s/it]


Shape: (30918011, 24)


In [6]:
users = set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID']))
print(f'Tenemos: {retweets.shape[0]:,} Retweets y {len(users):,} Usario con Rts o Retwiteados')

Tenemos: 30,918,011 Retweets y 624,358 Usario con Rts o Retwiteados


In [7]:
# Fix data types
retweets["ID"] = retweets["ID"].astype(int)
retweets["Author ID"] = retweets["Author ID"].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)
retweets['Referenced Tweet'] = retweets['Referenced Tweet'].astype(int)

# Remove time from retweets date
retweets["Date"] = pd.to_datetime(retweets["Date"]).dt.date

### Create Source-Target DataFrame

In [8]:
save_path = "../../Data/Daily_graphs"

In [9]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have retweets from 2021-04-28 to 2021-06-29


In [10]:
users_information = pd.read_pickle("../../../Data/Tweets_DataFrames/users_information.gzip", compression = "gzip")

In [11]:
# Label Referenced Tweet Author
retweets = users_information.reset_index()[["Author ID", "Author Name"]] \
    .rename(columns = {"Author ID": "Referenced Tweet Author ID", 
                       "Author Name": "Referenced Tweet Author Name"}) \
                       .merge(retweets, how = "right", on = "Referenced Tweet Author ID")

In [12]:
# Users without name
retweets.iloc[:, 0:2].drop_duplicates().isna().sum()

Referenced Tweet Author ID           0
Referenced Tweet Author Name    587246
dtype: int64

In [None]:
for d in tqdm(daily_grid):
    # Select the retweets from the desired date
    temp = retweets[retweets['Date'] == d.date()]
    temp = temp.groupby(["Author ID", "Author Name", "Date", "Referenced Tweet Author ID", "Referenced Tweet Author Name"]).size().reset_index(name = "w")
    temp.columns = ["source", "source_label", "date", "target", "target_label", "w"]
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target"].apply(lambda x: user_to_party_paro[x])
    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", str(d.date()) + ".csv"), index = False, sep = ";")

### Create networks

In [13]:
# Node list
nodes_id = list(set(retweets["Author ID"]).union(set(retweets["Referenced Tweet Author ID"])))
len(nodes_id)

624358

In [14]:
users_information.shape

(37344, 6)

In [15]:
users_information = users_information.reset_index()

In [16]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

In [17]:
# Convert users_information DataFrame to a dictionary for fast lookup
users_info_dict = users_information.set_index('Author ID')['Author Name'].to_dict()

nodes_label = []
nodes_political_affilitation = []
for i in tqdm(nodes_id):
    # Assign label
    label = users_info_dict.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_label.append(label)

    # Assign Political Affiliation
    pa = user_to_party_paro.get(i, np.nan) # using .get() for safe dictionary lookup
    nodes_political_affilitation.append(pa)

num_nodes = len(nodes_id)
nodes_idx = [i for i in range(num_nodes)]

# Create dictionary
nodes_dict = {
    'id': nodes_id,
    'idx': nodes_idx,
    'label': nodes_label,
    'pa': nodes_political_affilitation
}

# Save as pickle
with open("../../../Data/Daily_graphs/nodes_dict.pkl", "wb") as f:
    pickle.dump(nodes_dict, f)

100%|██████████| 624358/624358 [00:00<00:00, 1256998.88it/s]


In [18]:
len(nodes_dict["id"])

624358

In [19]:
# Identify indices where "pa" is not nan
indices_to_keep = [i for i, pa_value in enumerate(nodes_dict["pa"]) if str(pa_value) != "nan"]

# Create a new dictionary with filtered values
nodes_dict_filtered = {key: [value[i] for i in indices_to_keep] for key, value in nodes_dict.items()}

# Fix idx
nodes_dict_filtered["idx"] = [i for i in range(num_nodes)]
# Fix id
nodes_dict_filtered["id"] = [int(i) for i in nodes_dict_filtered["id"]]

In [20]:
num_nodes = len(nodes_dict_filtered["id"])
num_nodes

37112

In [21]:
len(nodes_dict_filtered["id"])

37112

In [22]:
retweets['Author ID'] = retweets['Author ID'].astype(int)
retweets['Referenced Tweet Author ID'] = retweets['Referenced Tweet Author ID'].astype(int)

In [25]:
nodes_dict_filtered["id"][0]

12

In [28]:
retweets_filtered = retweets.copy()
retweets_filtered = retweets_filtered.loc[retweets['Author ID'].isin(nodes_dict_filtered["id"]),:]
retweets_filtered = retweets_filtered.loc[retweets['Referenced Tweet Author ID'].isin(nodes_dict_filtered["id"]),:].reset_index(drop = True)

In [34]:
# Create a dictionary for O(1) lookup time of indices
id_to_idx = {node_id: idx for idx, node_id in enumerate(nodes_dict_filtered['id'])}

for d in tqdm(daily_grid):
    # Filter retweets first
    retweets_filtered2 = retweets_filtered[retweets_filtered['Date'] == d.date()]
    
    # Create weights
    test = retweets_filtered2.groupby(['Author ID', 'Referenced Tweet Author ID']).size().reset_index(name="w")
    
    # Use dictionary lookups for index mappings
    test['idx_s'] = [id_to_idx.get(x, np.nan) for x in test['Author ID']]
    test['idx_t'] = [id_to_idx.get(x, np.nan) for x in test['Referenced Tweet Author ID']]
    
    # The edge list should be based on the idx of the nodes and include weights
    edges_list_with_weights = list(zip(test['idx_s'].dropna().astype(int), test['idx_t'].dropna().astype(int), test['w']))
    
    # Create graph
    g = gt.Graph(directed=True)
    g.add_vertex(num_nodes)
    
    # Node attributes
    node_id = g.new_vertex_property("string", vals=nodes_dict_filtered['id'])
    node_label = g.new_vertex_property("string", vals=nodes_dict_filtered['label'])
    node_affiliation = g.new_vertex_property("string", vals=nodes_dict_filtered['pa'])
    
    # Assign attributes to graph
    g.vertex_properties["ID"] = node_id
    g.vertex_properties["Label"] = node_label
    g.vertex_properties["Political Affiliation"] = node_affiliation
    
    # Edge weight attribute
    edge_weights = g.new_edge_property("int")
    
    # Add edges and assign weights
    for source, target, weight in edges_list_with_weights:
        e = g.add_edge(source, target)
        edge_weights[e] = weight

    g.edge_properties["weight"] = edge_weights
    
    g.save(os.path.join("../../../Data/Daily_graphs/Full network", str(d.date()) + ".graphml"))

  3%|▎         | 2/63 [00:07<03:56,  3.88s/it]

In [None]:
print(g)
print(g.vp["ID"][624353])
print(g.vp["Label"][624353])
print(g.vp["Political Affiliation"][624353])