# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [1]:
# Mathematical and Data Managment
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Graph Managment
import graph_tool.all as gt
from utils.Bojanowski import *

# Miscellaneous
import pickle
import os
from tqdm import tqdm
from glob import glob
from concurrent.futures import ProcessPoolExecutor
from time import perf_counter

# Paths
path = r"/mnt/disk2/Data"
path_3_day = os.path.join(path,"3_Day_Graphs")
path_daily = os.path.join(path,"Daily_Graphs")

pd.set_option("display.max_columns", None)



# Master

In [21]:
# Load Retweets
retweets = pd.read_pickle(os.path.join(path, "Tweets_DataFrames","retweets.gzip"), compression='gzip')

# Time not needed, only date
retweets["Date"] = pd.to_datetime(retweets["Date"], errors='coerce').dt.date

# Import master
master = pd.read_csv('/mnt/disk2/Data/Master.csv')

# Adding Source Labels
temp = retweets.merge(master, how = 'inner', left_on = 'Author ID', right_on='User ID', validate = 'm:1')
temp = temp.rename(columns = {'Label': 'Source Label', 'Political Affiliation': 'Source PA'}).drop(columns=['User ID'])

# Adding Target Labels
retweets = temp.merge(master, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='User ID', validate = 'm:1')
retweets = retweets.rename(columns = {'Label': 'Target Label', 'Political Affiliation': 'Target PA'})

retweets = retweets.drop(columns= ['User ID'])
print(f"Total de Retweets {retweets.shape[0]:,.0f}")

# # Save this edge list for future checkpoints
# retweets.to_pickle(os.path.join(path, "Tweets_DataFrames", "retweets_edge_list.gzip"), compression = "gzip")
retweets.head()

Total de Retweets 11,694,492


Unnamed: 0,Tweet ID,Author ID,Author Name,Referenced Tweet Author ID,Referenced Tweet Author Name,Referenced Tweet ID,Date,Source Label,Source PA,Target Label,Target PA
0,1.407536e+18,788250700.0,Laura_Milena98,270668814.0,gabocifuentes,1.407204e+18,2021-06-22,Laura_Milena98,Izquierda,gabocifuentes,Centro
1,1.393521e+18,126523600.0,sandrapulga,270668814.0,gabocifuentes,1.393368e+18,2021-05-15,sandrapulga,Izquierda,gabocifuentes,Centro
2,1.404837e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,1.404697e+18,2021-06-15,rchicave,Izquierda,gabocifuentes,Centro
3,1.402071e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,1.401913e+18,2021-06-07,rchicave,Izquierda,gabocifuentes,Centro
4,1.391067e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,1.39102e+18,2021-05-08,rchicave,Izquierda,gabocifuentes,Centro


In [8]:
# Count how many tweets and users we have now
users_rts = set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID']))
non_singletons = master[master['User ID'].isin(users_rts)] # Nodes conected by a Rt
singletons = master[~master['User ID'].isin(users_rts)] # Nodes that aren't in Rts during Paro (Perhaps only Twitted or Retwitted with replies) 

# Insights
print(f"Tenemos: {len(non_singletons):,} Usuarios con Rts o Retwiteados. Usuarios sin tweets {len(singletons)}")

# Save this Dataframe for Nodes List
master = non_singletons # Aislamos a los singletons del Master ID
master = (
    master.astype({
        'User ID': 'float64',
        'Label': str,
        'Political Affiliation': str
    })
    .reset_index(drop=True)
    .reset_index(names = 'ID')
)

# Reordenamos las columnas
master_id = master[['ID', 'Label', 'Political Affiliation','User ID']]

# Save
master_id.to_csv(os.path.join(path, 'Master_Index' + ".csv"), index = False, sep = ";")

# Insights using only retweets
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')
print("\n"+"*"*100+"\n")
print(f'primer nodo: {min(users_rts)} último nodo: {max(users_rts):,}')
print(f'Total de nodos en conectados: {len(users_rts):,}')
print(f'Total de nodos Singletons: {len(singletons):,}')
print(f'Total de usuarios que TWITTEARON Durante el paro: {len(singletons) + len(non_singletons):,}')

del nodes_no_receipt, nodes_no_send, self_loops, users_rts

master_id

Tenemos: 34,840 Usuarios con Rts o Retwiteados. Usuarios sin tweets 2497
Nodos Retweeteados por alguien pero que no Retweetean: 6,220
Nodos que Retweetean a alguien pero no son Retweeteados: 1,575
Nodos que tienen un self loop: 11

****************************************************************************************************

primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en conectados: 34,840
Total de nodos Singletons: 2,497
Total de usuarios que TWITTEARON Durante el paro: 37,337


Unnamed: 0,ID,Label,Political Affiliation,User ID
0,0,0,Sin Clasificar,1.200000e+01
1,1,alerios,Izquierda,1.299600e+04
2,2,ranaberden,Centro,7.779780e+05
3,3,jsanti,Izquierda,7.841250e+05
4,4,thisgoblin,Izquierda,1.061601e+06
...,...,...,...,...
34835,34835,Neoplasticista,Izquierda,1.389722e+18
34836,34836,JC13177979,Derecha,1.389737e+18
34837,34837,JhonatanVRojo,Sin Clasificar,1.389741e+18
34838,34838,VaneLen18,Izquierda,1.389769e+18


# CHECKPOINT: Cargar Master ID y retweets edge list

In [4]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Master_Index.csv', sep = ';')

# Open retweets edge list in any case
retweets = pd.read_pickle(os.path.join(path, "Tweets_DataFrames", "retweets_edge_list.gzip"), compression = "gzip")

# Useful Dicts Index to Var
idx_to_id = master_id['User ID'].reset_index().to_dict()['User ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Useful Dicts User ID to Var
id_to_pa = master_id.set_index('User ID')['Political Affiliation'].to_dict()
id_to_idx = master_id.set_index('User ID')['ID'].to_dict()
id_to_label = master_id.set_index('User ID')['Label'].to_dict()

# dict for color
color = {
    'Izquierda': 'blue',
    'Derecha': 'red',
    'Centro': 'green',
    'Sin Clasificar': 'gray'
}

# Retweet network with a 3-day rolling window

In [3]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-05-01 23:59:59'
v2_end = '2021-06-30 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

# date_start = date_start[7:8]
# date_end = date_end[7:8]

### Get Original Tweets

In [11]:
original = pd.read_pickle(os.path.join(path,"Tweets_DataFrames","original.gzip"), compression='gzip')

# Get the amount of tweets for each day
def create_tweets_per_day(date_tuple):
    d_s, d_e = date_tuple
    window = original[(original['Date'] >= d_s.date()) & (original['Date'] < d_e.date())]
    
    # CAlculate number of tweets
    tweets_per_user = window.groupby('Author ID').size().reset_index(name = 'Original Tweets')
    tweets_per_user.rename(columns = {'Author ID': 'User ID'}, inplace=True)
    
    # Merge with master Index to get the tweets of relevant people
    tweets_per_user = tweets_per_user.merge(master_id, on='User ID', how = 'right')
    
    # Filling na as 0 (No original tweet registerd)
    tweets_per_user.fillna(0, inplace=True)
    tweets_per_user.set_index('User ID', inplace = True)
    
    id_to_tweets = tweets_per_user.to_dict()['Original Tweets']
    filename = os.path.join(path_3_day, 'Tweets_Per_Day', f'starting_{str(d_s.date())}' + ".pkl")
    
    with open(filename, 'wb') as file:
        pickle.dump(id_to_tweets,file)
    

def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_tweets_per_day, dates), total = len(dates), desc="Guardar Original Tweets: "))
    return futures
if __name__ == '__main__':
    tic = perf_counter()
    main()
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

Guardar Original Tweets: 100%|██████████| 61/61 [00:04<00:00, 12.24it/s]


Finish whole cell in 0.0 minutes and 7 secs.


### Create Source-Target DataFrame

In [16]:
def create_source_target(date_tuple):
    d_s, d_e = date_tuple
    # Get 3 days Retweets
    window_rts = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] < d_e.date())]
    
    # Get total of Tweets during that period
    rts_per_user= window_rts.groupby('Author ID').size().reset_index(name = 'total')
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet ID to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet ID']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)

    # Create Normalize Weight
    temp = window_rts.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number_of_rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal_weight'] = temp['number_of_rts']/temp['total']
    temp['normal_weight'] = temp['normal_weight']
    temp.drop(columns='total', inplace=True)
    temp.rename(columns = {'Author ID': "source_user_id", "Referenced Tweet Author ID": "target_user_id"}, inplace=True)
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: int(id_to_idx[x]))
    temp['Target'] = temp['target_user_id'].apply(lambda x: int(id_to_idx[x]))
    
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: id_to_pa[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: id_to_pa[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'number_of_rts': int,
        'normal_weight': float,
        'source_user_id': 'float64',
        'target_user_id': 'float64'
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    # Save results as csv
    temp.to_csv(os.path.join(path_3_day, "Source_Target", 'starting_' + str(d_s.date()) + ".csv"), index = False, sep = ";")
    
def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_source_target, dates), total = len(dates), desc = "Guardando Source-Target files: "))
    return futures

if __name__ == '__main__':
    tic = perf_counter()
    main()
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

Guardando Source-Target files: 100%|██████████| 61/61 [00:27<00:00,  2.23it/s]


Finish whole cell in 0.0 minutes and 30 secs.


### Create Networks

In [15]:
# Example of our lists
ej_csv = pd.read_csv(path_3_day + f'/Source_Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(set(ej_csv['Source'])):,}. Nodos Target {len(set(ej_csv['Target'])):,}. Total de Nodos {len(nodes):,}")
ej_csv.head(10)

 Numero de Arcos: 613,074. Nodos Source: 28,792. Nodos Target 13,979. Total de Nodos 30,443


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
0,1,191,12996.0,17813487.0,1,0.029412,Izquierda,Sin Clasificar,alerios,NoticiasCaracol
1,1,365,12996.0,25185308.0,1,0.029412,Izquierda,Centro,alerios,sergio_fajardo
2,1,970,12996.0,49681553.0,2,0.058824,Izquierda,Izquierda,alerios,CamiloRomero
3,1,1543,12996.0,59976153.0,1,0.029412,Izquierda,Izquierda,alerios,Nataliaescribe
4,1,1936,12996.0,66711542.0,1,0.029412,Izquierda,Centro,alerios,CristoBustos
5,1,2185,12996.0,70594101.0,1,0.029412,Izquierda,Izquierda,alerios,josegreghg
6,1,2494,12996.0,76664119.0,1,0.029412,Izquierda,Sin Clasificar,alerios,vanguardiacom
7,1,3855,12996.0,106131505.0,1,0.029412,Izquierda,Izquierda,alerios,LaCaballero
8,1,4952,12996.0,127657219.0,1,0.029412,Izquierda,Izquierda,alerios,AlfonsoOspina
9,1,5526,12996.0,134855279.0,4,0.117647,Izquierda,Centro,alerios,DanielSamperO


In [6]:
def create_graph(file_tuple):
    file1, file2 = file_tuple
    starting_date_str = file1.split('.')[-2].split('/')[-1].split('_')[-1]
    starting_date = datetime.strptime(starting_date_str, '%Y-%m-%d')
    ending_date = starting_date + timedelta(days=3)
    ending_date_str = ending_date.strftime('%Y-%m-%d')
    
    csv = pd.read_csv(file1, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weight'] = normal_weight
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')

    # Create a Vertex property maps
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('double')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    vertex_tweets_map = g.new_vertex_property('int64_t')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map
    
    with open(file2, "rb") as file:
        tweets_per_day = pickle.load(file)

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v_index in g.iter_vertices():
        v = g.vertex(v_index)
        # Add tweets information of users
        vertex_tweets_map[v] = tweets_per_day[idx_to_id[v]]
        
        # Add Master Index Information
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
        # Add categories Information
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        
        # Add isolated
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    g.vp['Tweets'] = vertex_tweets_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_starting_date = g.new_graph_property('string')
    graph_ending_date = g.new_graph_property('string')
    graph_starting_date[g] = starting_date_str
    graph_ending_date[g] = ending_date_str
    g.gp['Starting Date'] = graph_starting_date
    g.gp['Ending Date'] = graph_ending_date

    # Save Graphs
    filename = os.path.join(path_3_day, 'Graphs' ,'starting_' + starting_date_str + ".graphml")
    g.save(filename)


def main():
    source_target_files = glob(os.path.join(path_3_day,"Source_Target", "starting_*.csv"))
    tweets_files = glob(os.path.join(path_3_day,"Tweets_Per_Day", "starting_*.pkl"))
    source_target_files.sort()
    tweets_files.sort()
    files = list(zip(source_target_files, tweets_files))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_graph, files), total = len(files), desc="Guardando Objetos de Grafos: "))
    return futures

if __name__ == '__main__':
    tic = perf_counter()
    main()
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

Guardando Objetos de Grafos: 100%|██████████| 61/61 [00:26<00:00,  2.26it/s]

Finish whole cell in 0.0 minutes and 27 secs.





In [7]:
print('Edge List')
ej_csv = pd.read_csv(path_3_day + f'/Source_Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(path_3_day + f'/Graphs/starting_2021-05-04.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

Edge List
Non Isolate Vertices 30,443 and 613,074 edges
Graph
Non Isolate Vertices 30,443 and 613,074 edges


In [8]:
# Revisamos los VertexPropertyMaps
dict(ej_g.vp)

{'Centro': <VertexPropertyMap object with value type 'bool', for Graph 0x7f4ab4b736d0, at 0x7f4b48be20d0>,
 'Color': <VertexPropertyMap object with value type 'string', for Graph 0x7f4ab4b736d0, at 0x7f4ab4a2be90>,
 'Derecha': <VertexPropertyMap object with value type 'bool', for Graph 0x7f4ab4b736d0, at 0x7f4b48aece90>,
 'Isolate': <VertexPropertyMap object with value type 'bool', for Graph 0x7f4ab4b736d0, at 0x7f4b48973750>,
 'Izquierda': <VertexPropertyMap object with value type 'bool', for Graph 0x7f4ab4b736d0, at 0x7f4b489fa010>,
 'Label': <VertexPropertyMap object with value type 'string', for Graph 0x7f4ab4b736d0, at 0x7f4b489f9d10>,
 'Political Label': <VertexPropertyMap object with value type 'string', for Graph 0x7f4ab4b736d0, at 0x7f4b488822d0>,
 'Sin Clasificar': <VertexPropertyMap object with value type 'bool', for Graph 0x7f4ab4b736d0, at 0x7f4b48882190>,
 'Tweets': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7f4ab4b736d0, at 0x7f4b59d36a50>,
 'User I

In [9]:
# Revisamos los EdgePropertyMaps
dict(ej_g.ep)

{'Normal Weight': <EdgePropertyMap object with value type 'double', for Graph 0x7f4ab4b736d0, at 0x7f4b48cd0d10>,
 'Number of rts': <EdgePropertyMap object with value type 'int32_t', for Graph 0x7f4ab4b736d0, at 0x7f4b48cd0cd0>}

In [10]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23631
1920140406.0
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23631
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                   1920140406.0
Name: 23631, dtype: object

In [18]:
# Checking properties of Vertex
test_id_source = 37698374
test_id_target = 1146346814

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]
test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weight'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (584, 21864)
Edge In graph: (584, 21864)
Normal Weights 0.0052631578947368
Number of rts 1
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
10283,584,21864,37698374.0,1146347000.0,1,0.005263,Derecha,Derecha,jairocor,DefensoyCFM


# CHECKPOINT: Cargar Master ID y retweets edge list

In [18]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Master_Index.csv', sep = ';')

# Open retweets edge list in any case
retweets = pd.read_pickle(os.path.join(path, "Tweets_DataFrames", "retweets_edge_list.gzip"), compression = "gzip")

# Useful Dicts Index to Var
idx_to_id = master_id['User ID'].reset_index().to_dict()['User ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Useful Dicts User ID to Var
id_to_pa = master_id.set_index('User ID')['Political Affiliation'].to_dict()
id_to_idx = master_id.set_index('User ID')['ID'].to_dict()
id_to_label = master_id.set_index('User ID')['Label'].to_dict()

# dict for color
color = {
    'Izquierda': 'blue',
    'Derecha': 'red',
    'Centro': 'green',
    'Sin Clasificar': 'gray'
}

# Daily retweet network (with no rolling window)

In [19]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have retweets from 2021-04-28 to 2021-06-29


### Get original Tweets

In [16]:
original = pd.read_pickle(os.path.join(path,"Tweets_DataFrames","original.gzip"), compression='gzip')

# Get the amount of tweets for each day
def create_tweets_per_day(date):
    window = original[original['Date'] == date.date()]
    
    # CAlculate number of tweets
    tweets_per_user = window.groupby('Author ID').size().reset_index(name = 'Original Tweets')
    tweets_per_user.rename(columns = {'Author ID': 'User ID'}, inplace=True)
    
    # Merge with master Index to get the tweets of relevant people
    tweets_per_user = tweets_per_user.merge(master_id, on='User ID', how = 'right')
    
    # Filling na as 0 (No original tweet registerd)
    tweets_per_user.fillna(0, inplace=True)
    tweets_per_user.set_index('User ID', inplace = True)
    
    id_to_tweets = tweets_per_user.to_dict()['Original Tweets']
    filename = os.path.join(path_daily, 'Tweets_Per_Day', f'starting_{str(date.date())}' + ".pkl")
    
    with open(filename, 'wb') as file:
        pickle.dump(id_to_tweets,file)
    

def main(daily_grid):
    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_tweets_per_day, daily_grid), total = len(daily_grid), desc= "Guardando Tweets Originales: "))
    return futures

if __name__ == '__main__':
    tic = perf_counter()
    main(daily_grid)
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

Guardando Tweets Originales: 100%|██████████| 63/63 [00:03<00:00, 17.34it/s]


Finish whole cell in 0.0 minutes and 5 secs.


### Create Source-Target DataFrame

In [21]:
def create_source_target(date):
    window_rts = retweets[retweets['Date'] == date.date()]
    
    # Get total of Tweets during that period
    rts_per_user= window_rts.groupby('Author ID').size().reset_index(name = 'total')
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet ID']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)

    # Create Normalize Weight
    temp = window_rts.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number_of_rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal_weight'] = temp['number_of_rts']/temp['total']
    temp['normal_weight'] = temp['normal_weight']
    temp.drop(columns='total', inplace=True)
    temp.rename(columns = {'Author ID': "source_user_id", "Referenced Tweet Author ID": "target_user_id"}, inplace=True)
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: int(id_to_idx[x]))
    temp['Target'] = temp['target_user_id'].apply(lambda x: int(id_to_idx[x]))
    
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: id_to_pa[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: id_to_pa[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'number_of_rts': int,
        'normal_weight': float,
        'source_user_id': 'float64',
        'target_user_id': 'float64'
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    # Save results as csv
    temp.to_csv(os.path.join(path_daily, "Source-Target", 'starting_' + str(date.date()) + ".csv"), index = False, sep = ";")
    
def main(daily_grid):
    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_source_target, daily_grid), total = len(daily_grid),desc="Guardando Source-Target files: "))
    return futures

if __name__ == '__main__':
    tic = perf_counter()
    main(daily_grid)
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

Guardando Source-Target files: 100%|██████████| 63/63 [00:09<00:00,  6.86it/s]


Finish whole cell in 0.0 minutes and 10 secs.


### Create networks

In [23]:
def create_graph(file_tuple):
    file1, file2 = file_tuple
    date_str = file1.split('.')[-2].split('/')[-1].split('_')[-1]
    
    csv = pd.read_csv(file1, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weight'] = normal_weight

    # Create a Vertex property maps
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('double')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    vertex_tweets_map = g.new_vertex_property('int64_t')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map
    
    with open(file2, "rb") as file:
        tweets_per_day = pickle.load(file)

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v_index in g.iter_vertices():
        v = g.vertex(v_index)
        # Add tweets information of users
        vertex_tweets_map[v] = tweets_per_day[idx_to_id[v]]
        
        # Add Master Index Information
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
        # Add categories Information
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        
        # Add isolated
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    g.vp['Tweets'] = vertex_tweets_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_date_map = g.new_graph_property('string')
    graph_date_map[g] = date_str
    g.gp['Date'] = graph_date_map

    # Save Graphs
    filename = os.path.join(path_daily, 'Graphs' ,'starting_' + date_str + ".graphml")
    g.save(filename)


def main():
    source_target_files = glob(os.path.join(path_daily,"Source-Target","starting_*.csv"))
    tweets_files = glob(os.path.join(path_daily,"Tweets_Per_Day","starting_*.pkl"))
    source_target_files.sort()
    tweets_files.sort()
    files = list(zip(source_target_files, tweets_files))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_graph, files), total = len(files)))
    return futures

if __name__ == '__main__':
    tic = perf_counter()
    main()
    toc = perf_counter()
    time = toc-tic
    
    print(f"Finish whole cell in {time//60} minutes and {time%60:,.0f} secs.")

100%|██████████| 63/63 [00:13<00:00,  4.55it/s]

Finish whole cell in 0.0 minutes and 15 secs.





In [24]:
print('Edge List')
ej_csv = pd.read_csv(path_daily + f'/Source-Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(path_daily + f'/Graphs/starting_2021-05-04.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

Edge List
Non Isolate Vertices 25,960 and 278,382 edges
Graph
Non Isolate Vertices 25,960 and 278,382 edges


In [25]:
# Revisamos los VertexPropertyMaps
dict(ej_g.vp)

{'Centro': <VertexPropertyMap object with value type 'bool', for Graph 0x7fd27fa1dc10, at 0x7fd1ba23fd90>,
 'Color': <VertexPropertyMap object with value type 'string', for Graph 0x7fd27fa1dc10, at 0x7fd234b743d0>,
 'Derecha': <VertexPropertyMap object with value type 'bool', for Graph 0x7fd27fa1dc10, at 0x7fd1be2632d0>,
 'Isolate': <VertexPropertyMap object with value type 'bool', for Graph 0x7fd27fa1dc10, at 0x7fd2346f1210>,
 'Izquierda': <VertexPropertyMap object with value type 'bool', for Graph 0x7fd27fa1dc10, at 0x7fd265b06850>,
 'Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fd27fa1dc10, at 0x7fd265b06990>,
 'Political Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fd27fa1dc10, at 0x7fd265a43b10>,
 'Sin Clasificar': <VertexPropertyMap object with value type 'bool', for Graph 0x7fd27fa1dc10, at 0x7fd265a43c50>,
 'Tweets': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7fd27fa1dc10, at 0x7fd265a43d10>,
 'User I

In [32]:
# Revisamos los EdgePropertyMaps
dict(ej_g.ep)

{'Normal Weight': <EdgePropertyMap object with value type 'double', for Graph 0x7fd27fa1dc10, at 0x7fd265a43e90>,
 'Number of rts': <EdgePropertyMap object with value type 'int32_t', for Graph 0x7fd27fa1dc10, at 0x7fd265a43f50>}

In [35]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23631
1920140406.0
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23631
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                   1920140406.0
Name: 23631, dtype: object

In [37]:
# Checking properties of Vertex
test_id_source = 37698374
test_id_target = 1146346814

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]
test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weight'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (584, 21864)
Edge In graph: (584, 21864)
Normal Weights 0.0133333333333333
Number of rts 1
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
4502,584,21864,37698374.0,1146347000.0,1,0.013333,Derecha,Derecha,jairocor,DefensoyCFM


# No me acuerdo pa que era esto

In [39]:
graphs = glob(os.path.join(path_daily, 'Graphs','*.graphml'))

rts_node_day = pd.DataFrame(columns=daily_grid, index=range(len(master_id)))
tweets_node_day = pd.DataFrame(columns=daily_grid, index=range(len(master_id)))
for graph in tqdm(graphs):
    g = gt.load_graph(graph)
    date = g.gp['Date']
    
    for v in g.vertices():
        row = int(v)
        number_of_rts = sum(g.ep['Number of rts'][e] for e in v.out_edges())
        tweets = g.vp['Tweets'][v]
        
        rts_node_day.loc[row,date] = number_of_rts
        tweets_node_day.loc[row,date] = tweets

rts_node_day.to_pickle(os.path.join(path,"Pickle","rts_node_day.pkl"))
tweets_node_day.to_pickle(os.path.join(path,"Pickle","tweets_node_day.pkl"))

100%|██████████| 63/63 [06:57<00:00,  6.62s/it]
