# Retweets Networks

In this notebook, we create a daily retweet network. First, we transform our dataset into a Source-Target DataFrame. Afterward, we construct a `graph-tool` object.

We have 45,330,718 Tweets from 37330 from which 30,918,011 are ReTweets

In [1]:
# Mathematical and Data Managment
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Graph Managment
import graph_tool.all as gt
from utils.Bojanowski import *

# Miscellaneous
import pickle
import os
from tqdm import tqdm
from glob import glob
from concurrent.futures import ProcessPoolExecutor



In [2]:
save_path = "../../../Data/3_Day_Graphs"

In [3]:
# Load Retweets
retweets = pd.read_pickle('../../../Data/Tweets_DataFrames/retweets_lite.gzip', compression='gzip')

retweets = retweets.astype({
    'Author ID': float,
    'Referenced Tweet Author ID': float
})
retweets["Date"] = pd.to_datetime(retweets["Date"], errors='coerce').dt.date

In [4]:
# Import political labelling dictionary
user_to_party_paro = pd.read_pickle("../../../Data/Pickle/user_to_party_paro.pkl")

ids_faltantes1 = set(retweets["Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes2 = set(retweets["Referenced Tweet Author ID"]) - set(user_to_party_paro.keys())
ids_faltantes = np.concatenate((list(ids_faltantes1), list(ids_faltantes2)))
print(f"Faltan clasificar {len(np.unique(ids_faltantes)):,} usuarios")

# Los dejamos como inclasificados 
for usuario in ids_faltantes:
    user_to_party_paro[usuario] = np.nan

affilliation_df = pd.DataFrame(list(user_to_party_paro.items()), columns=['User ID', 'Political Affiliation'])
affilliation_df['Political Affiliation'].value_counts(dropna=False)

Faltan clasificar 587,246 usuarios


Political Affiliation
NaN               587246
Izquierda          23138
Derecha             6812
Sin Clasificar      3844
Centro              3543
Name: count, dtype: int64

In [5]:
# Create Master Id
#del affilliation_df,ids_faltantes, ids_faltantes1, ids_faltantes2, usuario
users_information = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/users_information.gzip', compression='gzip')
master_id = users_information.reset_index()[['Author ID', 'Author Name']]
master_id = master_id.rename(columns={
    'Author ID': 'User ID',
    'Author Name': 'Label'
})
del users_information
master_id['Political Affiliation'] = master_id['User ID'].apply(lambda x: user_to_party_paro[x])
print(f"Duplicated IDs: {sum(master_id.duplicated(subset = 'User ID'))}")
master_id = master_id.drop_duplicates(subset = 'User ID')
master_id

Duplicated IDs: 7


Unnamed: 0,User ID,Label,Political Affiliation
0,0.000000e+00,0,Sin Clasificar
1,1.000000e+00,0,Sin Clasificar
2,2.000000e+00,0,Sin Clasificar
3,3.000000e+00,0,Sin Clasificar
5,4.000000e+00,0,Sin Clasificar
...,...,...,...
37339,1.389722e+18,Neoplasticista,Izquierda
37340,1.389737e+18,JC13177979,Derecha
37341,1.389741e+18,JhonatanVRojo,Sin Clasificar
37342,1.389769e+18,VaneLen18,Izquierda


In [6]:
# Adding Source Labels
temp = retweets.merge(master_id, how = 'inner', left_on = 'Author ID', right_on='User ID', validate = 'm:1')
temp = temp.rename(columns = {'Label': 'Source Label', 'Political Affiliation': 'Source PA'}).drop(columns=['User ID'])

# Adding Target Labels
retweets = temp.merge(master_id, how = 'inner', left_on = 'Referenced Tweet Author ID', right_on='User ID', validate = 'm:1')
retweets = retweets.rename(columns = {'Label': 'Target Label', 'Political Affiliation': 'Target PA'})

retweets = retweets.drop(columns= ['User ID'])
print(retweets.shape)
retweets.head()

(13784608, 11)


Unnamed: 0,Tweet ID,Author ID,Author Name,Referenced Tweet Author ID,Referenced Tweet Author Name,Date,Referenced Tweet,Source Label,Source PA,Target Label,Target PA
0,1.407536e+18,788250700.0,Laura_Milena98,270668814.0,gabocifuentes,2021-06-22,1.407204e+18,Laura_Milena98,Izquierda,gabocifuentes,Centro
1,1.393521e+18,126523600.0,sandrapulga,270668814.0,gabocifuentes,2021-05-15,1.393368e+18,sandrapulga,Izquierda,gabocifuentes,Centro
2,1.404837e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,2021-06-15,1.404697e+18,rchicave,Izquierda,gabocifuentes,Centro
3,1.402071e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,2021-06-07,1.401913e+18,rchicave,Izquierda,gabocifuentes,Centro
4,1.391067e+18,8.996701e+17,rchicave,270668814.0,gabocifuentes,2021-05-08,1.39102e+18,rchicave,Izquierda,gabocifuentes,Centro


In [7]:
# Count how many tweets and users we have now
users_rts = set(retweets['Author ID']).union(set(retweets['Referenced Tweet Author ID']))
non_singletons = master_id[master_id['User ID'].isin(users_rts)] # Nodes conected by a Rt
singletons = master_id[~master_id['User ID'].isin(users_rts)] # Nodes that aren't in Rts during Paro (Perhaps only Twitted or Retwitted with replies) 

# Insights
print(f"Tenemos: {len(non_singletons):,} Usuarios con Rts o Retwiteados. Usuarios sin tweets {len(singletons)}")
# Save this Dataframe for Nodes List
master_id = non_singletons # Aislamos a los singletons del Master ID
master_id = master_id.astype({
    'User ID': int,
    'Label': str,
    'Political Affiliation': str
}).reset_index(drop=True)

master_id = master_id.reset_index().rename(columns = {'index': 'ID'})
id_to_label = master_id.set_index('User ID').to_dict()['Label']

master_id = master_id[['ID', 'Label', 'Political Affiliation', 'User ID']]

# Save
master_id.to_csv(os.path.join(save_path, 'Master_Index' + ".csv"), index = False, sep = ";")
master_id

Tenemos: 36,964 Usuarios con Rts o Retwiteados. Usuarios sin tweets 373


Unnamed: 0,ID,Label,Political Affiliation,User ID
0,0,0,Sin Clasificar,12
1,1,alerios,Izquierda,12996
2,2,ranaberden,Centro,777978
3,3,jsanti,Izquierda,784125
4,4,thisgoblin,Izquierda,1061601
...,...,...,...,...
36959,36959,Neoplasticista,Izquierda,1389721694961651712
36960,36960,JC13177979,Derecha,1389737202742071296
36961,36961,JhonatanVRojo,Sin Clasificar,1389741234370064384
36962,36962,VaneLen18,Izquierda,1389769251704147968


In [8]:
# Open Master Index in Any case
master_id = pd.read_csv('/mnt/disk2/Data/3_Day_Graphs/Master_Index.csv', sep = ';')

# Useful Dicts
idx_to_id = master_id['User ID'].reset_index().to_dict()['User ID']
idx_to_pa = master_id['Political Affiliation'].reset_index().to_dict()['Political Affiliation']
idx_to_label = master_id['Label'].reset_index().to_dict()['Label']

# Dict for lookup in Graph
id_to_idx = {id:idx for idx, id in idx_to_id.items()}

# dict for color
color = {
    'Izquierda': 'blue',
    'Derecha': 'red',
    'Centro': 'green',
    'Sin Clasificar': 'gray'
}

In [9]:
# Insights using only retweets
nodes_no_receipt = set(retweets["Author ID"]) -  set(retweets["Referenced Tweet Author ID"])
nodes_no_send = set(retweets["Referenced Tweet Author ID"]) - set(retweets["Author ID"])
self_loops = set(retweets[retweets['Author ID'] == retweets['Referenced Tweet Author ID']])

print(f'Nodos Retweeteados por alguien pero que no Retweetean: {len(nodes_no_receipt):,}')
print(f'Nodos que Retweetean a alguien pero no son Retweeteados: {len(nodes_no_send):,}')
print(f'Nodos que tienen un self loop: {len(self_loops)}')

print(f'primer nodo: {min(users_rts)} último nodo: {max(users_rts):,}')
print(f'Total de nodos en conectados: {len(users_rts):,}')
print(f'Total de nodos Singletons: {len(singletons):,}')
print(f'Total de usuarios que TWITTEARON Durante el paro: {len(singletons) + len(non_singletons):,}')

del nodes_no_receipt, nodes_no_send, self_loops, users_rts

Nodos Retweeteados por alguien pero que no Retweetean: 6,510
Nodos que Retweetean a alguien pero no son Retweeteados: 833
Nodos que tienen un self loop: 11
primer nodo: 12.0 último nodo: 1.3897841454176788e+18
Total de nodos en conectados: 36,964
Total de nodos Singletons: 373
Total de usuarios que TWITTEARON Durante el paro: 37,337


## Retweet network with a 3-day rolling window

In [10]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date_start = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

del v1_end, v1_start

v2_start = '2021-05-01 23:59:59'
v2_end = '2021-06-30 23:59:59'
date_end = pd.date_range(start = v2_start, end = v2_end, freq = 'D')

del v2_start, v2_end

# date_start = date_start[7:8]
# date_end = date_end[7:8]

### Get Original Tweets

In [16]:
original = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/original.gzip', compression='gzip')

# Get the amount of tweets for each day
def create_tweets_per_day(date_tuple):
    d_s, d_e = date_tuple
    window = original[(original['Date'] >= d_s.date()) & (original['Date'] < d_e.date())]
    
    # CAlculate number of tweets
    tweets_per_user = window.groupby('Author ID').size().reset_index(name = 'Original Tweets')
    tweets_per_user.rename(columns = {'Author ID': 'User ID'}, inplace=True)
    
    # Merge with master Index to get the tweets of relevant people
    tweets_per_user = tweets_per_user.merge(master_id, on='User ID', how = 'right')
    
    # Filling na as 0 (No original tweet registerd)
    tweets_per_user.fillna(0, inplace=True)
    tweets_per_user.set_index('User ID', inplace = True)
    
    id_to_tweets = tweets_per_user.to_dict()['Original Tweets']
    filename = os.path.join(save_path, 'Tweets_Per_Day', f'starting_{str(d_s.date())}' + ".pkl")
    
    with open(filename, 'wb') as file:
        pickle.dump(id_to_tweets,file)
    

def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_tweets_per_day, dates), total = len(dates)))
    return futures
if __name__ == '__main__':
    main()

100%|██████████| 61/61 [00:04<00:00, 12.85it/s]


### Create Source-Target DataFrame

In [17]:
def create_source_target(date_tuple):
    d_s, d_e = date_tuple
    # Get 3 days Retweets
    window_rts = retweets[(retweets['Date'] >= d_s.date()) & (retweets['Date'] < d_e.date())]
    
    # Get total of Tweets during that period
    rts_per_user= window_rts.groupby('Author ID').size().reset_index(name = 'total')
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)

    # Create Normalize Weight
    temp = window_rts.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number_of_rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal_weight'] = temp['number_of_rts']/temp['total']
    temp['normal_weight'] = temp['normal_weight']
    temp.drop(columns='total', inplace=True)
    temp.rename(columns = {'Author ID': "source_user_id", "Referenced Tweet Author ID": "target_user_id"}, inplace=True)
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: int(id_to_idx[x]))
    temp['Target'] = temp['target_user_id'].apply(lambda x: int(id_to_idx[x]))
    
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'number_of_rts': int,
        'normal_weight': float,
        'source_user_id': int,
        'target_user_id': int
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source_Target", 'starting_' + str(d_s.date()) + ".csv"), index = False, sep = ";")
    
def main():
    dates = list(zip(date_start, date_end))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_source_target, dates), total = len(dates)))
    return futures

if __name__ == '__main__':
    main()

100%|██████████| 61/61 [00:31<00:00,  1.96it/s]


### Create Networks

In [18]:
# Example of our lists
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-06-16.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f" Numero de Arcos: {ej_csv.shape[0]:,}. Nodos Source: {len(set(ej_csv['Source'])):,}. Nodos Target {len(set(ej_csv['Target'])):,}. Total de Nodos {len(nodes):,}")
ej_csv.head(10)

 Numero de Arcos: 234,657. Nodos Source: 24,254. Nodos Target 8,981. Total de Nodos 26,015


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
0,1,1140,12996,53855557,1,0.027778,Izquierda,Centro,alerios,CathyJuvinao
1,1,1712,12996,62528273,1,0.027778,Izquierda,Centro,alerios,AntonioSanguino
2,1,1939,12996,66740100,1,0.027778,Izquierda,Sin Clasificar,alerios,ZuluagaCamila
3,1,3504,12996,98781946,2,0.055556,Izquierda,Izquierda,alerios,IvanCepedaCast
4,1,3782,12996,104622520,1,0.027778,Izquierda,Izquierda,alerios,valentinabz
5,1,6170,12996,142448269,1,0.027778,Izquierda,Izquierda,alerios,maryluzherran
6,1,7882,12996,165748292,1,0.027778,Izquierda,Centro,alerios,angelamrobledo
7,1,9727,12996,201256928,1,0.027778,Izquierda,Izquierda,alerios,marthaperaltae
8,1,10570,12996,221466950,4,0.111111,Izquierda,Izquierda,alerios,wilsonariasc
9,1,11057,12996,233001872,1,0.027778,Izquierda,Izquierda,alerios,jorgerojas2022


In [19]:
def create_graph(file_tuple):
    file1, file2 = file_tuple
    starting_date_str = file1.split('.')[-2].split('/')[-1].split('_')[-1]
    starting_date = datetime.strptime(starting_date_str, '%Y-%m-%d')
    ending_date = starting_date + timedelta(days=3)
    ending_date_str = ending_date.strftime('%Y-%m-%d')
    
    csv = pd.read_csv(file1, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weight'] = normal_weight
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')

    # Create a Vertex property maps
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('int64_t')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    vertex_tweets_map = g.new_vertex_property('int64_t')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map
    
    with open(file2, "rb") as file:
        tweets_per_day = pickle.load(file)

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v_index in g.iter_vertices():
        v = g.vertex(v_index)
        # Add tweets information of users
        vertex_tweets_map[v] = tweets_per_day[idx_to_id[v]]
        
        # Add Master Index Information
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
        # Add categories Information
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        
        # Add isolated
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    g.vp['Tweets'] = vertex_tweets_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_starting_date = g.new_graph_property('string')
    graph_ending_date = g.new_graph_property('string')
    graph_starting_date[g] = starting_date_str
    graph_ending_date[g] = ending_date_str
    g.gp['Starting Date'] = graph_starting_date
    g.gp['Ending Date'] = graph_ending_date

    # Save Graphs
    filename = os.path.join(save_path, 'Graphs' ,'starting_' + starting_date_str + ".graphml")
    g.save(filename)


def main():
    source_target_files = glob('../../../Data/3_Day_Graphs/Source_Target/starting_*.csv')
    tweets_files = glob('../../../Data/3_Day_Graphs/Tweets_Per_Day/starting_*.pkl')
    source_target_files.sort()
    tweets_files.sort()
    files = list(zip(source_target_files, tweets_files))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_graph, files), total = len(files)))
    return futures

if __name__ == '__main__':
    main()

100%|██████████| 61/61 [00:30<00:00,  1.98it/s]


In [20]:
print('Edge List')
ej_csv = pd.read_csv(save_path + f'/Source_Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(save_path + f'/Graphs/starting_2021-05-04.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

Edge List
Non Isolate Vertices 32,886 and 718,230 edges
Graph
Non Isolate Vertices 32,886 and 718,230 edges


In [21]:
dict(ej_g.vp)

{'Centro': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87b17c0d0, at 0x7fc927682310>,
 'Color': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87b17c0d0, at 0x7fc8890a5e90>,
 'Derecha': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87b17c0d0, at 0x7fc703b9fc50>,
 'Isolate': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87b17c0d0, at 0x7fc703b9fd10>,
 'Izquierda': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87b17c0d0, at 0x7fc703e82850>,
 'Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87b17c0d0, at 0x7fc7038aba10>,
 'Political Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87b17c0d0, at 0x7fc7038abcd0>,
 'Sin Clasificar': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87b17c0d0, at 0x7fc6fc31dad0>,
 'Tweets': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7fc87b17c0d0, at 0x7fc6fc31e9d0>,
 'User I

In [22]:
dict(ej_g.ep)

{'Normal Weight': <EdgePropertyMap object with value type 'double', for Graph 0x7fc87b17c0d0, at 0x7fc6fc31f550>,
 'Number of rts': <EdgePropertyMap object with value type 'int32_t', for Graph 0x7fc87b17c0d0, at 0x7fc6fc31e010>}

In [23]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23639
1920140406
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23639
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                     1920140406
Name: 23639, dtype: object

In [24]:
# Checking properties of Vertex
test_id_source = 1389784145417678848
test_id_target = 1010113508233699328

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]
test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weight'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (36963, 31708)
Edge In graph: (36963, 31708)
Normal Weights 0.088235294117647
Number of rts 3
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
718229,36963,31708,1389784145417678848,1010113508233699328,3,0.088235,Izquierda,Izquierda,kars0518,LevyRincon


## Daily retweet network (with no rolling window)

In [25]:
save_path = "../../../Data/Daily_graphs"

In [26]:
fecha_min = retweets["Date"].min()
fecha_max = retweets["Date"].max()
print(f"We have retweets from {fecha_min.strftime('%Y-%m-%d')} to {fecha_max.strftime('%Y-%m-%d')}")
daily_grid = pd.date_range(start = fecha_min, end = fecha_max, freq = 'D')

We have retweets from 2021-04-28 to 2021-06-29


### Get original Tweets

In [38]:
original = pd.read_pickle('/mnt/disk2/Data/Tweets_DataFrames/original.gzip', compression='gzip')

# Get the amount of tweets for each day
def create_tweets_per_day(date):
    window = original[original['Date'] == date.date()]
    
    # CAlculate number of tweets
    tweets_per_user = window.groupby('Author ID').size().reset_index(name = 'Original Tweets')
    tweets_per_user.rename(columns = {'Author ID': 'User ID'}, inplace=True)
    
    # Merge with master Index to get the tweets of relevant people
    tweets_per_user = tweets_per_user.merge(master_id, on='User ID', how = 'right')
    
    # Filling na as 0 (No original tweet registerd)
    tweets_per_user.fillna(0, inplace=True)
    tweets_per_user.set_index('User ID', inplace = True)
    
    id_to_tweets = tweets_per_user.to_dict()['Original Tweets']
    filename = os.path.join(save_path, 'Tweets_Per_Day', f'starting_{str(date.date())}' + ".pkl")
    
    with open(filename, 'wb') as file:
        pickle.dump(id_to_tweets,file)
    

def main(daily_grid):
    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_tweets_per_day, daily_grid), total = len(daily_grid)))
    return futures

if __name__ == '__main__':
    main(daily_grid)

100%|██████████| 63/63 [00:03<00:00, 18.00it/s]


### Create Source-Target DataFrame

In [45]:
def create_source_target(date):
    window_rts = retweets[retweets['Date'] == date.date()]
    
    # Get total of Tweets during that period
    rts_per_user= window_rts.groupby('Author ID').size().reset_index(name = 'total')
    
    # Get Tweets that were retweeted (Grouping by Referenced Tweet to avoid duplicating (Counting al retweets and not the original retweeted tweet))
    tweets_og_per_user = window_rts.groupby(['Referenced Tweet Author ID', 'Referenced Tweet']).agg('size').reset_index()
    tweets_og_per_user = tweets_og_per_user.groupby('Referenced Tweet Author ID').size()
    tweets_og_per_user.index.astype(int)

    # Create Normalize Weight
    temp = window_rts.groupby(["Author ID", "Referenced Tweet Author ID"]).size().reset_index(name = "number_of_rts")
    temp = temp.merge(rts_per_user, how = 'right', on='Author ID')
    temp['normal_weight'] = temp['number_of_rts']/temp['total']
    temp['normal_weight'] = temp['normal_weight']
    temp.drop(columns='total', inplace=True)
    temp.rename(columns = {'Author ID': "source_user_id", "Referenced Tweet Author ID": "target_user_id"}, inplace=True)
    
    # Assign Index
    temp['Source'] = temp['source_user_id'].apply(lambda x: int(id_to_idx[x]))
    temp['Target'] = temp['target_user_id'].apply(lambda x: int(id_to_idx[x]))
    
    # Assign political affiliation
    temp["source_political_afilliation"] = temp["source_user_id"].apply(lambda x: user_to_party_paro[x])
    temp["target_political_afilliation"] = temp["target_user_id"].apply(lambda x: user_to_party_paro[x])
    temp.dropna(subset=['target_political_afilliation', 'source_political_afilliation'], inplace=True)
    
    # Assign Label
    temp['source_label'] = temp['source_user_id'].apply(lambda x: id_to_label[x])
    temp['target_label'] = temp['target_user_id'].apply(lambda x: id_to_label[x])

    #Adding dtypes
    temp = temp.astype({
        'Source': int,
        'Target': int,
        'number_of_rts': int,
        'normal_weight': float,
        'source_user_id': int,
        'target_user_id': int
    })
    columns = list(temp.columns)
    columns.remove('Source')
    columns.insert(0, 'Source')
    columns.remove('Target')
    columns.insert(1, 'Target')
    temp = temp[columns]

    # Save results as csv
    temp.to_csv(os.path.join(save_path, "Source-Target", 'starting_' + str(date.date()) + ".csv"), index = False, sep = ";")
    
def main(daily_grid):
    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_source_target, daily_grid), total = len(daily_grid)))
    return futures

if __name__ == '__main__':
    main(daily_grid)

100%|██████████| 63/63 [00:16<00:00,  3.89it/s]


### Create networks

In [48]:
def create_graph(file_tuple):
    file1, file2 = file_tuple
    date_str = file1.split('.')[-2].split('/')[-1].split('_')[-1]
    
    csv = pd.read_csv(file1, delimiter=';')
    
    g = gt.Graph(directed=True)
    g.add_vertex(len(master_id))
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')
    
    for row in csv.itertuples(index = False):
        e = g.add_edge(row[0], row[1])
        number_of_rts[e] = row[4]
        normal_weight[e] = row[5]

    g.ep['Number of rts'] = number_of_rts
    g.ep['Normal Weight'] = normal_weight
    
    # Create Edge property map
    number_of_rts = g.new_edge_property('int')
    normal_weight = g.new_edge_property('float')

    # Create a Vertex property maps
    vertex_PA_map = g.new_vertex_property('string')
    vertex_ID_map = g.new_vertex_property('int64_t')
    vertex_label_map = g.new_vertex_property('string')
    vertex_color_map = g.new_vertex_property('string')
    vertex_isolate_map = g.new_vertex_property('bool')
    vertex_tweets_map = g.new_vertex_property('int64_t')
    
    # Get Different categories
    categories = [cat for cat in set(idx_to_pa.values())]
    for cat in categories:
        # Create a dummy property map
        vertex_dummy_map = g.new_vertex_property('bool')
        globals()['map_'+cat] = vertex_dummy_map
    
    with open(file2, "rb") as file:
        tweets_per_day = pickle.load(file)

    # Assign Labels to vertices using loop (Create one VertexPropertyMap for every Political Label)
    for v_index in g.iter_vertices():
        v = g.vertex(v_index)
        # Add tweets information of users
        vertex_tweets_map[v] = tweets_per_day[idx_to_id[v]]
        
        # Add Master Index Information
        vertex_PA_map[v] = idx_to_pa[v]
        vertex_ID_map[v] = idx_to_id[v]
        vertex_label_map[v] = idx_to_label[v]
        vertex_color_map[v] = color[idx_to_pa[v]]
        
        # Add categories Information
        for cat in categories:
            if idx_to_pa[v] == cat:
                map = globals()['map_'+cat]
                map[v] = True
                globals()['map_'+cat] = map
            else:
                map = globals()['map_'+cat]
                map[v] = False
                globals()['map_'+cat] = map
        
        # Add isolated
        if v.out_degree() == 0 and v.in_degree() == 0:
            vertex_isolate_map[v] = True
        else: 
            vertex_isolate_map[v] = False

    # Add properties
    g.vp['Political Label'] = vertex_PA_map
    g.vp['User ID'] = vertex_ID_map
    g.vp['Label'] = vertex_label_map
    g.vp['Color'] = vertex_color_map
    g.vp['Isolate'] = vertex_isolate_map
    g.vp['Tweets'] = vertex_tweets_map
    
    # Add dummy maps
    for cat in categories:
        map = globals()['map_'+cat]
        g.vp[cat] = map
    
    # Add graph properties
    graph_date_map = g.new_graph_property('string')
    graph_date_map[g] = date_str
    g.gp['Date'] = graph_date_map

    # Save Graphs
    filename = os.path.join(save_path, 'Graphs' ,'starting_' + date_str + ".graphml")
    g.save(filename)


def main():
    source_target_files = glob('../../../Data/Daily_graphs/Source-Target/starting_*.csv')
    tweets_files = glob('../../../Data/Daily_graphs/Tweets_Per_Day/starting_*.pkl')
    source_target_files.sort()
    tweets_files.sort()
    files = list(zip(source_target_files, tweets_files))

    with ProcessPoolExecutor() as executor:
        futures = list(tqdm(executor.map(create_graph, files), total = len(files)))
    return futures

if __name__ == '__main__':
    main()

100%|██████████| 63/63 [00:15<00:00,  4.02it/s]


In [51]:
print('Edge List')
ej_csv = pd.read_csv(save_path + f'/Source-Target/starting_2021-05-04.csv', sep = ';')
nodes = set(set(ej_csv['Source'].unique()).union(set(ej_csv['Target'].unique())))
print(f"Non Isolate Vertices {len(nodes):,} and {len(ej_csv):,} edges")

# Example of our graphs
ej_g = gt.load_graph(save_path + f'/Graphs/starting_2021-05-04.graphml')
print('Graph')
print(f"Non Isolate Vertices {ej_g.num_vertices() - sum(ej_g.vp['Isolate']):,} and {ej_g.num_edges():,} edges")

Edge List
Non Isolate Vertices 28,329 and 327,632 edges
Graph
Non Isolate Vertices 28,329 and 327,632 edges


In [52]:
dict(ej_g.vp)

{'Centro': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87ac9ec50, at 0x7fc74241e050>,
 'Color': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87ac9ec50, at 0x7fc74241dfd0>,
 'Derecha': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87ac9ec50, at 0x7fc74241de50>,
 'Isolate': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87ac9ec50, at 0x7fc74241dd50>,
 'Izquierda': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87ac9ec50, at 0x7fc74241dc50>,
 'Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87ac9ec50, at 0x7fc74241db50>,
 'Political Label': <VertexPropertyMap object with value type 'string', for Graph 0x7fc87ac9ec50, at 0x7fc74241da50>,
 'Sin Clasificar': <VertexPropertyMap object with value type 'bool', for Graph 0x7fc87ac9ec50, at 0x7fc74241d990>,
 'Tweets': <VertexPropertyMap object with value type 'int64_t', for Graph 0x7fc87ac9ec50, at 0x7fc74241d890>,
 'User I

In [53]:
dict(ej_g.ep)

{'Normal Weight': <EdgePropertyMap object with value type 'double', for Graph 0x7fc87ac9ec50, at 0x7fc74241d690>,
 'Number of rts': <EdgePropertyMap object with value type 'int32_t', for Graph 0x7fc87ac9ec50, at 0x7fc74241d5d0>}

In [54]:
# Checking properties of Vertex
test_id = 1920140406
test_idx = id_to_idx[test_id]
print(f'Idx in Graph: {test_idx}')
print(ej_g.vp['User ID'][test_idx])
print(ej_g.vp['Label'][test_idx])
print(ej_g.vp['Political Label'][test_idx])
print(ej_g.vp['Color'][test_idx])
print('---------Checking on Master Index----------')
master_id.iloc[test_idx]

Idx in Graph: 23639
1920140406
Santialarconu
Izquierda
blue
---------Checking on Master Index----------


ID                               23639
Label                    Santialarconu
Political Affiliation        Izquierda
User ID                     1920140406
Name: 23639, dtype: object

In [55]:
# Checking properties of Vertex
test_id_source = 1389784145417678848
test_id_target = 1010113508233699328

source_idx = id_to_idx[test_id_source]
target_idx = id_to_idx[test_id_target]
test_edge_idx = (source_idx, target_idx)

edge_index = ej_g.edge_index[test_edge_idx]
print(f'Edge: {test_edge_idx}')
print(f'Edge In graph: {test_edge_idx}')
print(f"Normal Weights {ej_g.ep['Normal Weight'][test_edge_idx]}")
print(f"Number of rts {ej_g.ep['Number of rts'][test_edge_idx]}")
print('--------Value in Edge List---------')
ej_csv.loc[(ej_csv['Source'] == source_idx) & (ej_csv['Target'] == target_idx)]

Edge: (36963, 31708)
Edge In graph: (36963, 31708)
Normal Weights 0.1666666666666666
Number of rts 1
--------Value in Edge List---------


Unnamed: 0,Source,Target,source_user_id,target_user_id,number_of_rts,normal_weight,source_political_afilliation,target_political_afilliation,source_label,target_label
327631,36963,31708,1389784145417678848,1010113508233699328,1,0.166667,Izquierda,Izquierda,kars0518,LevyRincon
