# Graph Construction: Network X

In this Notebook, we construct the graph of the Rt's Users during the Paro Nacional. Each graph corresponds to the Rt Network in moving windows of three days between April 28 of 2021 and June 30 of 2021

In [3]:
import pickle
from glob import glob
import numpy as np
import pandas as pd
import networkx as nx
from datetime import datetime
from tqdm import tqdm
from scipy.sparse import csr_matrix
from scipy.sparse import lil_matrix

In [4]:
with open('/mnt/disk2/Data/Pickle/user_indices.pkl','rb') as file:
    user_indices = pickle.load(file)

with open('/mnt/disk2/Data/Pickle/user_to_party.pkl','rb') as file:
    user_to_party = pickle.load(file)

In [5]:
# from user_indices we transform the keys as values and the values as keys
user_indices_2 = {value: key for key, value in user_indices.items()}

Here we create the ```construct_graph``` that will allow us to create the required graph

In [6]:
def construct_graph(adj, date_value):
    """
    Description of your function.

    Args:
        adj (type): The adjacency matrix used to construct the graph. This matrices are saved as _.npz_ files in the _Data/Matrices_ Folder
        date_value (type): Date in datetime format This will be the atributte that will characterices each graph belonging to a specific Date

    Returns:
        type: Graph

    """
    # Make sure the diagonal is zeros.
    csr = csr_matrix(adj)
    lil_matrix = csr.tolil()
    graph = nx.Graph()
    num_nodes = adj.shape[0]
    graph.add_nodes_from(range(num_nodes))

    # Add edges to the graph
    for i in range(num_nodes):
        for j in lil_matrix.rows[i]:
            graph.add_edge(i, j, weight = lil_matrix[i, j])

    # Assign 'affiliation' attribute using list comprehension
    aff_list = [user_to_party.get(user_indices_2.get(node)) for node in graph.nodes]
    attributes = dict(zip(graph.nodes, aff_list))
    nx.set_node_attributes(graph, attributes, 'affiliation')

    # Date attribute for nodes
    date_attribute = {node: date_value for node in graph.nodes}
    nx.set_node_attributes(graph, date_attribute, 'date')
    
    # Date attribute for edges
    date_attribute_edges = {(u,v): date_value for u,v in graph.edges}
    nx.set_edge_attributes(graph, date_attribute_edges, 'date')

    # re Labels
    new_labels = {node: user_indices_2.get(node) for node in graph.nodes}
    nx.relabel_nodes(graph, new_labels, copy = False)

    return graph

In [7]:
# Dates of the Paro Nacional
v1_start = '2021-04-28 00:00:00'
v1_end = '2021-06-27 00:00:00'
date1 = pd.date_range(start = v1_start, end = v1_end, freq = 'D')

v2_start = '2021-04-30 23:59:59'
v2_end = '2021-06-29 23:59:59'
date2 = pd.date_range(start = v2_start, end = v2_end, freq = 'D')
datestr = list(date2.strftime("%d-%m"))


After declaring our constructor function, we now proceed to create the loop over the adjcency matrices to create the graphs

In [None]:
k = 0
files = glob('../fcastrillon/Data/Matrices/*.npz')
for file in tqdm(files):
    
    # First, we load the stored info into a normal CSR matrix.
    data = np.load(file)
    indices = data['indices']
    indptr = data['indptr']
    shape = data['shape']
    data = data['data']
    A = csr_matrix((data, indices, indptr), shape = shape)
    
    # Now we can create the graph using 'graph_tool'.
    date = date2[k]
    date = datetime.strftime(date, '%d-%m-%Y')
    graph = construct_graph(A, date)
         
    # Finally we save the graph in .graphml format.
    filename = f'graph_{date}.graphml'
    output_filepath = '../fcastrillon/Data/Graphs/' + filename
    nx.write_graphml(graph, output_filepath)
    print(f"File '{filename}' successfully created and stored.")
    k += 1

In [4]:
# Load the graph files first. (24 minutes to load)
files = glob('/mnt/disk2/Data/Graphs/*.graphml')
    
graph_list = []  # List to store the loaded graphs

for i, file in tqdm(enumerate(files)):
    graph = nx.read_graphml(file)
    graph_list.append(graph)
    globals()["g" + str(i + 1)] = graph
    
del graph  
del graph_list

61it [24:17, 23.90s/it]


## Node Processing

In [27]:
node_list = []

for node, data in tqdm(g1.nodes(data = True)):
    node_data = {'node': node, 'attributes': data}
    node_list.append(node_data)

node_info = [[d['node']] + list(d['attributes'].values()) if isinstance(d['attributes'], dict) else [d['node']] + d['attributes'] for d in node_list]
node_info = pd.DataFrame(node_info)

# Set column names
node_info.columns = ['ID', 'Affiliation', 'Date']

# We check for duplicates
duplicates = node_info.duplicated()
duplicates.sum()
node_info = node_info.drop_duplicates()

print(node_info.shape)
node_info.head()

100%|██████████| 37308/37308 [00:00<00:00, 915169.04it/s]

(37308, 3)





Unnamed: 0,ID,Affiliation,Date
0,281521014,Retweets Izquierda,11-06-2021
1,1389784145417678848,Retweets Izquierda,11-06-2021
2,1389769251704147968,Retweets Izquierda,11-06-2021
3,1389741234370064384,No Retweets,11-06-2021
4,1389737202742071296,Retweets Derecha,11-06-2021


In [28]:
# we import the list of users and some info from the Checkpoint users_to_date.pkl File
users_to_date = pd.read_pickle('/mnt/disk2/Data/Pickle/users_to_date.pkl')
print(users_to_date.shape)
users_to_date.head()

(37139, 5)


Unnamed: 0,user_id,handle,n_tweets,followers,date
0,17813487,NoticiasCaracol,9829,9369950,2023-05-21
1,7996082,el_pais,11393,8393111,2023-05-21
2,35013719,NoticiasRCN,4871,8246188,2023-05-21
3,9633802,ELTIEMPO,9604,7804384,2023-05-21
4,14834302,elespectador,20903,6101363,2023-05-21


In [29]:
users_to_date = users_to_date.drop(users_to_date.columns[-1], axis = 1)
users_info = {}

for index, row in tqdm(users_to_date.iterrows()):
    key = row['user_id']
    values = row.drop('user_id').to_dict()
    users_info[key] = values

37139it [00:14, 2481.34it/s]


In [30]:
# Change the name of the users_id column
users_to_date.rename(columns={'user_id': 'ID'}, inplace = True)

# change the type of the nod_info to float
node_info['ID'] = node_info['ID'].astype(float)
users_to_date['ID'] = users_to_date['ID'].astype(float)

# We Add the info of the users_to_date
node_info = pd.merge(node_info, users_to_date, on = 'ID', how = 'left')
print(node_info.shape)
node_info.head()

(37308, 6)


Unnamed: 0,ID,Affiliation,Date,handle,n_tweets,followers
0,281521000.0,Retweets Izquierda,11-06-2021,Loucypher0,277.0,776.0
1,1.389784e+18,Retweets Izquierda,11-06-2021,kars0518,377.0,53.0
2,1.389769e+18,Retweets Izquierda,11-06-2021,VaneLen18,756.0,8.0
3,1.389741e+18,No Retweets,11-06-2021,JhonatanVRojo,84.0,103.0
4,1.389737e+18,Retweets Derecha,11-06-2021,JC13177979,1950.0,94.0


In [31]:
# We filter our data to keep this columns
columns = node_info.columns.tolist()
columns = [columns[0], columns[3], columns[1], columns[2]]
columns

['ID', 'handle', 'Affiliation', 'Date']

In [32]:
# Filter
node_info = node_info[columns]
node_info.rename(columns={'handle': 'label'}, inplace = True)

# Drop Date
node_info = node_info.drop(node_info.columns[-1], axis = 1)

# Rename
node_info['label'] = node_info['label'].astype(str)
node_info['Affiliation'] = node_info['Affiliation'].astype(str)

node_info.to_csv('/mnt/disk2/Data/nodes.csv', index = False)

## Edge Processing

In [6]:
graphs = []

for i in range(1, 61):
    g_name = 'g' + str(i)
    graphs.append(g_name)

edge_lists = []

In [7]:
for idx, graph_name in tqdm(enumerate(graphs, start = 1)):
    edge_list = []
    graph = globals()[graph_name]
    
    for u, v, data in graph.edges(data = True):
        edge_data = {'source': u, 'target': v, 'attributes': data}
        edge_list.append(edge_data)

    edge_info = [[d['target']] + [d['source']] + list(d['attributes'].values()) if isinstance(d['attributes'], dict) else [d['node']] + d['attributes'] for d in edge_list]
    edge_info = pd.DataFrame(edge_info)

    # Set column names
    edge_info.columns = ['target', 'source', 'weight', 'date']
    duplicates = edge_info.duplicated()
    edge_info = edge_info.drop_duplicates()
    
    edge_lists.append(edge_info)
    del graph

60it [08:25,  8.42s/it]


In [8]:
edges_info = pd.concat(edge_lists)
del edge_lists
edges_info.head()

Unnamed: 0,target,source,weight,date
0,900469958106329088,1389769251704147968,1.0,11-06-2021
1,882711922830974976,1389769251704147968,1.0,11-06-2021
2,2351617668,1389769251704147968,2.0,11-06-2021
3,172343678,1389769251704147968,1.0,11-06-2021
4,141943866,1389769251704147968,1.0,11-06-2021


In [10]:
# Check for duplicates
duplicates = edges_info.duplicated()
print(sum(duplicates))


0


In [12]:
# Change data type of the edges_info Data Frame for consistency in data
edges_info['target'] = edges_info['target'].astype(float)
edges_info['source'] = edges_info['source'].astype(float)
edges_info['weight'] = edges_info['weight'].astype(int)
edges_info['date'] = pd.to_datetime(edges_info['date'], dayfirst=True)

edges_info = edges_info.sort_values('date')

In [None]:
edges_info.to_csv('C:/Users/Usuario/OneDrive - Universidad de los andes/TorniquetesLight/Twitter/Data/graph_v1/spreadsheet/edges.csv', index = False)

## Subgraphs

In [13]:
edges_info['date'] = pd.to_datetime(edges_info['date'])
edges_info[r'Timeset'] = edges_info['date'].apply(lambda x: x.isoformat())

In [14]:
min_date = edges_info['date'].min()
max_date = edges_info['date'].max()
min_date = pd.to_datetime(min_date)
max_date = pd.to_datetime(max_date)

In [15]:
# Define the number of days in each sub-dataframe
interval = 7
dates = pd.date_range(start = min_date,
                      end = max_date + pd.Timedelta(days = 10), # 10 for 10 subgraphs
                      freq = f'{interval}D')
dates = [date.isoformat() for date in dates]

In [16]:
sub_edges = []
for i in range(len(dates) - 1):
    start_date = dates[i]
    end_date = dates[i + 1]
    sub_df = edges_info[(edges_info['date'] >= start_date) & (edges_info['date'] < end_date)]
    sub_edges.append(sub_df)

### STEPS TO LOAD DYNAMIC GRAPH IN GEPHI
##### 1) Open Gephi
##### 2) Import spreadsheet - nodes.csv
##### 3) Import timeset as TimeStamp
##### 4) Import spreadsheet - edges.csv
##### 5) Import timeset as TimeStamp