In [1]:
# ALL IMPORTS
import pandas as pd
import networkx as nx
import numpy as np
import io
import re
import pickle
import copy

In [2]:
# Load the three pickled dataframes (athletes, countries, events).
# Each file is opened in a `with` block so the handle is closed as soon as
# the dataframe has been read, instead of lingering until garbage collection.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that were produced by a trusted source (e.g. an earlier notebook of
# this project).
with open('df_athletes.txt', 'rb') as athletes_file:
    df_athletes = pickle.load(athletes_file)
with open('df_countries.txt', 'rb') as countries_file:
    df_countries = pickle.load(countries_file)
with open('df_events.txt', 'rb') as events_file:
    df_events = pickle.load(events_file)

In [3]:
def findLinksAndRemove(f,df1,df2,df3):
    """ Helper function that extracts the wiki links from a page text
    and keeps only those whose target is listed in the 'WikiLink' column
    of at least one of the three dataframes.

    A link is anything written as [[target]] or [[target|label]]. Spaces in
    the link are replaced by '_' and only the part before the first '|'
    (the target) is compared against the dataframes.

    Input: the page text (a string), and the 3 dataframes
           (each must have a 'WikiLink' column)
    -------------------------------
    Output: list of link targets found in the data, in order of appearance
            in the text (a target linked several times appears several times)
    """
    # Raw string: in a normal string literal "\[" is an invalid escape
    # sequence, which newer Python versions warn about.
    raw_links = re.findall(r"\[\[(.*?)\]\]", f)

    # Normalise each link: spaces -> '_', then keep only the target part
    # (everything before the first '|').
    targets = [link.replace(' ', '_').split('|')[0] for link in raw_links]

    # Collect every known WikiLink once, so each membership check is a set
    # lookup instead of a scan over all three dataframes per link.
    known_links = set()
    for df in (df1, df2, df3):
        known_links.update(df['WikiLink'])

    # Keep the targets that exist in the data; discard the rest.
    return [target for target in targets if target in known_links]


def AddNodes(G, df, nodetype):
    """ Register every wikilink of a dataframe as a node of the graph.
    Each node gets a 'nodetype' attribute set to the given nodetype.

    Input: The NetworkX DiGraph, the dataframe (with a 'WikiLink' column)
           and the nodetype
    -------------------------------
    Output: None (the graph G is modified in place)
    """
    for position in range(len(df.index)):                         # Visit the rows by position
        wikilink = df.WikiLink.iloc[position]                     # The wikilink stored in this row
        G.add_node(wikilink, nodetype = nodetype)                 # Insert it, tagged with its nodetype
 
   
def AddEdges(G, df1, df2, df3, nodetype, path_folder="./Files/"):
    """ Function to add edges to the graph.

    For every wikilink in df1 the downloaded page
    path_folder + nodetype + wikilink + ".txt" is read, its outgoing links
    are extracted with findLinksAndRemove, and an edge wikilink -> target is
    added for every target that is already a node of G.

    Input: The NetworkX DiGraph, the dataframe whose pages are read (df1),
           the two other dataframes (df2, df3), the file-name prefix
           (nodetype, e.g. 'countries_') and optionally the folder with the
           downloaded wikipages (path_folder; it is concatenated as-is, so
           it must end with a path separator)
    -------------------------------
    Output: None (the graph G is modified in place)
    """
    for source in df1['WikiLink']:                                # Run through all wikilinks in the dataframe
        page_path = path_folder + nodetype + source + ".txt"      # File holding the downloaded page
        with io.open(page_path, 'r', encoding = 'utf-8') as page: # 'with' closes the file after reading
            text = page.read()

        # Links on the page whose target exists in one of the dataframes.
        targets = findLinksAndRemove(text, df1, df2, df3)

        for target in targets:                                    # Run through all the found links
            # Membership test on the node view is a direct lookup; the node
            # list is no longer rebuilt for every candidate edge.
            if target in G.nodes:                                 # Only link to nodes that exist in the graph
                G.add_edge(source, target)

In [4]:
# The network is stored as a NetworkX DiGraph. Every node carries a
# 'nodetype' attribute telling which dataframe it comes from.
G = nx.DiGraph()

# Nodes: one per wikilink, added dataframe by dataframe.
AddNodes(G, df=df_countries, nodetype='countries')
AddNodes(G, df=df_events, nodetype='events')
AddNodes(G, df=df_athletes, nodetype='athletes')

# Edges: df1 is the dataframe whose pages are read; nodetype is the
# file-name prefix of those pages.
AddEdges(G, df1=df_countries, df2=df_events, df3=df_athletes, nodetype='countries_')
AddEdges(G, df1=df_events, df2=df_countries, df3=df_athletes, nodetype='events_')
AddEdges(G, df1=df_athletes, df2=df_events, df3=df_countries, nodetype='athletes_')

In [5]:
# Find the nodes that have neither in- nor out-links (degree 0) and
# discard them from the network.
remove = [node for node, degree in G.degree() if degree == 0]
G.remove_nodes_from(remove)

# Save the graph. The 'with' block closes the file, so the pickle is fully
# written to disk when the cell finishes.
with open('G.txt', 'wb') as graph_file:
    pickle.dump(G, graph_file)