# APPENDIX A: Data Cleaning, Data scraping, and Network creation

In [15]:
# ALL IMPORTS
import pandas as pd
import numpy as np
import urllib.request
import re
import json, urllib
import sys
import io
import pickle
import networkx as nx

## Download Wikipedia pages function:

In [2]:
def download(df,nodetype):
    """ Download the Wikipedia page source for every title in the
    WikiLink column of the dataframe.

    For each title the MediaWiki API is asked for the content of the
    page's revision, and the returned wikitext is written (UTF-8, followed
    by a newline) to "<nodetype>_<WikiLink>.txt" in the current working
    directory. Titles the API reports under the page key '-1' are skipped.

    Input: dataframe (with a 'WikiLink' column) and nodetype (file name prefix)
    -------------------------------
    Output: None (the files are written as a side effect)
    """
    # Local import so that this function does not depend on the file-level
    # import list being changed.
    from urllib.parse import urlencode

    baseurl = "https://en.wikipedia.org/w/api.php?"
    for wikilink in df["WikiLink"]:                                          # Run through all wikilinks in dataframe
        # Percent-encode the query instead of stripping non-ASCII characters:
        # dropping characters silently turns e.g. 'é' titles into different
        # (usually non-existing) titles.
        query = baseurl + urlencode({
            "action": "query",
            "prop": "revisions",
            "rvprop": "content",
            "titles": wikilink,
            "format": "json",
        })

        with urllib.request.urlopen(query) as wikiresponse:                  # Close the connection when done
            wikitext = wikiresponse.read().decode('utf-8')
        wikijson = json.loads(wikitext)['query']['pages']
        key = list(wikijson.keys())[0]

        if key != '-1':                                                      # If Key = '-1',the wikilink does not exist
            wiki_Print = wikijson[key]['revisions'][0]['*']
            # Write directly to the file rather than rebinding sys.stdout,
            # which leaked one open file per page and redirected every later
            # print() of the session into the last page file.
            with open(nodetype + "_" + wikilink + ".txt", "w", encoding="utf-8") as page_file:
                page_file.write(wiki_Print + "\n")

## The Events dataframe

In [3]:
def read_events_file(events):
    """ Read the events .csv file into a pandas dataframe, add a
    'WikiLink' column and clean the result.

    The WikiLink is built from the 'sport' column as
    "<sport>_at_the_2016_Summer_Olympics", after which a few sport names
    are rewritten and all whitespace is replaced by '_'.
    Rows with a duplicate WikiLink and rows containing any NaN are dropped.

    Input: path to the ';'-separated, latin-1 encoded events .csv file
    -------------------------------
    Output: the cleaned events dataframe
    """
    df_events = pd.read_csv(events, sep = ';', encoding = 'latin-1', dtype = object)         # Convert events.csv to panda dataframe
    wikilink = df_events['sport'] + "_at_the_2016_Summer_Olympics"                           # Build the WikiLinks

    # Literal substitutions (for matching the WikiLink). regex is passed
    # explicitly because the pandas default changed from True to False in 2.0.
    literal_fixes = (
        ('canoe', 'canoeing'),
        ('hockey', 'field hockey'),
        ('synchronised', 'synchronized'),
    )
    for old, new in literal_fixes:
        wikilink = wikilink.str.replace(old, new, regex = False)

    # '\s' is a regular expression and must be flagged as one; with
    # regex=False it would be searched for as the two literal characters.
    wikilink = wikilink.str.replace(r'\s', '_', regex = True)                                # Replace whitespace with _
    df_events['WikiLink'] = wikilink                                                         # Add a column for WikiLinks

    df_events = df_events.drop_duplicates(subset = ["WikiLink"])                             # Drop duplicates
    df_events = df_events.dropna()                                                           # Remove NaN

    return df_events

# Load and clean the events table from 'events.csv' (read relative to the working directory).
df_e = read_events_file('events.csv')

Download events Wikipedia pages:

In [4]:
# Fetch every event page; files are written as "events_<WikiLink>.txt".
download(df_e,'events')

## The Country dataframe

In [5]:
def read_Country_file(countries):
    """ Read the countries .csv file into a pandas dataframe, add the
    'WikiLink' and 'country2' columns and clean the result.

    'WikiLink' is "<country>_at_the_2016_Summer_Olympics" with whitespace
    replaced by '_' and every '*' removed. 'country2' is the country name
    with whitespace replaced by '_'. Rows containing any NaN are dropped.

    Input: path to the ';'-separated, latin-1 encoded countries .csv file
    -------------------------------
    Output: the cleaned countries dataframe
    """
    df_countries = pd.read_csv(countries, sep = ';',   encoding = 'latin-1', dtype = object) # Convert countries.csv into panda dataframe

    # regex is passed explicitly everywhere: the pandas default changed from
    # True to False in 2.0, which would turn '\s' into a literal two-character
    # search, while a bare "*" is not a valid regular expression.
    wikilink = df_countries['country'] + "_at_the_2016_Summer_Olympics"                      # Build the WikiLinks
    wikilink = wikilink.str.replace(r'\s', '_', regex = True)                                # Replace whitespace with _
    wikilink = wikilink.str.replace("*", "", regex = False)                                  # Remove *

    df_countries['WikiLink'] = wikilink                                                      # Add a column for WikiLinks
    # country2 will be used for finding athletes WikiLink.
    # NOTE(review): unlike WikiLink, country2 keeps any '*' - confirm this is
    # intended, since it is later used to build the country page file name.
    df_countries['country2'] = df_countries['country'].str.replace(r'\s', '_', regex = True) # Replace whitespace with _
    df_countries = df_countries.dropna()                                                     # Remove NaN

    return df_countries

# Load and clean the countries table from 'countries.csv' (read relative to the working directory).
df_c = read_Country_file('countries.csv')

Download Countries wikipedia pages:

In [6]:
# Fetch every country page; files are written as "countries_<WikiLink>.txt".
download(df_c,'countries')

## The Athletes dataframe

In [7]:
def read_athletes_file(athletes):
    """ Read the athletes .csv file into a pandas dataframe, add the
    'name2' column and clean the result.

    'name2' is the athlete's name with whitespace replaced by '_'
    (it will be used for finding the athletes' WikiLinks).
    Rows containing any NaN are dropped.

    Input: path to the ','-separated, utf-8 encoded athletes .csv file
    -------------------------------
    Output: the cleaned athletes dataframe
    """
    df_athletes = pd.read_csv(athletes, sep = ',',   encoding = 'utf-8', dtype = object)     # Convert athletes.csv to panda dataframe
    # regex=True is required: '\s' is a regular expression, and since
    # pandas 2.0 str.replace treats the pattern as a literal by default.
    df_athletes['name2'] = df_athletes['name'].str.replace(r'\s', '_', regex = True)         # Replace whitespace with _
    df_athletes = df_athletes.dropna()                                                       # Remove NaN

    return df_athletes

# Load and clean the athletes table from 'athletes.csv' (read relative to the working directory).
df_a = read_athletes_file('athletes.csv')

Update the athletes dataframe with the wikilinks found in the already downloaded country Wikipedia pages.

In [9]:
def Athlet_WikiLink(df1, df2):
    """ Find the athletes' wikilinks by scanning the downloaded
    country wikipages.

    Every [[...]] link in "countries_<country2>_at_the_2016_Summer_Olympics.txt"
    is inspected (spaces replaced by '_', then split on '|'):
      * [[Target]]       - matched on Target
      * [[Target|Label]] - matched on Label
      * links with more than one '|' are ignored
    If the matched text equals a 'name2' in the athletes dataframe, the
    pair [id, Target] is appended to the result; when several athletes
    share that name2, the id of the first one is used.

    Input: athletes dataframe (df1), countries dataframe (df2)
    -------------------------------
    Output: list of [id, WikiLink] pairs
    """
    link_pattern = re.compile(r"\[\[(.*?)\]\]")                                              # Matches the inside of [[...]]

    # Build the name2 -> id lookup once instead of scanning the whole
    # dataframe for every link. setdefault keeps the first athlete per name.
    id_by_name = {}
    for name, athlete_id in zip(df1['name2'], df1['id']):
        id_by_name.setdefault(name, athlete_id)

    A_WikiLink = []                                                                          # Create an empty list
    for country in df2['country2']:                                                          # Run through all countries
        path = 'countries_' + country + '_at_the_2016_Summer_Olympics.txt'
        with io.open(path, 'r', encoding = 'utf-8') as page:                                 # Open (and reliably close) the file
            text = page.read()

        for raw_link in link_pattern.findall(text):                                          # Run through all links
            parts = raw_link.replace(' ', '_').split('|')                                    # Replace space with _ and split by '|'
            if len(parts) == 1:                                                              # The name and the link are the same
                name = parts[0]
            elif len(parts) == 2:                                                            # The name and the link are not the same
                name = parts[1]
            else:
                continue

            if name in id_by_name:                                                           # The link exists in the athletes dataframe
                A_WikiLink.append([id_by_name[name], parts[0]])                              # Append the 'id' number and the wikilink
    return A_WikiLink

A_WikiLink = Athlet_WikiLink(df_a,df_c)                           # Collect [id, WikiLink] pairs from the country pages

df_if = pd.DataFrame(A_WikiLink, columns = ['id', 'WikiLink'])    # Convert wikilink list into panda dataframe
# merge() is called without arguments, i.e. with the pandas defaults: an inner
# join on the columns both frames share (presumably only 'id' - verify), so
# athletes without a found wikilink are dropped here.
df_a = df_a.merge(df_if)                                          # Merge wikilink dataframe and athletes dataframe
df_a = df_a.drop_duplicates()                                     # Drop fully identical rows (same athlete linked more than once)


Download the athletes' Wikipedia pages:

In [10]:
# Fetch every athlete page; files are written as "athletes_<WikiLink>.txt".
download(df_a,'athletes')

### #REDIRECT

In [11]:
def Athlet_REDIRECT_WikiLink(df1):
    """ Find all downloaded athlete wikipages that contain '#REDIRECT'
    and collect the links found on those pages.

    For every WikiLink in the dataframe the file "athletes_<WikiLink>.txt"
    is read. If it contains the text '#REDIRECT', one [id, link] pair is
    appended per [[...]] link on the page (spaces replaced by '_'), where
    id is the 'id' of the first athlete having that WikiLink.

    Input: athletes dataframe (with 'id' and 'WikiLink' columns)
    -------------------------------
    Output: list of [id, redirected WikiLink] pairs
    """
    link_pattern = re.compile(r"\[\[(.*?)\]\]")                                               # Matches the inside of [[...]]

    # Build the WikiLink -> id lookup once. The original re-scanned the
    # dataframe for every link of every page, although the answer only
    # depends on the page. setdefault keeps the first athlete per WikiLink.
    id_by_wikilink = {}
    for wikilink, athlete_id in zip(df1['WikiLink'], df1['id']):
        id_by_wikilink.setdefault(wikilink, athlete_id)

    RE_WikiLink = []                                                                          # Create an empty list
    for wikilink in df1['WikiLink']:                                                          # Run through all athletes
        with io.open('athletes_' + wikilink + '.txt', 'r', encoding = 'utf-8') as page:       # Open (and reliably close) the file
            text = page.read()

        if "#REDIRECT" not in text:                                                           # Only redirect pages are of interest
            continue

        athlete_id = id_by_wikilink[wikilink]
        for target in link_pattern.findall(text):                                             # Run through all links on the page
            RE_WikiLink.append([athlete_id, target.replace(' ', '_')])                        # Append the 'id' number and the new wikilink

    return RE_WikiLink

RE_WikiLink = Athlet_REDIRECT_WikiLink(df_a)                         # Collect [id, redirect target] pairs from the athlete pages
df_if = pd.DataFrame(RE_WikiLink, columns = ['id', 'RE_WikiLink'])   # Convert wikilink list into panda dataframe
# merge() with pandas defaults (inner join on the shared columns, presumably
# only 'id' - verify): df_a_RE holds just the athletes whose page is a redirect.
df_a_RE = df_a.merge(df_if)                                          # Merge wikilink dataframe and athletes dataframe


Some of the redirected Wikipedia pages cannot be downloaded, e.g. because the page title contains a non-ASCII character such as 'é'. Therefore these 44 athletes are removed from the dataframe:

In [12]:
# Remove #REDIRECT from dataframe:
# keep only the athletes whose 'id' does not occur in the redirect dataframe df_a_RE.
df_a = df_a[~df_a.id.isin(df_a_RE.id)]

## Create the Olympic Network

Functions to create a networkx graph.

In [17]:
def findLinksAndRemove(f,df1,df2,df3):
    """ Helper function to find the links in a wikipage and keep only
    those that are a WikiLink in one of the dataframes.

    Each [[...]] link has its spaces replaced by '_' and is cut at the
    first '|', so only the link target (not the display label) is compared.
    Order and repetitions of the links on the page are preserved.

    Input: the page text (f), and the 3 dataframes (each with a 'WikiLink' column)
    -------------------------------
    Output: list of link targets (edge endpoints) that exist in the data
    """
    # Collect the known wikilinks once: a set gives constant-time membership
    # tests instead of three full dataframe scans for every link on the page.
    known_links = set(df1['WikiLink']) | set(df2['WikiLink']) | set(df3['WikiLink'])

    edges = []                                                     # An empty list for edges
    for raw_link in re.findall(r"\[\[(.*?)\]\]", f):               # Use a regular expression to extract all outgoing links
        target = raw_link.replace(' ', '_').split('|')[0]          # Replace space with _ and keep the part before '|'
        if target in known_links:                                  # If the target is in the data, keep it; otherwise discard it
            edges.append(target)
    return edges


def AddNodes(G, df, nodetype):
    """ Add one node per wikilink of the dataframe to the graph.

    Each value of the 'WikiLink' column becomes a node carrying the
    given nodetype as its 'nodetype' attribute.

    Input: The NetworkX DiGraph, The dataframe and the nodetype
    -------------------------------
    Output: None (the graph is modified in place)
    """
    wikilinks = df.WikiLink
    for position in range(len(wikilinks)):                        # Visit the wikilinks in row order
        node_name = wikilinks.iloc[position]
        G.add_node(node_name, nodetype = nodetype)                # Register the node with its type
 
   
def AddEdges(G, df1, df2, df3, nodetype):
    """ Add edges to the graph.

    For every WikiLink in df1 the page file "<nodetype><WikiLink>.txt" is
    read (nodetype is used as the raw file name prefix, so callers pass it
    including the trailing '_'). Every link on the page that is a WikiLink
    in one of the three dataframes and is also a node of G becomes an edge
    from the page's node to the linked node.

    Input: The NetworkX DiGraph, all dataframes (df1 is the one whose pages
           are read) and the nodetype file prefix
    -------------------------------
    Output: None (the graph is modified in place)
    """
    for Node in df1['WikiLink']:                                  # Run through all wikilinks in the dataframe
        with io.open(nodetype + Node + ".txt", 'r', encoding = 'utf-8') as page:  # Open (and reliably close) the page file
            f = page.read()
        edgesTo = findLinksAndRemove(f,df1,df2,df3)               # Run the helper function to find links and remove the link if it is not in the data.

        for target in edgesTo:                                    # Run through all the found edges
            # "in G" is a direct node lookup; the original rebuilt
            # list(G.nodes) for every single candidate edge.
            if target in G:                                       # If the edge links to a node, add the edge to the Graph
                G.add_edge(Node, target)

Create the network

In [18]:
# Use a NetworkX DiGraph to store the network. Store also the properties of the nodes (i.e. from which dataframe they hail).
G = nx.DiGraph()

# Add Nodes: one node per WikiLink, tagged with a 'nodetype' attribute.
AddNodes(G, df_c, 'countries')
AddNodes(G, df_e, 'sports')
AddNodes(G, df_a, 'athletes')

# Add edges. The first dataframe is the one whose page files are read;
# the last argument is the file name prefix of those pages (with trailing '_').
AddEdges(G, df_c, df_e, df_a, 'countries_')
AddEdges(G, df_e, df_c, df_a, 'events_')
AddEdges(G, df_a, df_e, df_c, 'athletes_')

# Find the isolated nodes (total degree 0, i.e. neither in- nor out-edges) and discard them from the network.
remove = [node for node, degree in dict(G.degree()).items() if degree == 0]
G.remove_nodes_from(remove) 

# Keep only the largest weakly connected component (as an independent copy of the subgraph).
largest_cc = max(nx.weakly_connected_components(G), key=len)
G = G.subgraph(largest_cc).copy()

# Add the dataframe columns as attributes to the nodes:
# each dict maps a WikiLink to a {column: value} dict of its row.
# NOTE(review): to_dict('index') needs unique WikiLinks per dataframe - confirm for df_a.
node_attr_e = df_e.set_index('WikiLink').to_dict('index')
node_attr_c = df_c.set_index('WikiLink').to_dict('index')
node_attr_a = df_a.set_index('WikiLink').to_dict('index')
nx.set_node_attributes(G, node_attr_e)
nx.set_node_attributes(G, node_attr_c)
nx.set_node_attributes(G, node_attr_a)


## Save the dataframes and Network
Finally, save the 3 cleaned dataframes and the network for further analysis.

In [20]:
# Save the dataframes:
# (pickle files despite the .txt extension). Each file is opened in a
# with-block so that it is flushed and closed as soon as it is written,
# instead of leaking the handle returned by a bare open().
with open('df_athletes.txt', 'wb') as athletes_file:
    pickle.dump(df_a, athletes_file)
with open('df_countries.txt', 'wb') as countries_file:
    pickle.dump(df_c, countries_file)
with open('df_events.txt', 'wb') as events_file:
    pickle.dump(df_e, events_file)

# Save the network
with open('G.txt', 'wb') as graph_file:
    pickle.dump(G, graph_file)

# Save the graph as .gexf for making an interactive graph plot in GEPHI with sigma js exporter.
nx.write_gexf(G, "G.gexf")