# In [None]:  (Jupyter notebook cell marker left over from export)
import re
import wikipedia
import io
from SPARQLWrapper import SPARQLWrapper, JSON

#pip install SPARQLWrapper
#pip install wikipedia

# Convert wiki codes to wikipedia pages
def get_wiki_pages(wiki_codes):
    """Fetch the English Wikipedia page object for each page id.

    Lookups are best-effort: any id whose lookup raises is skipped and
    only counted. The number of skipped ids is printed once at the end.

    Args:
        wiki_codes: iterable of Wikipedia page ids, passed as ``pageid``
            to ``wikipedia.page``.

    Returns:
        List of the page objects that were fetched successfully, in the
        same order as their ids appear in ``wiki_codes``.
    """
    wikipedia.set_lang('en')

    pages = []
    missing = 0
    for page_id in wiki_codes:
        try:
            page = wikipedia.page(pageid=page_id)
        except Exception:
            # Deliberately broad: a single failing lookup must not abort
            # the whole crawl, so it is just tallied.
            missing += 1
        else:
            pages.append(page)

    print("Players not found: " + str(missing))
    return pages

# Convert wikipedia pages to strings
def get_name_list(wiki_list):
    """Return the ``title`` of every page in ``wiki_list``.

    Args:
        wiki_list: iterable of objects exposing a ``title`` attribute.

    Returns:
        A new list of titles, in the same order as ``wiki_list``.
    """
    # Build the list directly instead of using a comprehension only for
    # its append side effect (which also created a throwaway list of None).
    return [wiki.title for wiki in wiki_list]

# Count for each name in name_list how many times it appears in links in the player wikipedia page
def count_references(player_wiki, name_list):
    """Count how often each known name is referenced by one page.

    A name's total is the number of entries in ``player_wiki.links`` equal
    to that name plus the number of non-overlapping occurrences of the
    name as a substring of ``player_wiki.content``.

    Args:
        player_wiki: object exposing ``links`` (iterable of link titles)
            and ``content`` (the page text as a string).
        name_list: list of names to look for.

    Returns:
        Dict mapping each name that was found at least once to its total
        count. Names that never occur are absent from the dict.
    """
    dict_player = {}

    # References in links. Membership is checked against a set so each
    # link costs O(1) instead of a linear scan of name_list.
    known_names = set(name_list)
    for link in player_wiki.links:
        if link in known_names:
            dict_player[link] = dict_player.get(link, 0) + 1

    # References in content. Counting once is enough: a count of zero
    # means the name does not occur, so no separate ``in`` test is needed.
    content = player_wiki.content
    for player_name in name_list:
        occurrences = content.count(player_name)
        if occurrences:
            dict_player[player_name] = dict_player.get(player_name, 0) + occurrences

    return dict_player

# Get dict of link references count for name_list for every wikipedia page in wiki_list
def get_reference_count_of_all_players(wiki_list, name_list):
    """Build the per-page reference counts for every page in ``wiki_list``.

    Prints a progress line after each page has been processed.

    Args:
        wiki_list: sized collection of page objects; each must expose
            ``title`` and whatever ``count_references`` reads from it.
        name_list: names to count references to, forwarded unchanged to
            ``count_references``.

    Returns:
        Dict mapping each page title to the dict returned by
        ``count_references`` for that page. Pages sharing a title
        overwrite one another; the last one wins.
    """
    total = len(wiki_list)
    counts_by_title = {}
    for done, page in enumerate(wiki_list, start=1):
        counts_by_title[page.title] = count_references(page, name_list)
        print(str(done) + " out of " + str(total) + " done.")
    return counts_by_title

# Get list of wikipedia page code from dbpedia
def get_players_wiki_code():
    """Query DBpedia for the Wikipedia page ids of tennis players.

    Sends a SPARQL query to http://dbpedia.org/sparql selecting the
    ``dbo:wikiPageID`` of every resource typed ``dbo:TennisPlayer`` and
    asks for the result in JSON.

    Returns:
        List with the ``value`` field of each ``wiki`` binding, in the
        order the endpoint returned them.
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setQuery("""
                        select ?wiki where {
                            {?person rdf:type dbo:TennisPlayer}
                            {?person dbo:wikiPageID ?wiki}
                        }
                    """)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    # Each binding row looks like {"wiki": {"value": ...}}; keep the value.
    return [row["wiki"]["value"] for row in response["results"]["bindings"]]

# Create CSV Matrix from dictionary
def create_matrix(dict_all_players):
    """Render the reference counts as a semicolon-separated matrix.

    The first line is the header: an empty first cell followed by every
    player name. Each following line starts with a player name and then
    holds, for every column player, how often the row player references
    the column player. Self references are always written as 0, as are
    pairs with no recorded count.

    Names are written verbatim; a name containing ``;`` or a newline is
    not escaped.

    Args:
        dict_all_players: dict mapping a player name to a dict of
            ``{referenced player name: count}``.

    Returns:
        The matrix as one string, every line (header included)
        terminated by ``"\\n"``.
    """
    players = list(dict_all_players)

    # Collect the lines and join once instead of growing one large string
    # with repeated ``+=`` over all n * n cells.
    lines = ["".join(";" + player for player in players)]
    for current_player, player_link_dict in dict_all_players.items():
        cells = "".join(
            # Zero for the diagonal and for players never referenced.
            ";" + str(0 if player == current_player else player_link_dict.get(player, 0))
            for player in players
        )
        lines.append(current_player + cells)

    return "".join(line + "\n" for line in lines)
    
def write_file(matrix):
    """Write ``matrix`` to ``tennis-network.csv`` as UTF-8 text.

    The file is created in the current working directory and any
    existing file of that name is overwritten.

    Args:
        matrix: the full text content to store.
    """
    with io.open("tennis-network.csv", "w", encoding="utf-8") as csv_file:
        csv_file.write(matrix)
    

# --- Script driver: runs the whole pipeline at module level ---

# Fetch the Wikipedia page ids of all tennis players from DBpedia.
wiki_codes = get_players_wiki_code()
all_entries = len(wiki_codes)

# Number of ids to process; lower this value to run on a smaller sample.
number_of_entries = all_entries
wiki_list = get_wiki_pages(wiki_codes[0:number_of_entries]) 
name_list = get_name_list(wiki_list)

# For every fetched page, count references to every other fetched player.
dict_all_players = get_reference_count_of_all_players(wiki_list, name_list)

# Serialise the counts and store them in tennis-network.csv.
matrix = create_matrix(dict_all_players)
write_file(matrix)
print("Done")