## IMBLearn Notebook

This notebook holds modelling iterations that use imbalanced-learn (`imblearn`), the scikit-learn-compatible library for working with imbalanced datasets.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
# Put the directory two levels above the current working directory on the
# import path (presumably so the `Olympic_PED_use` package imported in a later
# cell can be found - verify against the repository layout). The membership
# check keeps repeated runs of this cell from adding the same entry twice.
_two_levels_up = os.path.join(os.pardir, os.pardir)
module_path = os.path.abspath(_two_levels_up)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
import pdfplumber
import tabula
import seaborn as sns
from Olympic_PED_use.src import functions as fn
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

from bs4 import BeautifulSoup
import certifi
import urllib3
import re
from csv import DictReader, DictWriter
import datetime as dt

import glob
import fuzzywuzzy
from fuzzywuzzy import fuzz

# Raise pandas' display limits so long/wide frames are shown without truncation.
for _option_name, _option_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 500),
    ('display.width', 500),
):
    pd.set_option(_option_name, _option_value)



In [18]:
def create_wiki_doping():
    
    '''Build and return the dataframe of athletics doping cases
    listed on Wikipedia.

    Steps performed:
    1. Download the "List of doping cases in athletics" page over
       certificate-verified HTTPS and collect every
       ``wikitable sortable`` table from it.
    2. Hand the tables to ``fn.wiki_scraper`` and the result to
       ``col_format``; the result is expected to be a dataframe with
       at least ``name`` and ``date_of_violation`` columns, since
       those are the columns used below.
    3. Reduce ``date_of_violation`` to its first four characters
       (the year), drop junk rows (a placeholder value, 'Unkn', and
       a hard-coded list of index labels) and cast the year to int.
    4. Remove duplicate rows, sort by ``name`` and keep only
       violations from 2004 to 2016 inclusive.

    Returns
    -------
    The cleaned dataframe with a fresh 0..n-1 index.

    Notes
    -----
    No name re-spelling is done here; any harmonisation of names
    against the athlete dataframe has to happen elsewhere.
    The hard-coded index labels depend on the page layout at the
    time the notebook was written and may need refreshing if the
    Wikipedia page changes.
    '''
    
    url = 'https://en.wikipedia.org/wiki/List_of_doping_cases_in_athletics'
    req = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                         ca_certs=certifi.where())
    res = req.request('GET', url)
    soup = BeautifulSoup(res.data, 'html.parser')
    contents = soup.find_all('table', class_='wikitable sortable')  
    
    
    wiki_doping = fn.wiki_scraper(contents)

    # NOTE(review): col_format is called unqualified while wiki_scraper comes
    # from `fn`; it is not defined in the visible cells - confirm where it
    # is defined (possibly fn.col_format).
    wiki_doping = col_format(wiki_doping)

    # Collapse the one date-range entry to its year, then keep only the
    # leading four characters (the year) of every value.
    wiki_doping.date_of_violation = [x.replace('23 May 2002  24 June 2002', '2002') for x in wiki_doping.date_of_violation]
    wiki_doping.date_of_violation = [x[:4] for x in wiki_doping.date_of_violation]

    # The value stored at index label 44 is treated as a junk marker:
    # every row carrying that same value is removed.
    drop_value = wiki_doping.date_of_violation[44]

    wiki_doping = wiki_doping[wiki_doping.date_of_violation != drop_value]
    wiki_doping = wiki_doping[wiki_doping.date_of_violation != 'Unkn']

    # Index labels of remaining rows that hold irrelevant values.
    # drop() raises KeyError if any of these labels is absent.
    rows_to_drop = [0, 60, 197, 390, 632, 905, 1198, 1532, 1874, 1907, 2249,
                    2293, 2635, 2697, 3039, 3173, 3515, 3693, 4035, 4278, 4289,
                     4874, 4902, 5244, 5487, 5536, 5878, 6121, 6196, 6212,
                    6554, 6797, 6872, 6889, 7231, 7474, 7549, 7603, 7945, 8188,
                    8263, 8392,  8977, 9052, 9219, 9561, 9804, 9879, 10052,
                    10637, 10712, 10907, 11249, 11492, 11567, 11789, 12131,
                    12374, 12449, 12672, 13014, 13257, 13332, 13914, 14157, 14232]

    wiki_doping = wiki_doping.drop(rows_to_drop, axis=0)

    wiki_doping.date_of_violation = [int(x) for x in wiki_doping.date_of_violation]

    wiki_doping = wiki_doping.drop_duplicates()

    wiki_doping = wiki_doping.sort_values(by='name')

    # Keep violation years strictly between 2003 and 2017.
    wiki_doping = wiki_doping[wiki_doping.date_of_violation > 2003]

    wiki_doping = wiki_doping[wiki_doping.date_of_violation < 2017]

    wiki_doping = wiki_doping.reset_index(drop=True)

    # The original fell off the end and returned None, discarding the frame.
    return wiki_doping

In [19]:
# Load the athlete events data and lower-case every column header, which is
# the form the later cells use (e.g. athlete_df.sport, athlete_df.year).
athlete_df = (
    pd.read_csv('../data/athletes_dataset/athlete_events.csv')
    .rename(columns=str.lower)
)

In [20]:
# Keep only the rows whose `sport` column equals 'Athletics'.
_is_athletics = athlete_df['sport'].eq('Athletics')
athlete_df = athlete_df[_is_athletics]

In [21]:
# Events removed from the analysis: the four relays, both marathons and the
# two combined events. (The original cell listed the women's 4 x 100 relay
# twice; the duplicate had no effect and is dropped here.)
excluded_events = [
    "Athletics Men's 4 x 100 metres Relay",
    "Athletics Men's 4 x 400 metres Relay",
    "Athletics Women's 4 x 100 metres Relay",
    "Athletics Women's 4 x 400 metres Relay",
    "Athletics Men's Marathon",
    "Athletics Women's Marathon",
    "Athletics Women's Heptathlon",
    "Athletics Men's Decathlon",
]

# Keep rows with a year strictly after 2002 and strictly before 2018 whose
# event is not in the exclusion list. Rows with a missing event are kept,
# exactly as with the original chain of `!=` comparisons.
_in_year_window = (athlete_df.year > 2002) & (athlete_df.year < 2018)
_event_allowed = ~athlete_df.event.isin(excluded_events)
athlete_df = athlete_df.loc[_in_year_window & _event_allowed]

In [25]:
# Distinct names on each side of the fuzzy match.
# NOTE(review): doping_df is not created in any visible cell - presumably the
# frame produced by create_wiki_doping(); confirm it is defined before this
# cell on a fresh kernel.
athlete_names = list(athlete_df['name'].unique())
doping_names = list(doping_df['name'].unique())

In [26]:
def match_names(term, list_names, min_score=0):
    '''Return the entry of ``list_names`` most similar to ``term``.

    Similarity is ``fuzz.ratio(term, candidate)``. A candidate is only
    accepted when its score is strictly greater than ``min_score``;
    among accepted candidates the highest score wins and, on ties, the
    one appearing first in ``list_names`` is kept.

    Returns a ``(name, score)`` tuple, or ``('', -1)`` when no
    candidate clears ``min_score`` (including an empty ``list_names``).
    '''
    best_name, best_score = '', -1
    for candidate in list_names:
        candidate_score = fuzz.ratio(term, candidate)
        # Skip anything at or below the threshold, or not strictly better
        # than the current best (strictness preserves first-wins on ties).
        if candidate_score <= min_score or candidate_score <= best_score:
            continue
        best_name, best_score = candidate, candidate_score
    return (best_name, best_score)

In [31]:
# For every doping-list name, find its closest athlete name (fuzz.ratio score
# must exceed 70) and record the pairing. Each pair is also printed in
# '"doping":"match",' form; an empty match means nothing cleared the threshold.
dict_list = []
for x in doping_names:
    match = match_names(x, athlete_names, 70)
    print('"{}":"{}",'.format(x, match[0]))
    dict_ = {
        'doping_name': x,
        'match_name': match[0],
        'score': match[1],
    }
    dict_list.append(dict_)

"Abdelatif Chemlal":"Abdelatif Chemlal",
"Abdelhadi Habassa":"",
"Abdelhadi Labäli":"",
"Abdelkader Hachlaf":"Abdelkader Hachlaf",
"Abdellah Haidane":"Abdellah Falil",
"Abderrahim El Asri":"Abderrahim Al-Goumri",
"Abderrahim Goumri":"Abderrahim Al-Goumri",
"Abderrahhime Bouramdane":"",
"Abdulagadir Idriss":"",
"Abraham Kiprotich":"Abraham Kipchirchir Rotich",
"Abubaker Ali Kamal":"Abubaker Ali Kamal",
"Adil Kaouch":"Adil El-Kaouch",
"Adrián Annus":"Adrin Zsolt Annus",
"Adriënne Herzog":"",
"Agatha Jeruto Kimaswai":"",
"Agnieszka Gortel-Maciuk":"",
"Agustín Félix":"",
"Ahmad Hazer":"Ahmad Hazer",
"Ahmed Abd El Raouf":"",
"Ahmed Baday":"",
"Ahmed Faiz":"Ahmed Ali",
"Ahmed Mohamed Dheeb":"",
"Ak Hafiy Tajuddin Rositi":"Ak Hafiy Tajuddin Rositi",
"Alberico Di Cecco":"",
"Aleksandr Bulanov":"Aleksandar Rakovi",
"Aleksandr Vashchilo":"",
"Aleksandra Duliba":"",
"Aleksey Korolev":"Aleksey Pogorelov",
"Aleksey Lesnichy":"Aleksey Lesnichy",
"Aleksey Voyevodin":"Aleksey Nikolayevich Voyevodin",
