In [2]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

In [3]:
#Load CSV Files

# sources: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#
# Data starts in 1896
events_file = 'Resources/athlete_events.csv'
df_events_orig = pd.read_csv(events_file)

# source: World Bank (https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#) 
# Data starts in 1960
pop_gdp_file = 'Resources/population_gdp.csv'
df_pop_gdp_orig = pd.read_csv(pop_gdp_file)

In [4]:
# clean worldbank data
df_pop_gdp = df_pop_gdp_orig
df_pop_gdp = df_pop_gdp.drop(columns = ['Series Code'], axis =1)
df_pop_gdp = df_pop_gdp.rename(
    columns={
        "Series Name": "series_name", 
        "Country Name": "country_name",
        "Country Code": "country_code"})

df_pop_gdp.columns = df_pop_gdp.columns.str.split(' ').str[0].tolist()

keep_columns = ['series_name', 'country_name', 'country_code']
event_years = df_events_orig.Year.unique().astype(str)

for i in event_years:
    keep_columns.append(i)

df_pop_gdp = df_pop_gdp[df_pop_gdp.columns.intersection(keep_columns)]

df_gdp = df_pop_gdp.loc[df_pop_gdp["series_name"] == "GDP (current US$)"]
df_pop = df_pop_gdp.loc[df_pop_gdp["series_name"] == "Population, total"]

In [5]:
# clean Olympic NOC and worldbank country_code to match each other

# extract WorldBank's country_code and country_name
df_worldbanknames = df_pop[["country_code", "country_name"]]

# create a column with stripped country_name
# Sanitize
df_worldbanknames['Wclean'] = df_worldbanknames ['country_name'].str.replace(" ", "").str.lower()

# create a column with stripped Team name in Olympic Data
df_events = df_events_orig
# Santize
df_events['Oclean'] = df_events ['Team'].str.replace(" ", "").str.lower()

# Unique transformations of countries that are in both data bases, but didn't match up.
df_events['NOC'] = df_events['NOC'].str.replace("IRI", "IRN")
df_events['NOC'] = df_events['NOC'].str.replace("GER", "DEU")
df_events['NOC'] = df_events['NOC'].str.replace("BAH", "BHS")
df_events['NOC'] = df_events['NOC'].str.replace("SUI", "CHE")
df_events['NOC'] = df_events['NOC'].str.replace("ISV", "VIR")
df_events['NOC'] = df_events['NOC'].str.replace("GRE", "GRC")
df_events['NOC'] = df_events['NOC'].str.replace("DEN", "DNK")
df_events['NOC'] = df_events['NOC'].str.replace("NED", "NLD")
df_events['NOC'] = df_events['NOC'].str.replace("CGO", "COG")
df_events['NOC'] = df_events['NOC'].str.replace("LAT", "LVA")
df_events['NOC'] = df_events['NOC'].str.replace("INA", "IDN")
df_events['NOC'] = df_events['NOC'].str.replace("GAM", "GMB")
df_events['NOC'] = df_events['NOC'].str.replace("GBS", "GNB")
df_events['NOC'] = df_events['NOC'].str.replace("MAS", "MYS")
df_events['NOC'] = df_events['NOC'].str.replace("NGR", "NGA")
df_events['NOC'] = df_events['NOC'].str.replace("VIN", "VCT")
df_events['NOC'] = df_events['NOC'].str.replace("BRU", "BRN")
df_events['NOC'] = df_events['NOC'].str.replace("SLO", "SVN")
df_events['NOC'] = df_events['NOC'].str.replace("MRI", "MUS")
df_events['NOC'] = df_events['NOC'].str.replace("BUL", "BGR")
df_events['NOC'] = df_events['NOC'].str.replace("PUR", "PRI")
df_events['NOC'] = df_events['NOC'].str.replace("MON", "MCO")
df_events['NOC'] = df_events['NOC'].str.replace("SKN", "KNA")


# merge by stripped country names and make sure Olympic Data NOC matches WorldBank country_code
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on='Oclean', right_on = 'Wclean')
df_events['NOC'] = np.where(df_events['country_code'].isnull() == False, df_events['country_code'], df_events['NOC'])

# drop unncessary columns
df_events = df_events.drop(columns = ['Oclean', 'country_code', 'country_name', 'Wclean'], axis = 1)

In [6]:
# code for cleaning countries with numbers

def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)

    
teamlist1 = df_events["Team"]

teamlist2 = []

for team in teamlist1:
    if (hasNumbers (team)):
        splitlist = team.split("-")[0]
        teamlist2.append(splitlist)
    else:
        teamlist2.append(team)

df_events["Team"] = teamlist2
df_events = df_events[df_events["Year"] >= 1960]
# df_events_trial["Team"].unique()

In [35]:
# cleaning olympians
df_olympians1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Name"].count().reset_index(name = 'num_events')
# df_olympians1.sort_values("num_events", ascending=False).head(20)

df_olympians2 = df_olympians1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_olympians')
# df_olympians2.sort_values("num_olympians", ascending=False).head()


# cleaning medalists
df_medalist1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Medal"].count().reset_index(name = 'num_medals')
df_medalist1 = df_medalist1[df_medalist1['num_medals'] > 0]
# df_medalist1.sort_values("num_medals", ascending=False).head()

df_medalist2 = df_medalist1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_medalist')
# df_medalist2.sort_values("num_medalist", ascending=False).head()

# merging the two
df_medalist3 = df_medalist2[["Team", "Games", "num_medalist"]]
df_olympians = pd.merge(df_olympians2, df_medalist3, how="outer", on=['Team', 'Games'])
df_olympians["perc_medalist"] = df_olympians["num_medalist"]/df_olympians["num_olympians"]


# creating a json file
# df_olympians.pivot_table(["num_olympians", "num_medalist", "perc_medalist"], ["Team"], ["Games"]).reset_index().to_json('HYLab/olympiandata.json', orient='records')

# df_olympians.to_csv('HYLab/olympiandata.csv')

In [36]:
# df_olympians

In [6]:
# df_events

In [7]:
# df_gdp.head()

In [8]:
# df_pop.head()

In [None]:
# df_olympians

## DEBUG STARTS HERE

In [9]:
### DEBUG TO DISCUSS WITH TEAM ###

# Check for items that are not merged
checkmerge = pd.merge(df_events, df_worldbanknames, how="left", left_on='NOC', right_on = 'country_code')
unmatched = checkmerge[checkmerge['country_code'].isnull()]
# mergesuccess = checkmerge[checkmerge['country_code'].isnull() == False]

checking = unmatched
checking['Year'] = checking['Year'].astype('int')
checking = checking.sort_values("Year", ascending = False)

clearingbydate = checking

# We need to decide how to handle the following cases that are not in the World Bank Database.
# We can just keep them in the Olympics Database and note that they don't have GDP information.

# latest 2016 participants
# Taipei not recognized by WB
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('TPE') == False]
# Individual Olympics
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('IOA') == False]
# Palestine not in world bank
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('PLE') == False]
# Refugee
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('ROT') == False]
# Cook Islands
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('COK') == False]

# latest 2008 participants
# Netherlands Antilles
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('AHO') == False]
# Not sure if this is Portugal
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('POR') == False]

# Latest 2006 Participants
# Serbia and Montenegro
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('SCG') == False]

# Latest 1992 Participants
# bunch of soviet countries that don't exist anymore
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('TCH') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('EUN') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YUG') == False]

# Latest 1988 Participants
# West and East Germany, Soviet Union
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('FRG') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('GDR') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('URS') == False]
# the two yemens
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YMD') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YAR') == False]

# latest 1964
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('CAM') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('BER') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('ZIM') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('PHI') == False]

# clearingbydate

In [10]:
checking = df_events
checking['Year'] = checking['Year'].astype('int')
checking = checking.sort_values("Year")
# checking[checking['Team'] == "Bohemia"]
# checking[checking['NOC'] == "URS"]
# checking

In [11]:
# unmatched, but before 1960
unmatched_valuecounts = clearingbydate['NOC'].value_counts().rename_axis('unique_values').reset_index(name='counts')
# unmatched_valuecounts 

In [12]:
# For Spot Checking Data Integrity

### Need to figure out how to get the summary by event

medals = df_events.loc[df_events['Games'] == '2014 Winter'] 
medals = medals.loc[df_events['Medal'] == 'Gold'] 

# this counts the number of Gold Medals that Individuals Received 
medals_gb = medals.groupby(['NOC'])["Medal"].count().rename_axis('country_code').reset_index(name='medal_count')
# medals_gb.sort_values("medal_count", ascending = False)

## Diana's code

In [13]:
df_events_new = df_events.loc[df_events['Year'] >= 1960] 

In [14]:
df_events_new.columns

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')

In [15]:
class Events(Base):
    __tablename__ = 'events'
    ID = Column(Integer, primary_key=True)
    Name = Column(String(255))
    Sex = Column(String(255))
    Age = Column(Integer)
    Height = Column(Integer)
    Weight = Column(Integer)
    Team = Column(String(255))
    NOC = Column(String(255), primary_key=True)
    Games = Column(String(255))
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255))
    City = Column(String(255))
    Sport = Column(String(255))
    Event = Column(String(255), primary_key=True)
    Medal = Column(String(255))

In [15]:
# class Olympians_Team(Base):
#     __tablename__ = 'olympians_team'
#     Year = Column(Integer, primary_key=True)
#     Season = Column(String(255), primary_key=True)
#     Team = Column(String(255), primary_key=True)
#     NOC = Column(String(255), primary_key=True)
#     No_olympians = Column(Integer)

In [16]:
# Create engine connection and export the gg_movie_final dataframe to sql\n",
disk_engine = create_engine('sqlite:///olympic_events.sqlite')
Base.metadata.create_all(disk_engine)

In [17]:
df_events_new.to_sql('events', disk_engine, if_exists='append', index = False)

In [18]:
session = Session(disk_engine)

<h4> Proposal:
    There are different country names (eg. United States 1 and United States 2). I will only use NOC for the base tables and from that point on we can map it with another table

<h2> Olympians over time

In [19]:
a = session.query(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).\
    group_by(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).all()
a

[(1960, 'Summer', 'A. Abdul Razzak', 'Iraq', 'IRQ'),
 (1960, 'Summer', 'A. W. Nancy "Nan" Rae', 'Great Britain', 'GBR'),
 (1960, 'Summer', 'Aage Birch', 'Chok', 'DNK'),
 (1960,
  'Summer',
  'Aartje Johanna "Atie" Voorbij (-Dorresteijn)',
  'Netherlands',
  'NLD'),
 (1960, 'Summer', 'Abbas Khamis', 'United Arab Republic', 'UAR'),
 (1960, 'Summer', 'Abbes Harchi', 'Morocco', 'MAR'),
 (1960, 'Summer', 'Abdallah Gazi', 'United Arab Republic', 'UAR'),
 (1960, 'Summer', 'Abdallah Lahoucine', 'Morocco', 'MAR'),
 (1960, 'Summer', 'Abdel Aziz Fahmi El-Shafei', 'United Arab Republic', 'UAR'),
 (1960, 'Summer', 'Abdel Fattah Abou-Shanab', 'United Arab Republic', 'UAR'),
 (1960, 'Summer', 'Abdel Kader Belghiti', 'Morocco', 'MAR'),
 (1960, 'Summer', 'Abdel Kader Ben Kamel', 'Morocco', 'MAR'),
 (1960, 'Summer', 'Abdel Kader Gangani', 'Morocco', 'MAR'),
 (1960,
  'Summer',
  'Abdel Khadr El-Sayed El-Touni',
  'United Arab Republic',
  'UAR'),
 (1960, 'Summer', 'Abdel Majid Salah Naji', 'Tunisia', 'T

In [20]:
# Intermediate table
olympians_team_detail = pd.read_sql_query('SELECT year, season, name, team, NOC FROM events \
GROUP BY year, season, name, team, NOC',disk_engine)
olympians_team_detail

Unnamed: 0,Year,Season,Name,Team,NOC
0,1960,Summer,A. Abdul Razzak,Iraq,IRQ
1,1960,Summer,"A. W. Nancy ""Nan"" Rae",Great Britain,GBR
2,1960,Summer,Aage Birch,Chok,DNK
3,1960,Summer,"Aartje Johanna ""Atie"" Voorbij (-Dorresteijn)",Netherlands,NLD
4,1960,Summer,Abbas Khamis,United Arab Republic,UAR
...,...,...,...,...,...
148794,2016,Summer,va Csernoviczki,Hungary,HUN
148795,2016,Summer,va Risztov,Hungary,HUN
148796,2016,Summer,zge Bayrak,Turkey,TUR
148797,2016,Summer,zlem Kaya,Turkey,TUR


In [21]:
# Final table with number of athletes per Event (Year&Season)
olympians_team = olympians_team_detail.groupby(['Year','Season', 'Team', 'NOC']).count()
olympians_team.reset_index(inplace = True)
olympians_team_final = olympians_team.rename(columns = {"Name": "No_olympians"})

In [22]:
# Export to sqlite
olympians_team.to_sql('olympians_team_final', disk_engine, if_exists='append', index = False)

In [23]:
# Giving an error - I need to fix it
# b = session.query(olympians_team_final.Year, olympians_team_final.Season, olympians_team_final.Team, olympians_team_final.NOC, olympians_team_final.No_olympians).\
#     group_by(olympians_team_final.Year, olympians_team_final.Season, olympians_team_final.Team, olympians_team_final.NOC, olympians_team_final.No_olympians).all()


<h2> Medals by country / Total Medal count over time / Participating events over time

In [24]:
# Intermediate table 1
medals_team_detail = pd.read_sql_query('SELECT year, season, sport, event, sex, medal, team, NOC FROM events \
GROUP BY year, season, sport, event, sex, medal, team, NOC',disk_engine)
medals_team_detail["Medal"].fillna("-", inplace = True) 

In [25]:
# Intermediate table 2 - may be used for visualizations
medals_team = medals_team_detail.groupby(['Year','Season','Team', 'NOC', 'Medal']).count()[['Sport']]
medals_team.reset_index(inplace = True)
medals_team.rename(columns = {"Sport": "#Medals"}, inplace = True)
medals_team.sort_values(by=["#Medals"], ascending=False)
medals_team.loc[medals_team['Year'] == 1984].sort_values(by=['Year',''"#Medals"], ascending=[True, False])

Unnamed: 0,Year,Season,Team,NOC,Medal,#Medals
2020,1984,Summer,Great Britain,GBR,-,178
1975,1984,Summer,Canada,CAN,-,164
2168,1984,Summer,West Germany,FRG,-,159
2160,1984,Summer,United States,USA,-,138
2042,1984,Summer,Italy,ITA,-,133
...,...,...,...,...,...,...
2295,1984,Winter,West Germany,FRG,Bronze,1
2296,1984,Winter,West Germany,FRG,Gold,1
2297,1984,Winter,West Germany,FRG,Silver,1
2299,1984,Winter,West Germany-1,FRG,Gold,1


In [26]:
# Summary table with all the participant countries. Total Medals is sum of G,S&B
# If you need a list of medals, check the previous table
medals_team_total = medals_team.pivot_table(index=['Year','Season','Team','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_team_total.reset_index(inplace = True)
medals_team_total.replace(np.nan,0, inplace = True)
medals_team_total["Total_Medals"] = medals_team_total["Bronze"] + medals_team_total["Gold"] + medals_team_total["Silver"]
medals_team_total.sort_values(by=['Year','Season', 'Total_Medals'], ascending=[True, True, False], inplace = True)
medals_team_total.loc[medals_team_total['Year'] == 1960]

Medal,Year,Season,Team,NOC,-,Bronze,Gold,Silver,Total_Medals
149,1960,Summer,Soviet Union,URS,96.0,31.0,42.0,29.0,102.0
171,1960,Summer,United States,USA,111.0,15.0,33.0,21.0,69.0
60,1960,Summer,Germany,DEU,120.0,10.0,12.0,19.0,41.0
79,1960,Summer,Italy,ITA,109.0,12.0,13.0,10.0,35.0
8,1960,Summer,Australia,AUS,108.0,6.0,8.0,8.0,22.0
...,...,...,...,...,...,...,...,...,...
217,1960,Winter,Soviet Union-2,URS,2.0,0.0,0.0,0.0,0.0
218,1960,Winter,Spain,ESP,6.0,0.0,0.0,0.0,0.0
221,1960,Winter,Turkey,TUR,3.0,0.0,0.0,0.0,0.0
224,1960,Winter,United States-2,USA,2.0,0.0,0.0,0.0,0.0


In [27]:
# Export to sqlite
medals_team_total.to_sql('medals_team_total', disk_engine, if_exists='append', index = False)

In [28]:
# Giving an error - I need to fix it
# c = session.query(medals_team_total).all()