In [1]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
import sqlite3
# Declarative base class; the ORM table models defined later in the notebook
# (Events, Olympians_Team_Final, Medals_Team_Total) inherit from it.
Base = declarative_base()

In [2]:
# Read the two raw CSV inputs into pandas DataFrames.

# Olympic athlete/event results (data starts in 1896).
# Source: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#
events_file = 'Resources/athlete_events.csv'

# World Bank population and GDP series (data starts in 1960).
# Source: https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#
pop_gdp_file = 'Resources/population_gdp.csv'

df_events_orig = pd.read_csv(filepath_or_buffer=events_file)
df_pop_gdp_orig = pd.read_csv(filepath_or_buffer=pop_gdp_file)

In [3]:
# Tidy the World Bank extract: drop the series code, normalise the header
# names, keep only the Olympic years, then split into GDP and population frames.
df_pop_gdp = (
    df_pop_gdp_orig
    .drop(columns=['Series Code'])
    .rename(columns={
        "Series Name": "series_name",
        "Country Name": "country_name",
        "Country Code": "country_code",
    })
)

# Keep only the text before the first space of every header
# (presumably year headers look like "1960 [YR1960]" - TODO confirm).
df_pop_gdp.columns = df_pop_gdp.columns.str.split(' ').str[0].tolist()

# Identifier columns plus one column per year that appears in the Olympic data.
event_years = df_events_orig.Year.unique().astype(str)
keep_columns = ['series_name', 'country_name', 'country_code']
keep_columns.extend(event_years)

wanted_labels = df_pop_gdp.columns.intersection(keep_columns)
df_pop_gdp = df_pop_gdp[wanted_labels]

# One frame per World Bank series.
series_label = df_pop_gdp["series_name"]
df_gdp = df_pop_gdp.loc[series_label == "GDP (current US$)"]
df_pop = df_pop_gdp.loc[series_label == "Population, total"]

In [4]:
# Align the Olympic NOC codes with the World Bank country codes.

# World Bank code/name pairs. .copy() makes this an independent frame, so
# adding the Wclean column below does not write into a slice of df_pop.
df_worldbanknames = df_pop[["country_code", "country_name"]].copy()

# Sanitised country name (spaces removed, lower-cased) used as the join key.
df_worldbanknames['Wclean'] = df_worldbanknames['country_name'].str.replace(" ", "").str.lower()

# Work on a copy so the raw frame loaded from the CSV stays untouched and
# this cell gives the same result every time it is re-run.
df_events = df_events_orig.copy()
# Sanitised Olympic team name, built the same way as Wclean.
df_events['Oclean'] = df_events['Team'].str.replace(" ", "").str.lower()

# Countries that are in both databases but whose codes did not match up:
# Olympic NOC code -> World Bank country code.
noc_to_worldbank = {
    "IRI": "IRN",
    "GER": "DEU",
    "BAH": "BHS",
    "SUI": "CHE",
    "ISV": "VIR",
    "GRE": "GRC",
    "DEN": "DNK",
    "NED": "NLD",
    "CGO": "COG",
    "LAT": "LVA",
    "INA": "IDN",
    "GAM": "GMB",
    "GBS": "GNB",
    "MAS": "MYS",
    "NGR": "NGA",
    "VIN": "VCT",
    "BRU": "BRN",
    "SLO": "SVN",
    "MRI": "MUS",
    "BUL": "BGR",
    "PUR": "PRI",
    "MON": "MCO",
    "SKN": "KNA",
}
# Whole-value replacement (not substring matching): only a NOC that is
# exactly one of the keys above is rewritten.
df_events['NOC'] = df_events['NOC'].replace(noc_to_worldbank)

# Merge on the sanitised names; wherever a World Bank country was found, its
# country_code overrides the Olympic NOC.
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on='Oclean', right_on='Wclean')
df_events['NOC'] = np.where(df_events['country_code'].notna(), df_events['country_code'], df_events['NOC'])

# Drop the helper columns that were only needed for the merge.
df_events = df_events.drop(columns=['Oclean', 'country_code', 'country_name', 'Wclean'])

In [5]:
# code for cleaning countries with numbers

def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

    
# For team names that contain a digit, keep only the text before the first
# "-"; every other name is kept unchanged.
teamlist1 = df_events["Team"]
teamlist2 = [
    team.split("-")[0] if hasNumbers(team) else team
    for team in teamlist1
]

df_events["Team"] = teamlist2

# Restrict the events to 1960 onwards.
from_1960_mask = df_events["Year"] >= 1960
df_events = df_events[from_1960_mask]
# df_events_trial["Team"].unique()

In [6]:
# df_olympians

In [7]:
# df_events

In [8]:
# df_gdp.head()

In [9]:
# df_pop.head()

In [10]:
# df_olympians

## DEBUG STARTS HERE

In [11]:
### DEBUG TO DISCUSS WITH TEAM ###

# Check for event rows whose NOC has no matching World Bank country_code.
checkmerge = pd.merge(df_events, df_worldbanknames, how="left", left_on='NOC', right_on='country_code')
# .copy() so the Year cast below writes to an independent frame rather than
# to a slice of checkmerge.
unmatched = checkmerge[checkmerge['country_code'].isnull()].copy()
# mergesuccess = checkmerge[checkmerge['country_code'].isnull() == False]

unmatched['Year'] = unmatched['Year'].astype('int')
# Most recent appearances first.
checking = unmatched.sort_values("Year", ascending=False)

# We need to decide how to handle the following cases that are not in the World Bank Database.
# We can just keep them in the Olympics Database and note that they don't have GDP information.
known_unmatched_nocs = [
    # latest 2016 participants
    'TPE',  # Taipei - not recognized by the World Bank
    'IOA',  # Individual Olympic Athletes
    'PLE',  # Palestine - not in the World Bank data
    'ROT',  # Refugee team
    'COK',  # Cook Islands
    # latest 2008 participants
    'AHO',  # Netherlands Antilles
    'POR',  # NOTE(review): POR looks like the Olympic code for Portugal (World Bank: PRT) - confirm; may belong in the NOC remapping instead
    # latest 2006 participants
    'SCG',  # Serbia and Montenegro
    # latest 1992 participants - states/teams that no longer exist
    'TCH',  # presumably Czechoslovakia - confirm
    'EUN',  # presumably the Unified Team (former Soviet republics) - confirm
    'YUG',  # presumably Yugoslavia - confirm
    # latest 1988 participants
    'FRG',  # West Germany
    'GDR',  # East Germany
    'URS',  # Soviet Union
    'YMD',  # the two Yemens
    'YAR',
    # latest 1964
    # NOTE(review): presumably Cambodia, Bermuda, Zimbabwe and the Philippines - confirm
    'CAM',
    'BER',
    'ZIM',
    'PHI',
]

# Remove the known cases so only undiagnosed NOC codes remain. Rows with a
# missing NOC are removed as well, matching the earlier per-code
# `str.contains(code) == False` filters, which also discarded them.
noc_codes = checking['NOC']
clearingbydate = checking[noc_codes.notna() & ~noc_codes.isin(known_unmatched_nocs)]

# clearingbydate

In [12]:
# Cast Year to int on df_events itself, then keep a chronologically sorted
# version of the events in `checking` for spot checks.
df_events['Year'] = df_events['Year'].astype('int')
checking = df_events.sort_values("Year")
# checking[checking['Team'] == "Bohemia"]
# checking[checking['NOC'] == "URS"]
# checking

In [13]:
# Row count per NOC code among the unmatched rows left after the known cases
# were removed (earlier note: "unmatched, but before 1960" - TODO confirm,
# df_events is already restricted to Year >= 1960 at this point).
remaining_noc_counts = clearingbydate['NOC'].value_counts()
unmatched_valuecounts = (
    remaining_noc_counts
    .rename_axis('unique_values')
    .reset_index(name='counts')
)
# unmatched_valuecounts 

In [14]:
# For Spot Checking Data Integrity

### Need to figure out how to get the summary by event

# Gold-medal rows of the 2014 Winter Games. Both conditions are evaluated on
# df_events and combined into a single mask, so the selection no longer
# depends on aligning a mask from the full frame with an already-filtered one.
is_2014_winter_gold = (df_events['Games'] == '2014 Winter') & (df_events['Medal'] == 'Gold')
medals = df_events.loc[is_2014_winter_gold]

# this counts the number of Gold Medals that Individuals Received, per NOC
medals_gb = medals.groupby(['NOC'])["Medal"].count().rename_axis('country_code').reset_index(name='medal_count')
# medals_gb.sort_values("medal_count", ascending = False)

In [15]:
# Disregard

# # cleaning olympians
# df_olympians1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Name"].count().reset_index(name = 'num_events')
# # df_olympians1.sort_values("num_events", ascending=False).head(20)

# df_olympians2 = df_olympians1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_olympians')
# # df_olympians2.sort_values("num_olympians", ascending=False).head()


# # cleaning medalists
# df_medalist1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Medal"].count().reset_index(name = 'num_medals')
# df_medalist1 = df_medalist1[df_medalist1['num_medals'] > 0]
# # df_medalist1.sort_values("num_medals", ascending=False).head()

# df_medalist2 = df_medalist1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_medalist')
# # df_medalist2.sort_values("num_medalist", ascending=False).head()

# # merging the two
# df_medalist3 = df_medalist2[["Team", "Games", "num_medalist"]]
# df_olympians = pd.merge(df_olympians2, df_medalist3, how="outer", on=['Team', 'Games'])
# df_olympians["perc_medalist"] = df_olympians["num_medalist"]/df_olympians["num_olympians"]

# ### Need to keep the medal types for it to be filterable...

# # # creating a json file.. doesn't work
# # df_olympians.pivot_table(["num_olympians", "num_medalist", "perc_medalist"], ["Team"], ["Games"]).reset_index()
# # # .to_json('HYLab/olympiandata.json', orient='records')

# # df_olympians.to_csv('HYLab/olympiandata.csv')

## Diana's code

In [16]:
# Events from 1960 onwards; this is the frame exported to the database.
is_1960_or_later = df_events['Year'] >= 1960
df_events_new = df_events.loc[is_1960_or_later]

In [17]:
# Show the column labels of df_events_new, the frame written to the `events` table.
df_events_new.columns

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal'],
      dtype='object')

In [18]:
class Events(Base):
    """ORM model for the ``events`` table.

    The attributes carry the same names as the columns of ``df_events_new``.
    The primary key is the composite of ``ID``, ``NOC``, ``Year`` and ``Event``.
    """
    __tablename__ = 'events'
    # Part of the composite primary key.
    ID = Column(Integer, primary_key=True)
    Name = Column(String(255))
    Sex = Column(String(255))
    # NOTE(review): Age, Height and Weight are declared Integer - confirm the
    # CSV holds no fractional values for them.
    Age = Column(Integer)
    Height = Column(Integer)
    Weight = Column(Integer)
    Team = Column(String(255))
    # Part of the composite primary key.
    NOC = Column(String(255), primary_key=True)
    Games = Column(String(255))
    # Part of the composite primary key.
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255))
    City = Column(String(255))
    Sport = Column(String(255))
    # Part of the composite primary key.
    Event = Column(String(255), primary_key=True)
    Medal = Column(String(255))

In [19]:
class Olympians_Team_Final(Base):
    """ORM model for the ``olympians_team_final`` table.

    Rows are keyed by the composite of ``Year``, ``Season``, ``Team`` and
    ``NOC``; ``No_olympians`` is the only non-key column.
    """
    __tablename__ = 'olympians_team_final'
    # Composite primary key: Year, Season, Team, NOC.
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255), primary_key=True)
    Team = Column(String(255), primary_key=True)
    NOC = Column(String(255), primary_key=True)
    # Filled from the "Name" count column of the olympians_team frame.
    No_olympians = Column(Integer)

In [20]:
class Medals_Team_Total(Base):
    """ORM model for the ``medals_team_total`` table.

    Rows are keyed by the composite of ``Year``, ``Season``, ``Team`` and
    ``NOC``; the remaining columns hold one count per medal category plus
    ``Total_Medals``.
    """
    __tablename__ = 'medals_team_total'
    # Composite primary key: Year, Season, Team, NOC.
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255), primary_key=True)
    Team = Column(String(255), primary_key=True)
    NOC = Column(String(255), primary_key=True)
    # Count columns, one per value of the pivoted "Medal" column.
    No_medal = Column(Integer)
    Bronze = Column(Integer)
    Gold = Column(Integer)
    Silver = Column(Integer)
    # Bronze + Gold + Silver (No_medal is not included).
    Total_Medals = Column(Integer)

In [21]:
# Create the SQLAlchemy engine for the SQLite file that stores the Olympic tables.
sqlite_url = 'sqlite:///olympic_events.sqlite'
disk_engine = create_engine(sqlite_url)


In [22]:
# Rebuild the ORM-declared tables from scratch on every run. The tables are
# filled further down with to_sql(..., if_exists='append'); if rows from a
# previous run were still in olympic_events.sqlite, those inserts would collide
# with the declared primary keys and a full re-run of the notebook would fail.
Base.metadata.drop_all(disk_engine)
Base.metadata.create_all(disk_engine)

In [23]:
# Append the 1960-onwards event rows to the `events` table; the DataFrame
# index is not written.
df_events_new.to_sql(
    name='events',
    con=disk_engine,
    if_exists='append',
    index=False,
)

In [24]:
# A plain engine connection for raw SQL, and an ORM session for model queries.
conn = disk_engine.connect()
session = Session(bind=disk_engine)

In [25]:
# To check the tables that are in our sqlite db
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
print(table)

# Close the connection used above before `conn` is rebound below; otherwise
# it is never released (and re-running this cell would leak one more each time).
conn.close()

# To check the columns in the tables, via a raw sqlite3 cursor
conn = sqlite3.connect("olympic_events.sqlite")
cursor = conn.execute('select * from olympians_team_final')
cursor.description

                   name
0                events
1  olympians_team_final
2     medals_team_total


(('Year', None, None, None, None, None, None),
 ('Season', None, None, None, None, None, None),
 ('Team', None, None, None, None, None, None),
 ('NOC', None, None, None, None, None, None),
 ('No_olympians', None, None, None, None, None, None))

<h4>Proposal:
    The same country appears under different team names (e.g. "United States 1" and "United States 2"). I will use only NOC to identify countries in the base tables; from that point on we can map NOC to a country name with a separate lookup table.</h4>

<h2>Olympians over time</h2>

In [36]:
# One row per (Year, Season, Name, Team, NOC) combination, fetched through the ORM.
olympian_columns = (Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC)
test1 = session.query(*olympian_columns).group_by(*olympian_columns).all()

In [27]:
# Intermediate table: one row per (year, season, name, team, NOC) combination.
# NOTE(review): athletes are identified by name here, not by ID - two athletes
# sharing a name on the same team and Games would collapse into one row; confirm
# this is acceptable.
olympians_detail_sql = (
    'SELECT year, season, name, team, NOC FROM events '
    'GROUP BY year, season, name, team, NOC'
)
olympians_team_detail = pd.read_sql_query(olympians_detail_sql, disk_engine)
olympians_team_detail

Unnamed: 0,Year,Season,Name,Team,NOC
0,1960,Summer,A. Abdul Razzak,Iraq,IRQ
1,1960,Summer,"A. W. Nancy ""Nan"" Rae",Great Britain,GBR
2,1960,Summer,Aage Birch,Chok,DNK
3,1960,Summer,"Aartje Johanna ""Atie"" Voorbij (-Dorresteijn)",Netherlands,NLD
4,1960,Summer,Abbas Khamis,United Arab Republic,UAR
...,...,...,...,...,...
147893,2016,Summer,va Csernoviczki,Hungary,HUN
147894,2016,Summer,va Risztov,Hungary,HUN
147895,2016,Summer,zge Bayrak,Turkey,TUR
147896,2016,Summer,zlem Kaya,Turkey,TUR


In [28]:
# Final table: number of athlete names per team for each Games (Year & Season).
olympians_team = (
    olympians_team_detail
    .groupby(['Year', 'Season', 'Team', 'NOC'])
    .count()
    .reset_index()
)
# The remaining count column ("Name") becomes the athlete count.
olympians_team_final = olympians_team.rename(columns={"Name": "No_olympians"})

In [29]:
# Append the per-team athlete counts to the `olympians_team_final` table;
# the DataFrame index is not written.
olympians_team_final.to_sql(
    name='olympians_team_final',
    con=disk_engine,
    if_exists='append',
    index=False,
)

In [30]:
# Read the olympians_team_final rows back through the ORM model.
# NOTE(review): an earlier note said this query was giving an error, but the
# saved output shows rows being returned - confirm the issue is resolved.
test2 = session.query(
    Olympians_Team_Final.Year,
    Olympians_Team_Final.Season,
    Olympians_Team_Final.Team,
    Olympians_Team_Final.NOC,
    Olympians_Team_Final.No_olympians,
).all()

test2

[(1960, 'Summer', 'Afghanistan', 'AFG', 12),
 (1960, 'Summer', 'Aldebaran II', 'ITA', 2),
 (1960, 'Summer', 'Aletta', 'IRL', 3),
 (1960, 'Summer', 'Ali-Baba VI', 'CHE', 2),
 (1960, 'Summer', 'Ardilla', 'ARG', 3),
 (1960, 'Summer', 'Argentina', 'ARG', 84),
 (1960, 'Summer', 'Argo II', 'CAN', 3),
 (1960, 'Summer', 'Astrid III', 'FRA', 3),
 (1960, 'Summer', 'Australia', 'AUS', 179),
 (1960, 'Summer', 'Austria', 'AUT', 96),
 (1960, 'Summer', 'Baccara', 'CHE', 3),
 (1960, 'Summer', 'Bahamas', 'BHS', 3),
 (1960, 'Summer', 'Bajazzo', 'URU', 3),
 (1960, 'Summer', 'Balaton', 'HUN', 2),
 (1960, 'Summer', 'Ballerina IV', 'CHE', 3),
 (1960, 'Summer', 'Beaver', 'GBR', 2),
 (1960, 'Summer', 'Belgium', 'BEL', 99),
 (1960, 'Summer', 'Bella', 'FIN', 2),
 (1960, 'Summer', 'Bellatrix IX', 'DEU', 3),
 (1960, 'Summer', 'Bermuda', 'BMU', 1),
 (1960, 'Summer', 'Bermudes', 'BER', 3),
 (1960, 'Summer', 'Bermudian', 'BER', 2),
 (1960, 'Summer', 'Bim', 'BHS', 3),
 (1960, 'Summer', 'Boreas', 'JPN', 3),
 (1960, 'S

<h2>Medals by country / Total medal count over time / Participating events over time</h2>

In [31]:
# Intermediate table 1: one row per distinct
# (year, season, sport, event, sex, medal, team, NOC) combination.
medals_team_detail = pd.read_sql_query(
    'SELECT year, season, sport, event, sex, medal, team, NOC FROM events '
    'GROUP BY year, season, sport, event, sex, medal, team, NOC',
    disk_engine,
)
# Label rows without a medal. The column is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection operates on a temporary
# object under pandas Copy-on-Write and would leave the frame unchanged.
medals_team_detail["Medal"] = medals_team_detail["Medal"].fillna("No_medal")

In [32]:
# Intermediate table 2 - may be used for visualizations.
# Number of rows per (Year, Season, Team, NOC, Medal), stored as "#Medals".
medals_team = (
    medals_team_detail
    .groupby(['Year', 'Season', 'Team', 'NOC', 'Medal'])['Sport']
    .count()
    .reset_index(name="#Medals")
)

# Preview the 1984 rows, largest counts first. (A preceding sort_values call
# whose result was discarded has been removed; it had no effect.)
medals_team.loc[medals_team['Year'] == 1984].sort_values(by=['Year', "#Medals"], ascending=[True, False])

Unnamed: 0,Year,Season,Team,NOC,Medal,#Medals
1813,1984,Summer,Great Britain,GBR,No_medal,178
1768,1984,Summer,Canada,CAN,No_medal,164
1961,1984,Summer,West Germany,FRG,No_medal,159
1953,1984,Summer,United States,USA,No_medal,138
1835,1984,Summer,Italy,ITA,No_medal,133
...,...,...,...,...,...,...
2027,1984,Winter,Puerto Rico,PRI,No_medal,1
2041,1984,Winter,Switzerland,CHE,Bronze,1
2049,1984,Winter,West Germany,FRG,Bronze,1
2052,1984,Winter,West Germany,FRG,Silver,1


In [33]:
# Summary table with all the participating teams: one column per medal
# category, with Total_Medals = Bronze + Gold + Silver.
# For the list of medals behind these counts, see the previous table.
medals_team_total = (
    medals_team
    .pivot_table(index=['Year', 'Season', 'Team', 'NOC'], values='#Medals', columns='Medal', aggfunc='sum')
    .reset_index()
    # Categories a team never reached come out of the pivot as NaN.
    .replace(np.nan, 0)
    .assign(Total_Medals=lambda wide: wide["Bronze"] + wide["Gold"] + wide["Silver"])
    .sort_values(by=['Year', 'Season', 'Total_Medals'], ascending=[True, True, False])
)
medals_team_total.loc[medals_team_total['Year'] == 1960]

Medal,Year,Season,Team,NOC,Bronze,Gold,No_medal,Silver,Total_Medals
149,1960,Summer,Soviet Union,URS,31.0,42.0,96.0,29.0,102.0
171,1960,Summer,United States,USA,15.0,33.0,111.0,21.0,69.0
60,1960,Summer,Germany,DEU,10.0,12.0,120.0,19.0,41.0
79,1960,Summer,Italy,ITA,12.0,13.0,109.0,10.0,35.0
8,1960,Summer,Australia,AUS,6.0,8.0,108.0,8.0,22.0
...,...,...,...,...,...,...,...,...,...
205,1960,Winter,New Zealand,NZL,0.0,0.0,6.0,0.0,0.0
208,1960,Winter,South Africa,ZAF,0.0,0.0,3.0,0.0,0.0
209,1960,Winter,South Korea,KOR,0.0,0.0,13.0,0.0,0.0
211,1960,Winter,Spain,ESP,0.0,0.0,6.0,0.0,0.0


In [34]:
# Append the per-team medal summary to the `medals_team_total` table;
# the DataFrame index is not written.
medals_team_total.to_sql(
    name='medals_team_total',
    con=disk_engine,
    if_exists='append',
    index=False,
)

In [37]:
# Read the medal summary for NOC == 'USA' back through the ORM model.
# NOTE(review): an earlier note said this query was giving an error, but the
# saved output shows rows being returned - confirm the issue is resolved.
usa_medal_query = session.query(
    Medals_Team_Total.Year,
    Medals_Team_Total.Season,
    Medals_Team_Total.Team,
    Medals_Team_Total.NOC,
    Medals_Team_Total.Bronze,
    Medals_Team_Total.Silver,
    Medals_Team_Total.Gold,
    Medals_Team_Total.Total_Medals,
)
test3 = usa_medal_query.filter(Medals_Team_Total.NOC == 'USA').all()

test3

[(1960, 'Summer', 'United States', 'USA', 15, 21, 33, 69),
 (1960, 'Summer', 'Minotaur', 'USA', 0, 0, 1, 1),
 (1960, 'Summer', 'Shrew II', 'USA', 1, 0, 0, 1),
 (1960, 'Summer', 'Spirit VI', 'USA', 0, 0, 0, 0),
 (1960, 'Summer', 'Vim III', 'USA', 0, 0, 0, 0),
 (1960, 'Winter', 'United States', 'USA', 4, 4, 3, 11),
 (1964, 'Summer', 'United States', 'USA', 25, 26, 36, 87),
 (1964, 'Summer', 'Aphrodite', 'USA', 1, 0, 0, 1),
 (1964, 'Summer', 'Bingo', 'USA', 1, 0, 0, 1),
 (1964, 'Summer', 'Glider', 'USA', 0, 1, 0, 1),
 (1964, 'Summer', 'Widgeon', 'USA', 1, 0, 0, 1),
 (1964, 'Winter', 'United States', 'USA', 5, 2, 1, 8),
 (1968, 'Summer', 'United States', 'USA', 34, 28, 45, 107),
 (1968, 'Winter', 'United States', 'USA', 1, 3, 1, 5),
 (1972, 'Summer', 'United States', 'USA', 30, 32, 33, 95),
 (1972, 'Winter', 'United States', 'USA', 3, 2, 3, 8),
 (1976, 'Summer', 'United States', 'USA', 25, 35, 35, 95),
 (1976, 'Winter', 'United States', 'USA', 5, 3, 3, 11),
 (1980, 'Winter', 'United States