In [1]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
import sqlite3
Base = declarative_base()

In [2]:
#Load CSV Files

# sources: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#
# Data starts in 1896
events_file = 'Resources/athlete_events.csv'
df_events_orig = pd.read_csv(events_file)

# source: World Bank (https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#) 
# Data starts in 1960
pop_gdp_file = 'Resources/population_gdp.csv'
df_pop_gdp_orig = pd.read_csv(pop_gdp_file)

In [3]:
# clean worldbank data
df_pop_gdp = df_pop_gdp_orig
df_pop_gdp = df_pop_gdp.drop(columns = ['Series Code'], axis =1)
df_pop_gdp = df_pop_gdp.rename(
    columns={
        "Series Name": "series_name", 
        "Country Name": "country_name",
        "Country Code": "country_code"})

df_pop_gdp.columns = df_pop_gdp.columns.str.split(' ').str[0].tolist()

keep_columns = ['series_name', 'country_name', 'country_code']
event_years = df_events_orig.Year.unique().astype(str)

for i in event_years:
    keep_columns.append(i)

df_pop_gdp = df_pop_gdp[df_pop_gdp.columns.intersection(keep_columns)]

df_gdp = df_pop_gdp.loc[df_pop_gdp["series_name"] == "GDP (current US$)"]
df_pop = df_pop_gdp.loc[df_pop_gdp["series_name"] == "Population, total"]

df_pop_gdp_new = df_pop_gdp.loc[(df_pop_gdp["series_name"] == "GDP (current US$)") | (df_pop_gdp["series_name"] == "Population, total")]
df_pop_gdp_new

Unnamed: 0,series_name,country_name,country_code,1960,1964,1968,1972,1976,1980,1984,...,1998,2000,2002,2004,2006,2008,2010,2012,2014,2016
0,GDP (current US$),Afghanistan,AFG,537777811.111111,800000044.444444,1373333366.66667,1595555475.55556,2555555566.66667,3641723321.99546,..,...,..,..,4055176933.36905,5226775163.30205,6971286731.72065,10109218067.7904,15856574731.4411,20001615788.6719,20484873230.2111,19362642266.6484
1,GDP (current US$),Albania,ALB,..,..,..,..,..,..,1857338011.85488,...,2545967253.2416,3480355188.60063,4348070165.19261,7184681398.5698,8896073938.31407,12881353984.6426,11926962834.9645,12319784701.3346,13228244336.3241,11861353752.0672
2,GDP (current US$),Algeria,DZA,2723648551.75208,2909351792.58659,3852115816.97758,6761786386.54713,17728347374.994,42345277342.0195,53698278905.9678,...,48187747528.899,54786074940.2073,56758113501.1671,85324767230.4939,117030941571.939,170997541140.985,161205065469.309,209062886917.045,213808808746.696,160032930353.764
3,GDP (current US$),American Samoa,ASM,..,..,..,..,..,..,..,...,..,..,514000000,512000000,496000000,563000000,576000000,644000000,642000000,652000000
4,GDP (current US$),Andorra,AND,..,..,..,113408231.944085,227281024.620741,446416105.825017,330070689.298282,...,1211932397.81713,1434429703.33518,1733116883.11688,2935659299.72684,3543256805.92147,4007353156.58415,3355695364.23841,3164615186.94591,3350736367.25488,2877311946.90265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
523,"Population, total",Sub-Saharan Africa,SSF,227233223,249873743,275937224,306359274,342018831,383188232,429521938,...,631400656,665327581,701066167,738983255,779566820,822945457,869025106,917726973,968959787,1022526541
524,"Population, total",Sub-Saharan Africa (excluding high income),SSA,227191523,249827421,275886090,306303245,341958327,383124971,429457221,...,631321810,665246450,700982444,738900780,779482220,822858501,868935336,917638670,968868428,1022431864
525,"Population, total",Sub-Saharan Africa (IDA & IBRD countries),TSS,227233223,249873743,275937224,306359274,342018831,383188232,429521938,...,631400656,665327581,701066167,738983255,779566820,822945457,869025106,917726973,968959787,1022526541
526,"Population, total",Upper middle income,UMC,1184555662,1264214938,1390259908,1529438212,1652300299,1760396042,1878028567,...,2274454564,2317310149,2355936797,2392790614,2428657785,2463294188,2499410493,2537625754,2577855859,2617804661


In [4]:
df_pop_gdp_new2 = df_pop_gdp_new.melt(id_vars=['series_name','country_name', 'country_code'])
df_pop_gdp_new2 = df_pop_gdp_new2.rename(columns={'variable':'year'})
df_pop_gdp_final = df_pop_gdp_new2.pivot_table(index=['year','country_code','country_name'], values = 'value', columns='series_name', aggfunc='sum')
df_pop_gdp_final = df_pop_gdp_final.reset_index()
df_pop_gdp_final = df_pop_gdp_final.astype({'year': 'int64'})
df_pop_gdp_final = df_pop_gdp_final.rename(columns = {'GDP (current US$)': 'GDP', 'Population, total': 'Population', 'country_code': 'NOC', 'year': 'Year', 'country_name': 'Country'})
df_pop_gdp_final.head()


series_name,Year,NOC,Country,GDP,Population
0,1960,ABW,Aruba,..,54211
1,1960,AFG,Afghanistan,537777811.111111,8996973
2,1960,AGO,Angola,..,5454933
3,1960,ALB,Albania,..,1608800
4,1960,AND,Andorra,..,13411


In [5]:
# clean Olympic NOC and worldbank country_code to match each other

# extract WorldBank's country_code and country_name
df_worldbanknames = df_pop[["country_code", "country_name"]]

# create a column with stripped country_name
# Sanitize
df_worldbanknames['Wclean'] = df_worldbanknames ['country_name'].str.replace(" ", "").str.lower()

# create a column with stripped Team name in Olympic Data
df_events = df_events_orig
# Santize
df_events['Oclean'] = df_events ['Team'].str.replace(" ", "").str.lower()

# Unique transformations of countries that are in both data bases, but didn't match up.
df_events['NOC'] = df_events['NOC'].str.replace("IRI", "IRN")
df_events['NOC'] = df_events['NOC'].str.replace("GER", "DEU")
df_events['NOC'] = df_events['NOC'].str.replace("BAH", "BHS")
df_events['NOC'] = df_events['NOC'].str.replace("SUI", "CHE")
df_events['NOC'] = df_events['NOC'].str.replace("ISV", "VIR")
df_events['NOC'] = df_events['NOC'].str.replace("GRE", "GRC")
df_events['NOC'] = df_events['NOC'].str.replace("DEN", "DNK")
df_events['NOC'] = df_events['NOC'].str.replace("NED", "NLD")
df_events['NOC'] = df_events['NOC'].str.replace("CGO", "COG")
df_events['NOC'] = df_events['NOC'].str.replace("LAT", "LVA")
df_events['NOC'] = df_events['NOC'].str.replace("INA", "IDN")
df_events['NOC'] = df_events['NOC'].str.replace("GAM", "GMB")
df_events['NOC'] = df_events['NOC'].str.replace("GBS", "GNB")
df_events['NOC'] = df_events['NOC'].str.replace("MAS", "MYS")
df_events['NOC'] = df_events['NOC'].str.replace("NGR", "NGA")
df_events['NOC'] = df_events['NOC'].str.replace("VIN", "VCT")
df_events['NOC'] = df_events['NOC'].str.replace("BRU", "BRN")
df_events['NOC'] = df_events['NOC'].str.replace("SLO", "SVN")
df_events['NOC'] = df_events['NOC'].str.replace("MRI", "MUS")
df_events['NOC'] = df_events['NOC'].str.replace("BUL", "BGR")
df_events['NOC'] = df_events['NOC'].str.replace("PUR", "PRI")
df_events['NOC'] = df_events['NOC'].str.replace("MON", "MCO")
df_events['NOC'] = df_events['NOC'].str.replace("SKN", "KNA")


# merge by stripped country names and make sure Olympic Data NOC matches WorldBank country_code
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on='Oclean', right_on = 'Wclean')
df_events['NOC'] = np.where(df_events['country_code'].isnull() == False, df_events['country_code'], df_events['NOC'])

# drop unncessary columns
df_events = df_events.drop(columns = ['Oclean', 'country_code', 'country_name', 'Wclean'], axis = 1)

In [6]:
# Filter Olympic games database to be only after 1960
df_events = df_events[df_events["Year"] >= 1960]

In [7]:
# code for cleaning countries with numbers

def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)

    
teamlist1 = df_events["Team"]

teamlist2 = []

for team in teamlist1:
    if (hasNumbers (team)):
        splitlist = team.split("-")[0]
        teamlist2.append(splitlist)
    else:
        teamlist2.append(team)

df_events["Team"] = teamlist2

# df_events_trial["Team"].unique()

In [8]:
# df_olympians.head()

In [9]:
# df_events.head()

In [8]:
df_gdp.head()

Unnamed: 0,series_name,country_name,country_code,1960,1964,1968,1972,1976,1980,1984,...,1998,2000,2002,2004,2006,2008,2010,2012,2014,2016
0,GDP (current US$),Afghanistan,AFG,537777811.111111,800000044.444444,1373333366.66667,1595555475.55556,2555555566.66667,3641723321.99546,..,...,..,..,4055176933.36905,5226775163.30205,6971286731.72065,10109218067.7904,15856574731.4411,20001615788.6719,20484873230.2111,19362642266.6484
1,GDP (current US$),Albania,ALB,..,..,..,..,..,..,1857338011.85488,...,2545967253.2416,3480355188.60063,4348070165.19261,7184681398.5698,8896073938.31407,12881353984.6426,11926962834.9645,12319784701.3346,13228244336.3241,11861353752.0672
2,GDP (current US$),Algeria,DZA,2723648551.75208,2909351792.58659,3852115816.97758,6761786386.54713,17728347374.994,42345277342.0195,53698278905.9678,...,48187747528.899,54786074940.2073,56758113501.1671,85324767230.4939,117030941571.939,170997541140.985,161205065469.309,209062886917.045,213808808746.696,160032930353.764
3,GDP (current US$),American Samoa,ASM,..,..,..,..,..,..,..,...,..,..,514000000.0,512000000.0,496000000.0,563000000.0,576000000.0,644000000.0,642000000.0,652000000.0
4,GDP (current US$),Andorra,AND,..,..,..,113408231.944085,227281024.620741,446416105.825017,330070689.298282,...,1211932397.81713,1434429703.33518,1733116883.11688,2935659299.72684,3543256805.92147,4007353156.58415,3355695364.23841,3164615186.94591,3350736367.25488,2877311946.90265


In [11]:
# df_pop.head()

In [12]:
# df_olympians.head()

## DEBUG STARTS HERE

In [9]:
### DEBUG TO DISCUSS WITH TEAM ###

# Check for items that are not merged
checkmerge = pd.merge(df_events, df_worldbanknames, how="left", left_on='NOC', right_on = 'country_code')
unmatched = checkmerge[checkmerge['country_code'].isnull()]
# mergesuccess = checkmerge[checkmerge['country_code'].isnull() == False]

checking = unmatched
checking['Year'] = checking['Year'].astype('int')
checking = checking.sort_values("Year", ascending = False)

clearingbydate = checking

# We need to decide how to handle the following cases that are not in the World Bank Database.
# We can just keep them in the Olympics Database and note that they don't have GDP information.

# latest 2016 participants
# Taipei not recognized by WB
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('TPE') == False]
# Individual Olympics
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('IOA') == False]
# Palestine not in world bank
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('PLE') == False]
# Refugee
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('ROT') == False]
# Cook Islands
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('COK') == False]

# latest 2008 participants
# Netherlands Antilles
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('AHO') == False]
# Not sure if this is Portugal
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('POR') == False]

# Latest 2006 Participants
# Serbia and Montenegro
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('SCG') == False]

# Latest 1992 Participants
# bunch of soviet countries that don't exist anymore
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('TCH') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('EUN') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YUG') == False]

# Latest 1988 Participants
# West and East Germany, Soviet Union
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('FRG') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('GDR') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('URS') == False]
# the two yemens
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YMD') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('YAR') == False]

# latest 1964
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('CAM') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('BER') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('ZIM') == False]
clearingbydate = clearingbydate[clearingbydate['NOC'].str.contains('PHI') == False]

# clearingbydate

In [10]:
checking = df_events
checking['Year'] = checking['Year'].astype('int')
checking = checking.sort_values("Year")
# checking[checking['Team'] == "Bohemia"]
# checking[checking['NOC'] == "URS"]
# checking

In [11]:
# unmatched, but before 1960
unmatched_valuecounts = clearingbydate['NOC'].value_counts().rename_axis('unique_values').reset_index(name='counts')
# unmatched_valuecounts 

In [12]:
# For Spot Checking Data Integrity

### Need to figure out how to get the summary by event

medals = df_events.loc[df_events['Games'] == '2014 Winter'] 
medals = medals.loc[df_events['Medal'] == 'Gold'] 

# this counts the number of Gold Medals that Individuals Received 
medals_gb = medals.groupby(['NOC'])["Medal"].count().rename_axis('country_code').reset_index(name='medal_count')
# medals_gb.sort_values("medal_count", ascending = False)

In [17]:
# Disregard

# # cleaning olympians
# df_olympians1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Name"].count().reset_index(name = 'num_events')
# # df_olympians1.sort_values("num_events", ascending=False).head(20)

# df_olympians2 = df_olympians1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_olympians')
# # df_olympians2.sort_values("num_olympians", ascending=False).head()


# # cleaning medalists
# df_medalist1 = df_events.groupby(['NOC', 'Team', 'Year', 'Season', 'Games', 'ID'])["Medal"].count().reset_index(name = 'num_medals')
# df_medalist1 = df_medalist1[df_medalist1['num_medals'] > 0]
# # df_medalist1.sort_values("num_medals", ascending=False).head()

# df_medalist2 = df_medalist1.groupby(['NOC', 'Team', 'Year', 'Season', 'Games'])["ID"].count().reset_index(name = 'num_medalist')
# # df_medalist2.sort_values("num_medalist", ascending=False).head()

# # merging the two
# df_medalist3 = df_medalist2[["Team", "Games", "num_medalist"]]
# df_olympians = pd.merge(df_olympians2, df_medalist3, how="outer", on=['Team', 'Games'])
# df_olympians["perc_medalist"] = df_olympians["num_medalist"]/df_olympians["num_olympians"]

# ### Need to keep the medal types for it to be filterable...

# # # creating a json file.. doesn't work
# # df_olympians.pivot_table(["num_olympians", "num_medalist", "perc_medalist"], ["Team"], ["Games"]).reset_index()
# # # .to_json('HYLab/olympiandata.json', orient='records')

# # df_olympians.to_csv('HYLab/olympiandata.csv')

## Add Country Name corrected

In [18]:
## DEBUG STARTS HERE

In [13]:
# Add the Country name to the table, based on the gdp data base
df_events_new = pd.merge(df_events, df_gdp, how="left", left_on='NOC', right_on = 'country_code')

df_events_new.drop(columns=['series_name', 'country_code', '1960', '1964', '1968', '1972', '1976',
       '1980', '1984', '1988', '1992', '1994', '1996', '1998', '2000', '2002',
       '2004', '2006', '2008', '2010', '2012', '2014', '2016'], axis=1, inplace=True)
df_events_new = df_events_new.rename(columns={'country_name': 'Country'})
df_events_new['Country'] = np.where(df_events_new['Country'].isnull(), df_events_new['Team'], df_events_new['Country'])
df_events_new.head()

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Country
0,1,A Dijiang,M,24.0,180.0,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,,China
1,2,A Lamusi,M,23.0,170.0,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,,China
2,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NLD,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,,Netherlands
3,5,Christine Jacoba Aaftink,F,21.0,185.0,82.0,Netherlands,NLD,1988 Winter,1988,Winter,Calgary,Speed Skating,"Speed Skating Women's 1,000 metres",,Netherlands
4,5,Christine Jacoba Aaftink,F,25.0,185.0,82.0,Netherlands,NLD,1992 Winter,1992,Winter,Albertville,Speed Skating,Speed Skating Women's 500 metres,,Netherlands


## Diana's code

In [20]:
df_events_new.columns

Index(['ID', 'Name', 'Sex', 'Age', 'Height', 'Weight', 'Team', 'NOC', 'Games',
       'Year', 'Season', 'City', 'Sport', 'Event', 'Medal', 'Country'],
      dtype='object')

In [21]:
class Events(Base):
    __tablename__ = 'events'
    ID = Column(Integer, primary_key=True)
    Name = Column(String(255))
    Sex = Column(String(255))
    Age = Column(Integer)
    Height = Column(Integer)
    Weight = Column(Integer)
    Team = Column(String(255))
    NOC = Column(String(255))
    Games = Column(String(255))
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255))
    City = Column(String(255))
    Sport = Column(String(255))
    Event = Column(String(255), primary_key=True)
    Medal = Column(String(255))
    Country = Column(String(255), primary_key=True)

In [22]:
# class Olympians_Team_Final(Base):
#     __tablename__ = 'olympians_team_final'
#     Year = Column(Integer, primary_key=True)
#     Season = Column(String(255), primary_key=True)
#     Team = Column(String(255), primary_key=True)
#     NOC = Column(String(255), primary_key=True)
#     No_olympians = Column(Integer)

In [23]:
# class Medals_Team_Total(Base):
#     __tablename__ = 'medals_team_total'
#     Year = Column(Integer, primary_key=True)
#     Season = Column(String(255), primary_key=True)
#     Team = Column(String(255), primary_key=True)
#     NOC = Column(String(255), primary_key=True)
#     No_medal = Column(Integer)
#     Bronze = Column(Integer)
#     Gold = Column(Integer)
#     Silver = Column(Integer)
#     Total_Medals = Column(Integer)

In [24]:
class Events_Final(Base):
    __tablename__ = 'events_final'
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255), primary_key=True)
    City = Column(String(255), primary_key=True)
    NOC = Column(String(255), primary_key=True)
    Country = Column(String(255), primary_key=True)
    Game_Label = Column(String(255))
    GDP = Column(Float)
    Population = Column(Integer)
    No_olympians = Column(Integer)
    Bronze_athlete = Column(Integer)
    Gold_athlete = Column(Integer)
    Silver_athlete = Column(Integer)
    Total_Medals_athlete = Column(Integer)
    Bronze_team = Column(Integer)
    Gold_team = Column(Integer)
    Silver_team = Column(Integer)
    Total_Medals_team = Column(Integer)

In [14]:
# Create engine connection and export the gg_movie_final dataframe to sql\n",
disk_engine = create_engine('sqlite:///olympic_events.sqlite')

In [26]:
Base.metadata.create_all(disk_engine)

In [27]:
df_events_new.to_sql('events', disk_engine, if_exists='append', index = False)

In [15]:
session = Session(disk_engine)
conn = disk_engine.connect()

In [16]:
# To check the tables that are in our sqlite db
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
print(table)

# To check the columns in the tables
conn = sqlite3.connect("olympic_events.sqlite")
cursor = conn.execute('select * from events_final')
cursor.description

                   name
0                events
1          events_final
2  olympians_team_final


(('Year', None, None, None, None, None, None),
 ('Season', None, None, None, None, None, None),
 ('City', None, None, None, None, None, None),
 ('NOC', None, None, None, None, None, None),
 ('Country', None, None, None, None, None, None),
 ('Game_Label', None, None, None, None, None, None),
 ('GDP', None, None, None, None, None, None),
 ('Population', None, None, None, None, None, None),
 ('No_olympians', None, None, None, None, None, None),
 ('Bronze_athlete', None, None, None, None, None, None),
 ('Gold_athlete', None, None, None, None, None, None),
 ('Silver_athlete', None, None, None, None, None, None),
 ('Total_Medals_athlete', None, None, None, None, None, None),
 ('Bronze_team', None, None, None, None, None, None),
 ('Gold_team', None, None, None, None, None, None),
 ('Silver_team', None, None, None, None, None, None),
 ('Total_Medals_team', None, None, None, None, None, None))

<h2> Olympians over time

In [30]:
test1 = session.query(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).\
    group_by(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).all()

In [31]:
# Intermediate table
olympians_team_detail = pd.read_sql_query('SELECT year, season, name, team, NOC FROM events \
GROUP BY year, season, name, team, NOC',disk_engine)
olympians_team_detail

Unnamed: 0,Year,Season,Name,Team,NOC
0,1960,Summer,A. Abdul Razzak,Iraq,IRQ
1,1960,Summer,"A. W. Nancy ""Nan"" Rae",Great Britain,GBR
2,1960,Summer,Aage Birch,Chok,DNK
3,1960,Summer,"Aartje Johanna ""Atie"" Voorbij (-Dorresteijn)",Netherlands,NLD
4,1960,Summer,Abbas Khamis,United Arab Republic,UAR
...,...,...,...,...,...
147893,2016,Summer,va Csernoviczki,Hungary,HUN
147894,2016,Summer,va Risztov,Hungary,HUN
147895,2016,Summer,zge Bayrak,Turkey,TUR
147896,2016,Summer,zlem Kaya,Turkey,TUR


In [32]:
# Final table with number of athletes per Event (Year&Season)
olympians_team = olympians_team_detail.groupby(['Year','Season', 'Team', 'NOC']).count()
olympians_team.reset_index(inplace = True)
olympians_team_final = olympians_team.rename(columns = {"Name": "No_olympians"})

In [33]:
# Export to sqlite
olympians_team_final.to_sql('olympians_team_final', disk_engine, if_exists='append', index = False)

In [34]:
# Giving an error - I need to fix it
# test2 = session.query(Olympians_Team_Final.Year, Olympians_Team_Final.Season,\
#                         Olympians_Team_Final.Team, Olympians_Team_Final.NOC, Olympians_Team_Final.No_olympians).all()


<h2> Medals by country / Total Medal count over time / Participating events over time

In [35]:
# Intermediate table 1
medals_team_detail = pd.read_sql_query('SELECT year, season, sport, event, sex, medal, team, NOC FROM events \
GROUP BY year, season, sport, event, sex, medal, team, NOC',disk_engine)
medals_team_detail["Medal"].fillna("No_medal", inplace = True) 

In [36]:
# Intermediate table 2 - may be used for visualizations
medals_team = medals_team_detail.groupby(['Year','Season','Team', 'NOC', 'Medal']).count()[['Sport']]
medals_team.reset_index(inplace = True)
medals_team.rename(columns = {"Sport": "#Medals"}, inplace = True)
medals_team.sort_values(by=["#Medals"], ascending=False)
medals_team.loc[medals_team['Year'] == 1984].sort_values(by=['Year',''"#Medals"], ascending=[True, False])

Unnamed: 0,Year,Season,Team,NOC,Medal,#Medals
1813,1984,Summer,Great Britain,GBR,No_medal,178
1768,1984,Summer,Canada,CAN,No_medal,164
1961,1984,Summer,West Germany,FRG,No_medal,159
1953,1984,Summer,United States,USA,No_medal,138
1835,1984,Summer,Italy,ITA,No_medal,133
...,...,...,...,...,...,...
2027,1984,Winter,Puerto Rico,PRI,No_medal,1
2041,1984,Winter,Switzerland,CHE,Bronze,1
2049,1984,Winter,West Germany,FRG,Bronze,1
2052,1984,Winter,West Germany,FRG,Silver,1


In [37]:
# Summary table with all the participant countries. Total Medals is sum of G,S&B
# If you need a list of medals, check the previous table
medals_team_total = medals_team.pivot_table(index=['Year','Season','Team','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_team_total.reset_index(inplace = True)
medals_team_total.replace(np.nan,0, inplace = True)
medals_team_total["Total_Medals"] = medals_team_total["Bronze"] + medals_team_total["Gold"] + medals_team_total["Silver"]
medals_team_total.sort_values(by=['Year','Season', 'Total_Medals'], ascending=[True, True, False], inplace = True)
medals_team_total.loc[medals_team_total['Year'] == 1960]

Medal,Year,Season,Team,NOC,Bronze,Gold,No_medal,Silver,Total_Medals
149,1960,Summer,Soviet Union,URS,31.0,42.0,96.0,29.0,102.0
171,1960,Summer,United States,USA,15.0,33.0,111.0,21.0,69.0
60,1960,Summer,Germany,DEU,10.0,12.0,120.0,19.0,41.0
79,1960,Summer,Italy,ITA,12.0,13.0,109.0,10.0,35.0
8,1960,Summer,Australia,AUS,6.0,8.0,108.0,8.0,22.0
...,...,...,...,...,...,...,...,...,...
205,1960,Winter,New Zealand,NZL,0.0,0.0,6.0,0.0,0.0
208,1960,Winter,South Africa,ZAF,0.0,0.0,3.0,0.0,0.0
209,1960,Winter,South Korea,KOR,0.0,0.0,13.0,0.0,0.0
211,1960,Winter,Spain,ESP,0.0,0.0,6.0,0.0,0.0


In [38]:
# Export to sqlite
# medals_team_total.to_sql('medals_team_total', disk_engine, if_exists='append', index = False)

In [39]:
# Giving an error - I need to fix it
# test3 = session.query(Medals_Team_Total.Year, Medals_Team_Total.Season, Medals_Team_Total.Team, \
#                       Medals_Team_Total.NOC, Medals_Team_Total.Bronze, Medals_Team_Total.Silver, \
#                       Medals_Team_Total.Gold, Medals_Team_Total.Total_Medals).\
#                     filter(Medals_Team_Total.NOC == 'USA').all()



<h2> Diana's new code

In [17]:
# Base table
events_country = pd.read_sql_query('SELECT year, season, city, NOC, country FROM events \
GROUP BY year, season, city, NOC, country',disk_engine)
events_country['Game_Label'] = events_country['Season'] + ' ' + events_country['Year'].map(str) + ' - '  + events_country['City'] 
events_country = events_country[['Year', 'Season', 'City', 'Game_Label', 'NOC', 'Country']]

In [18]:
# Population and GDP Data
df_pop_gdp_final.head()

series_name,Year,NOC,Country,GDP,Population
0,1960,ABW,Aruba,..,54211
1,1960,AFG,Afghanistan,537777811.111111,8996973
2,1960,AGO,Angola,..,5454933
3,1960,ALB,Albania,..,1608800
4,1960,AND,Andorra,..,13411


In [19]:
# Number of Athletes
# Step 1 Intermediate table
olympians_team_detail = pd.read_sql_query('SELECT year, season, name, NOC FROM events \
GROUP BY year, season, name, NOC',disk_engine)
olympians_team = olympians_team_detail.groupby(['Year','Season', 'NOC']).count()
olympians_team.reset_index(inplace = True)
olympians_team_final = olympians_team.rename(columns = {"Name": "No_olympians"})
olympians_team_final

Unnamed: 0,Year,Season,NOC,No_olympians
0,1960,Summer,AFG,12
1,1960,Summer,AHO,5
2,1960,Summer,ARG,92
3,1960,Summer,AUS,189
4,1960,Summer,AUT,103
...,...,...,...,...
3121,2016,Summer,XKX,8
3122,2016,Summer,YEM,3
3123,2016,Summer,ZAF,135
3124,2016,Summer,ZMB,7


In [20]:
# Number of Medals per athlete
# Step 1
medals_athlete_detail = pd.read_sql_query('SELECT year, season, NOC, sport, event, name, sex, medal FROM events \
GROUP BY year, season, NOC, sport, event, name, sex, medal',disk_engine)
# Step 2
medals_athlete = medals_athlete_detail.groupby(['Year','Season','NOC', 'Medal']).count()[['Name']]
medals_athlete.reset_index(inplace = True)
medals_athlete.rename(columns = {"Name": "#Medals"}, inplace = True)
medals_athlete.sort_values(by=["#Medals"], ascending=False)
# Step 3
medals_athlete_total = medals_athlete.pivot_table(index=['Year','Season','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_athlete_total.reset_index(inplace = True)
medals_athlete_total.replace(np.nan,0, inplace = True)
medals_athlete_total["Total_Medals"] = medals_athlete_total["Bronze"] + medals_athlete_total["Gold"] + medals_athlete_total["Silver"]
medals_athlete_total

Medal,Year,Season,NOC,Bronze,Gold,Silver,Total_Medals
0,1960,Summer,ARG,1.0,0.0,3.0,4.0
1,1960,Summer,AUS,11.0,11.0,24.0,46.0
2,1960,Summer,AUT,0.0,1.0,2.0,3.0
3,1960,Summer,BEL,2.0,0.0,2.0,4.0
4,1960,Summer,BGR,3.0,1.0,3.0,7.0
...,...,...,...,...,...,...,...
1199,2016,Summer,UZB,7.0,4.0,2.0,13.0
1200,2016,Summer,VEN,2.0,0.0,1.0,3.0
1201,2016,Summer,VNM,0.0,1.0,1.0,2.0
1202,2016,Summer,XKX,0.0,1.0,0.0,1.0


In [21]:
# Number of Medals per sport
# Step 1
medals_sport_detail = pd.read_sql_query('SELECT year, season, NOC, sport, event, sex, medal FROM events \
GROUP BY year, season, NOC, sport, event, sex, medal',disk_engine)
# Step 2
medals_sport = medals_sport_detail.groupby(['Year','Season','NOC', 'Medal']).count()[['Event']]
medals_sport.reset_index(inplace = True)
medals_sport.rename(columns = {"Event": "#Medals"}, inplace = True)
medals_sport.sort_values(by=["#Medals"], ascending=False)
# Step 3
medals_sport_total = medals_sport.pivot_table(index=['Year','Season','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_sport_total.reset_index(inplace = True)
medals_sport_total.replace(np.nan,0, inplace = True)
medals_sport_total["Total_Medals"] = medals_sport_total["Bronze"] + medals_sport_total["Gold"] + medals_sport_total["Silver"]
medals_sport_total

Medal,Year,Season,NOC,Bronze,Gold,Silver,Total_Medals
0,1960,Summer,ARG,1.0,0.0,1.0,2.0
1,1960,Summer,AUS,6.0,8.0,8.0,22.0
2,1960,Summer,AUT,0.0,1.0,1.0,2.0
3,1960,Summer,BEL,2.0,0.0,2.0,4.0
4,1960,Summer,BGR,3.0,1.0,3.0,7.0
...,...,...,...,...,...,...,...
1199,2016,Summer,UZB,7.0,4.0,2.0,13.0
1200,2016,Summer,VEN,2.0,0.0,1.0,3.0
1201,2016,Summer,VNM,0.0,1.0,1.0,2.0
1202,2016,Summer,XKX,0.0,1.0,0.0,1.0


In [22]:
# Merge 1: Base table & Pop and GDP table
events_merge_1 = pd.merge(events_country, df_pop_gdp_final, how = "left", left_on = ['Year','NOC'], right_on = ['Year','NOC'], suffixes=('', '_y'))
events_merge_1 = events_merge_1.drop(columns = ['Country_y'])

In [23]:
events_merge_2 = pd.merge(events_merge_1, olympians_team_final, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'])
events_merge_3 = pd.merge(events_merge_2, medals_athlete_total, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'])
events_merge_4 = pd.merge(events_merge_3, medals_sport_total, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'], suffixes=('_athlete', '_team')) 

In [24]:
wld_events_1 = events_merge_4.groupby(['Year','Season','City','Game_Label']).sum()
wld_events_2 = pd.DataFrame(wld_events_1).reset_index()
wld_events_2['NOC'] = 'WLD'
wld_events_2['Country'] = 'World'

# add the WLD Pop and GDP information
wld_base_info = df_pop_gdp_final[df_pop_gdp_final['NOC'] == 'WLD']
wld_all_info = pd.merge(wld_events_2, wld_base_info, how = "left", left_on = ['NOC','Year','Country'], right_on = ['NOC','Year','Country'])

In [25]:
events_final = pd.concat([events_merge_4, wld_all_info], sort=False)
events_final

Unnamed: 0,Year,Season,City,Game_Label,NOC,Country,GDP,Population,No_olympians,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team
0,1960,Summer,Roma,Summer 1960 - Roma,AFG,Afghanistan,537777811.111111,8996973,12,,,,,,,,
1,1960,Summer,Roma,Summer 1960 - Roma,AHO,Netherlands Antilles,,,5,,,,,,,,
2,1960,Summer,Roma,Summer 1960 - Roma,ARG,Argentina,..,20481779,92,1.0,0.0,3.0,4.0,1.0,0.0,1.0,2.0
3,1960,Summer,Roma,Summer 1960 - Roma,AUS,Australia,18577668271.9229,10276477,189,11.0,11.0,24.0,46.0,6.0,8.0,8.0,22.0
4,1960,Summer,Roma,Summer 1960 - Roma,AUT,Austria,6592693841.18495,7047539,103,0.0,1.0,2.0,3.0,0.0,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25,2008,Summer,Beijing,Summer 2008 - Beijing,WLD,World,63611588400560.2,6757887172,10889,710.0,671.0,667.0,2048.0,357.0,305.0,306.0,968.0
26,2010,Winter,Vancouver,Winter 2010 - Vancouver,WLD,World,66051180948591.1,6922947261,2535,171.0,174.0,175.0,520.0,87.0,88.0,89.0,264.0
27,2012,Summer,London,Summer 2012 - London,WLD,World,75085087909756.3,7086993625,10502,679.0,632.0,630.0,1941.0,360.0,306.0,307.0,973.0
28,2014,Winter,Sochi,Winter 2014 - Sochi,WLD,World,79332625229254.6,7255653881,2744,198.0,202.0,197.0,597.0,104.0,104.0,102.0,310.0


In [50]:
# Side Checks 1
a = events_final.groupby(['Year','Season','City','NOC']).count()
a = a.reset_index()
a[a['Country'] > 1]

Unnamed: 0,Year,Season,City,NOC,Game_Label,Country,GDP,Population,No_olympians,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team
6,1960,Summer,Roma,BER,3,3,0,0,3,0,0,0,0,0,0,0,0
64,1960,Summer,Roma,POR,4,4,0,0,4,4,4,4,4,4,4,4,4
67,1960,Summer,Roma,RHO,2,2,0,0,2,0,0,0,0,0,0,0,0
81,1960,Summer,Roma,URS,5,5,0,0,5,5,5,5,5,5,5,5,5
87,1960,Summer,Roma,WIF,2,2,0,0,2,2,2,2,2,2,2,2,2
89,1960,Summer,Roma,YUG,2,2,0,0,2,2,2,2,2,2,2,2,2
197,1964,Summer,Tokyo,POR,2,2,0,0,2,0,0,0,0,0,0,0,0
212,1964,Summer,Tokyo,URS,5,5,0,0,5,5,5,5,5,5,5,5,5
2512,2008,Summer,Beijing,POR,3,3,0,0,3,0,0,0,0,0,0,0,0


In [51]:
# Side Checks 1 - detail
events_final[(events_final['Year'] == 1960) & (events_final['Season'] == 'Summer') & (events_final['NOC'] == 'URS')]

Unnamed: 0,Year,Season,City,Game_Label,NOC,Country,GDP,Population,No_olympians,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team
87,1960,Summer,Roma,Summer 1960 - Roma,URS,Nokaut II,,,283,45.0,61.0,63.0,169.0,31.0,43.0,29.0,103.0
88,1960,Summer,Roma,Summer 1960 - Roma,URS,Persey,,,283,45.0,61.0,63.0,169.0,31.0,43.0,29.0,103.0
89,1960,Summer,Roma,Summer 1960 - Roma,URS,Soviet Union,,,283,45.0,61.0,63.0,169.0,31.0,43.0,29.0,103.0
90,1960,Summer,Roma,Summer 1960 - Roma,URS,Tornado,,,283,45.0,61.0,63.0,169.0,31.0,43.0,29.0,103.0
91,1960,Summer,Roma,Summer 1960 - Roma,URS,Viktoriya,,,283,45.0,61.0,63.0,169.0,31.0,43.0,29.0,103.0


In [52]:
events_final.to_sql('events_final', disk_engine, if_exists='append', index = False)

In [53]:
events_final.columns

Index(['Year', 'Season', 'City', 'Game_Label', 'NOC', 'Country', 'GDP',
       'Population', 'No_olympians', 'Bronze_athlete', 'Gold_athlete',
       'Silver_athlete', 'Total_Medals_athlete', 'Bronze_team', 'Gold_team',
       'Silver_team', 'Total_Medals_team'],
      dtype='object')

In [26]:
events_final[(events_final['Year'] == 1980) & (events_final['Season'] == 'Summer') & (events_final['NOC'] == 'BRA')]

Unnamed: 0,Year,Season,City,Game_Label,NOC,Country,GDP,Population,No_olympians,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team
714,1980,Summer,Moskva,Summer 1980 - Moskva,BRA,Brazil,235024598983.261,120694009,106,5.0,4.0,0.0,9.0,2.0,2.0,0.0,4.0


In [31]:
test = df_events_new[(df_events_new['Year'] == 1980) & (df_events_new['Season'] == 'Summer') & (df_events_new['NOC'] == 'BRA')]
test

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Country
1381,947,Adilson de Freitas Nascimento,M,28.0,195.0,88.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Basketball,Basketball Men's Basketball,,Brazil
3608,2465,Eliana Maria Nagib Aleixo,F,26.0,172.0,63.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Volleyball,Volleyball Women's Volleyball,,Brazil
4743,3202,Gilson Alvaristo,M,24.0,170.0,66.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Cycling,"Cycling Men's Road Race, Individual",,Brazil
4938,3330,Amauri Ribeiro,M,21.0,198.0,87.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Volleyball,Volleyball Men's Volleyball,,Brazil
5892,4056,Andr Ernesto Stoffel,M,20.0,202.0,90.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Basketball,Basketball Men's Basketball,,Brazil
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194627,126553,Regina dos Santos Vilela,F,22.0,171.0,64.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Volleyball,Volleyball Women's Volleyball,,Brazil
197243,128392,Deraldo Peixoto Wanderley,M,24.0,191.0,80.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Volleyball,Volleyball Men's Volleyball,,Brazil
199076,129573,Alexander Welter,M,27.0,186.0,75.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Sailing,Sailing Mixed Multihull,Gold,Brazil
200437,130540,William Carvalho da Silva,M,25.0,185.0,80.0,Brazil,BRA,1980 Summer,1980,Summer,Moskva,Volleyball,Volleyball Men's Volleyball,,Brazil


In [35]:
test.groupby(['Event','Name','Sport','Medal','Sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,ID,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Country
Event,Name,Sport,Medal,Sex,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Athletics Men's Triple Jump,Joo Carlos de Oliveira,Athletics,Bronze,M,1,1,1,1,1,1,1,1,1,1,1
Sailing Mixed Multihull,Alexander Welter,Sailing,Gold,M,1,1,1,1,1,1,1,1,1,1,1
Sailing Mixed Multihull,Lars Sigurd Bjorkstrom,Sailing,Gold,M,1,1,1,1,1,1,1,1,1,1,1
Sailing Mixed Two Person Dinghy,Eduardo Henrique Gomes Penido,Sailing,Gold,M,1,1,1,1,1,1,1,1,1,1,1
Sailing Mixed Two Person Dinghy,Marcos Pinto Rizzo Soares,Sailing,Gold,M,1,1,1,1,1,1,1,1,1,1,1
Swimming Men's 4 x 200 metres Freestyle Relay,Cyro Marques Delgado,Swimming,Bronze,M,1,1,1,1,1,1,1,1,1,1,1
Swimming Men's 4 x 200 metres Freestyle Relay,Djan Garrido Madruga,Swimming,Bronze,M,1,1,1,1,1,1,1,1,1,1,1
Swimming Men's 4 x 200 metres Freestyle Relay,Jorge Luiz Fernandes Leite,Swimming,Bronze,M,1,1,1,1,1,1,1,1,1,1,1
Swimming Men's 4 x 200 metres Freestyle Relay,Marcus Laborne Mattioli,Swimming,Bronze,M,1,1,1,1,1,1,1,1,1,1,1
