In [152]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, Column, Integer, String, Float
from sqlalchemy.ext.declarative import declarative_base
import sqlite3
Base = declarative_base()

In [153]:
# Load CSV Files

# sources: https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#
# Data starts in 1896
events_file = 'Resources/athlete_events.csv'
df_events_orig = pd.read_csv(events_file)

# source: World Bank (https://databank.worldbank.org/reports.aspx?source=2&series=NY.GDP.MKTP.CD&country=#) 
# Data starts in 1960
pop_gdp_file = 'Resources/population_gdp.csv'
df_pop_gdp_orig = pd.read_csv(pop_gdp_file)

<h3> Clean Worldbank data

In [154]:
# step 1 - rename columns
df_pop_gdp = df_pop_gdp_orig
df_pop_gdp = df_pop_gdp.drop(columns = ['Series Code'], axis =1)
df_pop_gdp = df_pop_gdp.rename(
    columns={
        "Series Name": "Series", 
        "Country Name": "Country",
        "Country Code": "NOC"})

df_pop_gdp.columns = df_pop_gdp.columns.str.split(' ').str[0].tolist()

keep_columns = ['Series', 'Country', 'NOC']
event_years = df_events_orig.Year.unique().astype(str)

for i in event_years:
    keep_columns.append(i)

df_pop_gdp = df_pop_gdp[df_pop_gdp.columns.intersection(keep_columns)]

In [155]:
# step 2 - filter by GDP and Population series_names
df_pop_gdp_1 = df_pop_gdp.loc[(df_pop_gdp["Series"] == "GDP (current US$)") | (df_pop_gdp["Series"] == "Population, total")]
df_pop_gdp_2 = df_pop_gdp_1.melt(id_vars=['Series', 'Country', 'NOC'])
df_pop_gdp_2 = df_pop_gdp_2.rename(columns={'variable':'Year'})

In [156]:
# step 3 - Reformat the table
df_pop_gdp_final = df_pop_gdp_2.pivot_table(index=['Year','NOC','Country'], values = 'value', columns='Series', aggfunc='sum')
df_pop_gdp_final = df_pop_gdp_final.reset_index()
df_pop_gdp_final = df_pop_gdp_final.rename(columns = {'GDP (current US$)': 'GDP', 'Population, total': 'Population'})
df_pop_gdp_final['Population'] = np.where(df_pop_gdp_final['Population'] == '..', 0, df_pop_gdp_final['Population'])
df_pop_gdp_final['GDP'] = np.where(df_pop_gdp_final['GDP'] == '..', 0, df_pop_gdp_final['GDP'])
df_pop_gdp_final = df_pop_gdp_final.astype({'GDP': 'float64', 'Year': 'int64', 'Population': 'int64'})
df_pop_gdp_final['GDP_per_capita'] = df_pop_gdp_final['GDP'] / df_pop_gdp_final['Population']

<h3> Clean Olympic Events

In [170]:
# step 1 - define df_events dataframe
df_events = df_events_orig

In [171]:
# step 2 - clean NOC (in df_events) based on worldbank NOC to match each other

# extract WorldBank's NOC (Country code) and Country (name)
df_worldbanknames = pd.DataFrame(df_pop_gdp_final.groupby(['NOC','Country']).size().reset_index())
df_worldbanknames = df_worldbanknames.drop(columns=[0])

# create a column with stripped country_name and sanitize
df_worldbanknames['Wclean'] = df_worldbanknames ['Country'].str.replace(" ", "").str.lower()

# create a column with stripped Team name in Olympic Data and sanitize
df_events['Oclean'] = df_events ['Team'].str.replace(" ", "").str.lower()

# Unique transformations of countries that are in both data bases, but didn't match up.
df_events['NOC'] = df_events['NOC'].str.replace("IRI", "IRN")
df_events['NOC'] = df_events['NOC'].str.replace("GER", "DEU")
df_events['NOC'] = df_events['NOC'].str.replace("BAH", "BHS")
df_events['NOC'] = df_events['NOC'].str.replace("SUI", "CHE")
df_events['NOC'] = df_events['NOC'].str.replace("ISV", "VIR")
df_events['NOC'] = df_events['NOC'].str.replace("GRE", "GRC")
df_events['NOC'] = df_events['NOC'].str.replace("DEN", "DNK")
df_events['NOC'] = df_events['NOC'].str.replace("NED", "NLD")
df_events['NOC'] = df_events['NOC'].str.replace("CGO", "COG")
df_events['NOC'] = df_events['NOC'].str.replace("LAT", "LVA")
df_events['NOC'] = df_events['NOC'].str.replace("INA", "IDN")
df_events['NOC'] = df_events['NOC'].str.replace("GAM", "GMB")
df_events['NOC'] = df_events['NOC'].str.replace("GBS", "GNB")
df_events['NOC'] = df_events['NOC'].str.replace("MAS", "MYS")
df_events['NOC'] = df_events['NOC'].str.replace("NGR", "NGA")
df_events['NOC'] = df_events['NOC'].str.replace("VIN", "VCT")
df_events['NOC'] = df_events['NOC'].str.replace("BRU", "BRN")
df_events['NOC'] = df_events['NOC'].str.replace("SLO", "SVN")
df_events['NOC'] = df_events['NOC'].str.replace("MRI", "MUS")
df_events['NOC'] = df_events['NOC'].str.replace("BUL", "BGR")
df_events['NOC'] = df_events['NOC'].str.replace("PUR", "PRI")
df_events['NOC'] = df_events['NOC'].str.replace("MON", "MCO")
df_events['NOC'] = df_events['NOC'].str.replace("SKN", "KNA")
df_events['NOC'] = df_events['NOC'].str.replace("POR", "PRT")

# merge by stripped country names and make sure Olympic Data NOC matches WorldBank country_code
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on='Oclean', right_on = 'Wclean', suffixes=('', '_y'))
df_events['NOC'] = np.where(df_events['NOC_y'].isnull(), df_events['NOC'], df_events['NOC_y'])

# drop unncessary columns from the merge
df_events = df_events.drop(columns = ['Oclean', 'NOC_y', 'Wclean'], axis = 1)

In [172]:
# DELETE THIS LINE

df_events[df_events['Team'] == 'Spirit VI']

Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal,Country
122593,61969,"Claude Lazard Kohler, II",M,28.0,170.0,77.0,Spirit VI,USA,1960 Summer,1960,Summer,Roma,Sailing,Sailing Mixed Three Person Keelboat,,
154040,77339,"Allen Wilford McClure, Jr.",M,25.0,193.0,83.0,Spirit VI,USA,1960 Summer,1960,Summer,Roma,Sailing,Sailing Mixed Three Person Keelboat,,
255805,128068,"Eugene Henry ""Gene"" Walet, III",M,24.0,180.0,84.0,Spirit VI,USA,1960 Summer,1960,Summer,Roma,Sailing,Sailing Mixed Three Person Keelboat,,


In [173]:
#  step 3 - clean Country names based on the NOC (for small teams with Country NOCs)
df_events = pd.merge(df_events, df_worldbanknames, how="left", left_on='NOC', right_on = 'NOC', suffixes=('', '_y'))
df_events['Country'] = np.where(df_events['Country'].isnull(), df_events['Country_y'], df_events['Country'])
# # drop unncessary columns from the merge
df_events = df_events.drop(columns = ['Country_y', 'Wclean'], axis = 1)

In [174]:
# step 4 - clean team names with numbers
def hasNumbers(inputString):
     return any(char.isdigit() for char in inputString)

teamlist1 = df_events["Team"]
teamlist2 = []

for team in teamlist1:
    if (hasNumbers (team)):
        splitlist = team.split("-")[0]
        teamlist2.append(splitlist)
    else:
        teamlist2.append(team)

df_events["Team"] = teamlist2

In [175]:
# step 5 - use Team name when Country name is NaN
df_events['Country'] = np.where(df_events['Country'].isnull(), df_events['Team'], df_events['Country'])

In [176]:
# step 6 - filter Olympic games database to be only after 1960
df_events = df_events[df_events["Year"] >= 1960]

<h3> Define Classes for the needed sqlite tables

In [163]:
class Events(Base):
    __tablename__ = 'events'
    ID = Column(Integer, primary_key=True)
    Name = Column(String(255))
    Sex = Column(String(255))
    Age = Column(Integer)
    Height = Column(Integer)
    Weight = Column(Integer)
    Team = Column(String(255))
    NOC = Column(String(255))
    Games = Column(String(255))
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255))
    City = Column(String(255))
    Sport = Column(String(255))
    Event = Column(String(255), primary_key=True)
    Medal = Column(String(255))
    Country = Column(String(255), primary_key=True)

In [164]:
class Events_Final(Base):
    __tablename__ = 'events_final'
    Year = Column(Integer, primary_key=True)
    Season = Column(String(255), primary_key=True)
    City = Column(String(255), primary_key=True)
    NOC = Column(String(255), primary_key=True)
    Country = Column(String(255), primary_key=True)
    Game_Label = Column(String(255))
    Chart_Label = Column(String(255))
    GDP = Column(Float)
    Population = Column(Integer)
    GDP_per_capita = Column(Float)
    No_olympians = Column(Integer)
    Bronze_athlete = Column(Integer)
    Gold_athlete = Column(Integer)
    Silver_athlete = Column(Integer)
    Total_Medals_athlete = Column(Integer)
    Bronze_team = Column(Integer)
    Gold_team = Column(Integer)
    Silver_team = Column(Integer)
    Total_Medals_team = Column(Integer)

In [215]:
# class Countries_per_Event(Base):
#     __tablename__ = 'countries_per_event'
#     Year = Column(Integer, primary_key=True)
#     Season = Column(String(255), primary_key=True)
#     City = Column(String(255), primary_key=True)
#     Game_Label = Column(String(255))
#     Chart_Label = Column(String(255))
#     No_countries = Column(Integer)

<h3> Create SQLite database and connect to it

In [216]:
# Step 1 - Create engine connection, classes, session and connection
disk_engine = create_engine('sqlite:///olympic_events.sqlite')
Base.metadata.create_all(disk_engine)
session = Session(disk_engine)
conn = disk_engine.connect()

In [177]:
# Step 2 - Export the df_events dataframe to sqlite
df_events.to_sql('events', disk_engine, if_exists='append', index = False)

In [95]:
# DELETE THIS PART

# check the tables and columns in specific table that are in our sqlite db
table = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table'", conn)
print(table)

# To check the columns in the tables
conn = sqlite3.connect("olympic_events.sqlite")
cursor = conn.execute('select * from events_final')
cursor.description

           name
0        events
1  events_final


(('Year', None, None, None, None, None, None),
 ('Season', None, None, None, None, None, None),
 ('City', None, None, None, None, None, None),
 ('NOC', None, None, None, None, None, None),
 ('Country', None, None, None, None, None, None),
 ('Game_Label', None, None, None, None, None, None),
 ('Chart_Label', None, None, None, None, None, None),
 ('GDP', None, None, None, None, None, None),
 ('Population', None, None, None, None, None, None),
 ('GDP_per_capita', None, None, None, None, None, None),
 ('No_olympians', None, None, None, None, None, None),
 ('Bronze_athlete', None, None, None, None, None, None),
 ('Gold_athlete', None, None, None, None, None, None),
 ('Silver_athlete', None, None, None, None, None, None),
 ('Total_Medals_athlete', None, None, None, None, None, None),
 ('Bronze_team', None, None, None, None, None, None),
 ('Gold_team', None, None, None, None, None, None),
 ('Silver_team', None, None, None, None, None, None),
 ('Total_Medals_team', None, None, None, None, None

<h2> Prepare code for Events Final Table

In [178]:
# Base table
events_country = pd.read_sql_query('SELECT year, season, city, NOC, country FROM events \
GROUP BY year, season, city, NOC, country',disk_engine)
events_country['Game_Label'] = events_country['Season'] + ' ' + events_country['Year'].map(str) + ' - '  + events_country['City'] 
events_country['Chart_Label'] = events_country['Year'].map(str) + ' - '  + events_country['City'] 
events_country = events_country[['Year', 'Season', 'City', 'Game_Label', 'NOC', 'Chart_Label', 'Country']]

In [179]:
# Number of Athletes
# Step 1 Intermediate table
olympians_team_detail = pd.read_sql_query('SELECT year, season, name, NOC FROM events \
GROUP BY year, season, name, NOC',disk_engine)
olympians_team = olympians_team_detail.groupby(['Year','Season', 'NOC']).count()
olympians_team.reset_index(inplace = True)
olympians_team = olympians_team.rename(columns = {"Name": "No_olympians"})

In [180]:
# Number of Medals per athlete
# Step 1
medals_athlete_detail = pd.read_sql_query('SELECT year, season, NOC, sport, event, name, sex, medal FROM events \
GROUP BY year, season, NOC, sport, event, name, sex, medal',disk_engine)
# Step 2
medals_athlete = medals_athlete_detail.groupby(['Year','Season','NOC', 'Medal']).count()[['Name']]
medals_athlete.reset_index(inplace = True)
medals_athlete.rename(columns = {"Name": "#Medals"}, inplace = True)
medals_athlete.sort_values(by=["#Medals"], ascending=False)
# Step 3
medals_athlete_total = medals_athlete.pivot_table(index=['Year','Season','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_athlete_total.reset_index(inplace = True)
medals_athlete_total.replace(np.nan,0, inplace = True)
medals_athlete_total["Total_Medals"] = medals_athlete_total["Bronze"] + medals_athlete_total["Gold"] + medals_athlete_total["Silver"]
medals_athlete_total

Medal,Year,Season,NOC,Bronze,Gold,Silver,Total_Medals
0,1960,Summer,ARG,1.0,0.0,3.0,4.0
1,1960,Summer,AUS,11.0,11.0,24.0,46.0
2,1960,Summer,AUT,0.0,1.0,2.0,3.0
3,1960,Summer,BEL,2.0,0.0,2.0,4.0
4,1960,Summer,BGR,3.0,1.0,3.0,7.0
...,...,...,...,...,...,...,...
1199,2016,Summer,UZB,7.0,4.0,2.0,13.0
1200,2016,Summer,VEN,2.0,0.0,1.0,3.0
1201,2016,Summer,VNM,0.0,1.0,1.0,2.0
1202,2016,Summer,XKX,0.0,1.0,0.0,1.0


In [181]:
# Number of Medals per sport
# Step 1
medals_sport_detail = pd.read_sql_query('SELECT year, season, NOC, sport, event, sex, medal FROM events \
GROUP BY year, season, NOC, sport, event, sex, medal',disk_engine)
# Step 2
medals_sport = medals_sport_detail.groupby(['Year','Season','NOC', 'Medal']).count()[['Event']]
medals_sport.reset_index(inplace = True)
medals_sport.rename(columns = {"Event": "#Medals"}, inplace = True)
medals_sport.sort_values(by=["#Medals"], ascending=False)
# Step 3
medals_sport_total = medals_sport.pivot_table(index=['Year','Season','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
medals_sport_total.reset_index(inplace = True)
medals_sport_total.replace(np.nan,0, inplace = True)
medals_sport_total["Total_Medals"] = medals_sport_total["Bronze"] + medals_sport_total["Gold"] + medals_sport_total["Silver"]
medals_sport_total

Medal,Year,Season,NOC,Bronze,Gold,Silver,Total_Medals
0,1960,Summer,ARG,1.0,0.0,1.0,2.0
1,1960,Summer,AUS,6.0,8.0,8.0,22.0
2,1960,Summer,AUT,0.0,1.0,1.0,2.0
3,1960,Summer,BEL,2.0,0.0,2.0,4.0
4,1960,Summer,BGR,3.0,1.0,3.0,7.0
...,...,...,...,...,...,...,...
1199,2016,Summer,UZB,7.0,4.0,2.0,13.0
1200,2016,Summer,VEN,2.0,0.0,1.0,3.0
1201,2016,Summer,VNM,0.0,1.0,1.0,2.0
1202,2016,Summer,XKX,0.0,1.0,0.0,1.0


In [182]:
# Final data gathering to reach Events_final

# step 1 - merging tables
events_merge_1 = pd.merge(events_country, df_pop_gdp_final, how = "left", left_on = ['Year','NOC'], right_on = ['Year','NOC'], suffixes=('', '_y'))
events_merge_1 = events_merge_1.drop(columns = ['Country_y'])
events_merge_2 = pd.merge(events_merge_1, olympians_team, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'])
events_merge_3 = pd.merge(events_merge_2, medals_athlete_total, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'])
events_merge_4 = pd.merge(events_merge_3, medals_sport_total, how = "left", left_on = ['Year','Season', 'NOC'], right_on = ['Year','Season', 'NOC'], suffixes=('_athlete', '_team')) 

In [183]:
# step 2 - prepare aggregated value for all countries - World (WLD)
wld_events_1 = events_merge_4.groupby(['Year','Season','City','Game_Label', 'Chart_Label']).sum()
wld_events_2 = pd.DataFrame(wld_events_1).reset_index()
wld_events_2['NOC'] = 'WLD'
wld_events_2['Country'] = 'World'

# add the WLD Pop and GDP information
wld_base_info = df_pop_gdp_final[df_pop_gdp_final['NOC'] == 'WLD']
wld_all_info = pd.merge(wld_events_2, wld_base_info, how = "left", left_on = ['NOC','Year','Country'], right_on = ['NOC','Year','Country'], suffixes=('', '_y'))
wld_all_info = wld_all_info.drop(columns = ['GDP_y', 'Population_y', 'GDP_per_capita_y'], axis = 1)

In [184]:
# step 3 - add aggregated value for all countries - World (WLD) in Events_Final
events_final = pd.concat([events_merge_4, wld_all_info], sort=False)


In [205]:
# step 4 - load the events_final in SQLite
events_final.to_sql('events_final', disk_engine, if_exists='append', index = False)

<h2> Which countries to drop

In [185]:
# drop rows with columns not maching world bank and without medals

check = pd.merge(events_final, <h2> Prepare code for Events Final Table, how = "left", left_on = ['NOC'], right_on = ['NOC'], suffixes=('', '_y'))
check

Unnamed: 0,Year,Season,City,Game_Label,NOC,Chart_Label,Country,GDP,Population,GDP_per_capita,...,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team,Country_y,Wclean
0,1960,Summer,Roma,Summer 1960 - Roma,AFG,1960 - Roma,Afghanistan,5.377778e+08,8.996973e+06,5.977319e+01,...,,,,,,,,,Afghanistan,afghanistan
1,1960,Summer,Roma,Summer 1960 - Roma,AHO,1960 - Roma,Netherlands Antilles,,,,...,,,,,,,,,,
2,1960,Summer,Roma,Summer 1960 - Roma,ARG,1960 - Roma,Argentina,0.000000e+00,2.048178e+07,0.000000e+00,...,1.0,0.0,3.0,4.0,1.0,0.0,1.0,2.0,Argentina,argentina
3,1960,Summer,Roma,Summer 1960 - Roma,AUS,1960 - Roma,Australia,1.857767e+10,1.027648e+07,1.807786e+03,...,11.0,11.0,24.0,46.0,6.0,8.0,8.0,22.0,Australia,australia
4,1960,Summer,Roma,Summer 1960 - Roma,AUT,1960 - Roma,Austria,6.592694e+09,7.047539e+06,9.354604e+02,...,0.0,1.0,2.0,3.0,0.0,1.0,1.0,2.0,Austria,austria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3160,2008,Summer,Beijing,Summer 2008 - Beijing,WLD,2008 - Beijing,World,6.301908e+13,6.718735e+09,3.190289e+06,...,710.0,671.0,667.0,2048.0,357.0,305.0,306.0,968.0,World,world
3161,2010,Winter,Vancouver,Winter 2010 - Vancouver,WLD,2010 - Vancouver,World,5.957594e+13,5.016744e+09,2.024564e+06,...,171.0,174.0,175.0,520.0,87.0,88.0,89.0,264.0,World,world
3162,2012,Summer,London,Summer 2012 - London,WLD,2012 - London,World,7.438164e+13,7.042794e+09,3.235125e+06,...,679.0,632.0,630.0,1941.0,360.0,306.0,307.0,973.0,World,world
3163,2014,Winter,Sochi,Winter 2014 - Sochi,WLD,2014 - Sochi,World,7.089451e+13,5.118560e+09,2.381370e+06,...,198.0,202.0,197.0,597.0,104.0,104.0,102.0,310.0,World,world


In [204]:
# Which countries to drop from here?
check_step2 = check[check['Country_y'].isnull()]
check_step3 = check_step2.groupby(['NOC','Country']).sum().reset_index()
check_step3 = check_step3.drop(columns = ['Year', 'GDP', 'Population', 'GDP_per_capita',
       'No_olympians', 'Bronze_athlete', 'Gold_athlete', 'Silver_athlete',
       'Total_Medals_athlete', 'Bronze_team', 'Gold_team', 'Silver_team'], axis =1)
check_step3[check_step3['Total_Medals_team'] > 0]


Unnamed: 0,NOC,Country,Total_Medals_team
0,AHO,Netherlands Antilles,1.0
7,EUN,Unified Team,139.0
8,FRG,West Germany,249.0
9,GDR,East Germany,523.0
10,IOA,Individual Olympic Athletes,5.0
21,SCG,Serbia and Montenegro,9.0
22,TCH,Czechoslovakia,103.0
23,TPE,Chinese Taipei,24.0
24,UAR,United Arab Republic,2.0
25,URS,Almaz,93.0


In [None]:
# Side Checks 1
a = events_final.groupby(['Year','Season','City','NOC']).count()
a = a.reset_index()
a[a['Country'] > 1]

In [None]:
# Side Checks 1 - detail
events_final[(events_final['Year'] == 1960) & (events_final['Season'] == 'Summer') & (events_final['NOC'] == 'URS')]

In [None]:
events_final.columns

<h2> Countries per event

In [214]:
# Base table
# countries_per_event = pd.read_sql_query('SELECT year, season, city, game_Label, chart_Label, count(NOC) FROM events_final \
# GROUP BY year, season, city, game_Label, chart_Label',disk_engine)
# countries_per_event = countries_per_event.rename( columns = {'count(NOC)' : 'No_countries'})
# countries_per_event

Unnamed: 0,Year,Season,City,Game_Label,Chart_Label,No_countries
0,1960,Summer,Roma,Summer 1960 - Roma,1960 - Roma,100
1,1960,Winter,Squaw Valley,Winter 1960 - Squaw Valley,1960 - Squaw Valley,31
2,1964,Summer,Tokyo,Summer 1964 - Tokyo,1964 - Tokyo,102
3,1964,Winter,Innsbruck,Winter 1964 - Innsbruck,1964 - Innsbruck,37
4,1968,Summer,Mexico City,Summer 1968 - Mexico City,1968 - Mexico City,113
...,...,...,...,...,...,...
25,2008,Summer,Beijing,Summer 2008 - Beijing,2008 - Beijing,205
26,2010,Winter,Vancouver,Winter 2010 - Vancouver,2010 - Vancouver,83
27,2012,Summer,London,Summer 2012 - London,2012 - London,206
28,2014,Winter,Sochi,Winter 2014 - Sochi,2014 - Sochi,90


In [217]:
# Load the events_final in SQLite
# countries_per_event.to_sql('countries_per_event', disk_engine, if_exists='append', index = False)

In [212]:
countries_per_event.columns

Index(['Year', 'Season', 'City', 'Game_Label', 'Chart_Label', 'count(NOC)'], dtype='object')

In [210]:

events_final[(events_final['Chart_Label'] == '2016 - Rio de Janeiro')]

Unnamed: 0,Year,Season,City,Game_Label,NOC,Chart_Label,Country,GDP,Population,GDP_per_capita,No_olympians,Bronze_athlete,Gold_athlete,Silver_athlete,Total_Medals_athlete,Bronze_team,Gold_team,Silver_team,Total_Medals_team
2928,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,ABW,2016 - Rio de Janeiro,Aruba,2.646927e+09,1.048720e+05,2.523960e+04,7,,,,,,,,
2929,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,AFG,2016 - Rio de Janeiro,Afghanistan,1.936264e+10,3.538313e+07,5.472281e+02,3,,,,,,,,
2930,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,AGO,2016 - Rio de Janeiro,Angola,1.011239e+11,2.884248e+07,3.506073e+03,26,,,,,,,,
2931,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,ALB,2016 - Rio de Janeiro,Albania,1.186135e+10,2.876101e+06,4.124109e+03,6,,,,,,,,
2932,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,AND,2016 - Rio de Janeiro,Andorra,2.877312e+09,7.729700e+04,3.722411e+04,4,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3131,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,YEM,2016 - Rio de Janeiro,"Yemen, Rep.",3.096824e+10,2.716821e+07,1.139871e+03,3,,,,,,,,
3132,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,ZAF,2016 - Rio de Janeiro,South Africa,2.963573e+11,5.620365e+07,5.272918e+03,135,14.0,2.0,7.0,23.0,2.0,2.0,6.0,10.0
3133,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,ZMB,2016 - Rio de Janeiro,Zambia,2.095475e+10,1.636351e+07,1.280578e+03,7,,,,,,,,
3134,2016,Summer,Rio de Janeiro,Summer 2016 - Rio de Janeiro,ZWE,2016 - Rio de Janeiro,Zimbabwe,2.054868e+10,1.403039e+07,1.464584e+03,30,,,,,,,,


In [None]:
test = df_events_new[(df_events_new['Year'] == 1980) & (df_events_new['Season'] == 'Summer') & (df_events_new['NOC'] == 'BRA')]
test

In [None]:
test.groupby(['Event','Name','Sport','Medal','Sex']).count()

<h2> DELETE - Olympians over time

In [None]:
# # test1 = session.query(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).\
#     group_by(Events.Year, Events.Season, Events.Name, Events.Team, Events.NOC).all()

In [None]:
# # Intermediate table
# olympians_team_detail = pd.read_sql_query('SELECT year, season, name, team, NOC FROM events \
# GROUP BY year, season, name, team, NOC',disk_engine)
# olympians_team_detail

In [None]:
# Final table with number of athletes per Event (Year&Season)
# olympians_team = olympians_team_detail.groupby(['Year','Season', 'Team', 'NOC']).count()
# olympians_team.reset_index(inplace = True)
# olympians_team_final = olympians_team.rename(columns = {"Name": "No_olympians"})

In [None]:
# Export to sqlite
# olympians_team_final.to_sql('olympians_team_final', disk_engine, if_exists='append', index = False)

In [None]:
# Giving an error - I need to fix it
# test2 = session.query(Olympians_Team_Final.Year, Olympians_Team_Final.Season,\
#                         Olympians_Team_Final.Team, Olympians_Team_Final.NOC, Olympians_Team_Final.No_olympians).all()


<h2> Medals by country / Total Medal count over time / Participating events over time

In [None]:
# Intermediate table 1
# medals_team_detail = pd.read_sql_query('SELECT year, season, sport, event, sex, medal, team, NOC FROM events \
# GROUP BY year, season, sport, event, sex, medal, team, NOC',disk_engine)
# medals_team_detail["Medal"].fillna("No_medal", inplace = True) 

In [None]:
# Intermediate table 2 - may be used for visualizations
# medals_team = medals_team_detail.groupby(['Year','Season','Team', 'NOC', 'Medal']).count()[['Sport']]
# medals_team.reset_index(inplace = True)
# medals_team.rename(columns = {"Sport": "#Medals"}, inplace = True)
# medals_team.sort_values(by=["#Medals"], ascending=False)
# medals_team.loc[medals_team['Year'] == 1984].sort_values(by=['Year',''"#Medals"], ascending=[True, False])

In [None]:
# Summary table with all the participant countries. Total Medals is sum of G,S&B
# If you need a list of medals, check the previous table
# medals_team_total = medals_team.pivot_table(index=['Year','Season','Team','NOC'], values = '#Medals', columns='Medal', aggfunc='sum')
# medals_team_total.reset_index(inplace = True)
# medals_team_total.replace(np.nan,0, inplace = True)
# medals_team_total["Total_Medals"] = medals_team_total["Bronze"] + medals_team_total["Gold"] + medals_team_total["Silver"]
# medals_team_total.sort_values(by=['Year','Season', 'Total_Medals'], ascending=[True, True, False], inplace = True)
# medals_team_total.loc[medals_team_total['Year'] == 1960]

In [None]:
# Export to sqlite
# medals_team_total.to_sql('medals_team_total', disk_engine, if_exists='append', index = False)

In [None]:
# Giving an error - I need to fix it
# test3 = session.query(Medals_Team_Total.Year, Medals_Team_Total.Season, Medals_Team_Total.Team, \
#                       Medals_Team_Total.NOC, Medals_Team_Total.Bronze, Medals_Team_Total.Silver, \
#                       Medals_Team_Total.Gold, Medals_Team_Total.Total_Medals).\
#                     filter(Medals_Team_Total.NOC == 'USA').all()

