### ETL


In [1]:
import pandas as pd
import numpy as np

In [2]:
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

# Parameters to connect to the default PostgreSQL database
params = {
    'dbname': 'postgres',
    'user': 'postgres',  
    'password': 'postgres', 
    'host': 'pgdb'
}

try:
    # Connect to the PostgreSQL server
    conn = psycopg2.connect(**params)
    
    # Enable autocommit so operations like creating a database are committed without having to call conn.commit()
    conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    
    # Create a cursor object
    cursor = conn.cursor()
    
    # Name of the new database
    new_db_name = 'olympic_olap'  # Replace with the name of the database you want to create
    
    # Ensure the database name is safe to use
    # For example, by checking against a list of allowed names or patterns
    if not new_db_name.isidentifier():
        raise ValueError("Invalid database name.")
    
    # Create a new database using an f-string
    cursor.execute(f"CREATE DATABASE {new_db_name}")
    
    print("Database created successfully")
    
    # Close communication with the database
    cursor.close()
    conn.close()

except Exception as e:
    print(f"An error occurred: {e}")


Database created successfully


In [3]:

from psycopg2 import OperationalError
def create_connection(db_name, db_user, db_password, db_host, db_port):
    connection = None
    try:
        connection = psycopg2.connect(
            database=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port,
        )
        print("Connection to PostgreSQL DB successful")
    except OperationalError as e:
        print(f"The error '{e}' occurred")
    return connection

In [4]:
# Connection details
olap_db_name = "olympic_olap"
db_user = "postgres"
db_password = "postgres"  # Update with your password
db_host = "pgdb"  # Update if your DB is hosted elsewhere
db_port = "5432"

In [5]:
olap_connection = create_connection(olap_db_name, db_user, db_password, db_host, db_port)
olap_cursor = olap_connection.cursor()

Connection to PostgreSQL DB successful


In [6]:
olap_cursor.execute("""
CREATE TABLE DimLocation (
    country_code CHAR(3) NOT NULL PRIMARY KEY,
    country_name VARCHAR(255),
    continent VARCHAR(255)
);
""")



In [7]:
olap_cursor.execute("""
CREATE TABLE DimEvent (
    event_id SERIAL PRIMARY KEY,
    event_title VARCHAR(250),
    event_discipline VARCHAR(250),
    event_gender VARCHAR(250)
);
""")



In [8]:
olap_cursor.execute("""
CREATE TABLE DimParticipant (
    participant_id SERIAL PRIMARY KEY,
    participant_title VARCHAR(255),
    participant_type VARCHAR(100)
);
""")

In [9]:
olap_cursor.execute("""
CREATE TABLE DimAthlete (
    athlete_id SERIAL PRIMARY KEY,
    athlete_name VARCHAR(250),
    athlete_url VARCHAR(250)
);
""")

In [10]:
olap_cursor.execute("""
CREATE TABLE DimYear (
    year INTEGER NOT NULL PRIMARY KEY
);
""")

In [11]:
olap_cursor.execute("""
CREATE TABLE DimGame (
    game_slug VARCHAR(100) NOT NULL PRIMARY KEY,
    game_name VARCHAR(100),
    game_season VARCHAR(10),
    game_year INTEGER,
    country_code CHAR(3)
);
""")

In [12]:
olap_cursor.execute("""
CREATE TABLE FactOlympicMedalsMeasures (
    game_slug VARCHAR(100) REFERENCES DimGame(game_slug),
    participant_id INTEGER REFERENCES DimParticipant(participant_id),
    athlete_id INTEGER REFERENCES DimAthlete(athlete_id),
    event_id INTEGER NOT NULL REFERENCES DimEvent(event_id),
    country_code CHAR(3) NOT NULL REFERENCES DimLocation(country_code),
    year INTEGER NOT NULL REFERENCES DimYear(year),
    bronze_medals INTEGER,
    silver_medals INTEGER,
    gold_medals INTEGER
);
""")

In [13]:
olap_cursor.execute("""
CREATE TABLE FactEconomicMeasure (
    year INTEGER NOT NULL REFERENCES DimYear(year),
    country_code CHAR(3) NOT NULL REFERENCES DimLocation(country_code),
    poverty_count FLOAT,
    gdp_per_capita FLOAT,
    annual_gdp_growth FLOAT,
    servers_count INTEGER
);
""")

In [14]:
olap_cursor.execute("""
CREATE TABLE FactHealthMeasure (
    year INTEGER NOT NULL REFERENCES DimYear(year),
    country_code CHAR(3) NOT NULL REFERENCES DimLocation(country_code),
    daly_depression FLOAT,
    daly_schizophrenia FLOAT,
    daly_bipolar_disorder FLOAT,
    daly_eating_disorder FLOAT,
    daly_anxiety FLOAT,
    life_expectancy FLOAT,
    infant_mortality_rate FLOAT,
    current_health_expenditure FLOAT,
    government_health_expenditure FLOAT,
    private_health_expenditure FLOAT,
    external_health_expenditure FLOAT
);
""")

In [15]:
# Insert values from 1896 - start of the olympic data to 2022

olap_cursor.execute("""
INSERT INTO DimYear (year)
SELECT generate_series AS year
FROM generate_series(1896, 2022);
""")

In [16]:
olap_connection.commit()

In [17]:
oltp_connection = create_connection('olympic_oltp', db_user, db_password, db_host, db_port)

Connection to PostgreSQL DB successful


In [18]:
oltp_cursor = oltp_connection.cursor()

In [19]:
oltp_cursor.execute("""
SELECT DISTINCT 
    event_title,
    discipline_title AS event_discipline,
    event_gender
FROM
    olympic_medals
ORDER BY
    event_title, event_gender;
""")

In [20]:
dim_event_data = oltp_cursor.fetchall()

In [21]:
# all column names
dim_event_data_columns = [desc[0] for desc in oltp_cursor.description]

In [22]:
dim_event_df = pd.DataFrame(dim_event_data, columns = dim_event_data_columns)

In [23]:
dim_event_df

Unnamed: 0,event_title,event_discipline,event_gender
0,0.5-1t mixed,Sailing,Open
1,"0.5t mixed, race one",Sailing,Open
2,"0.5t mixed, race two",Sailing,Open
3,10000m men,Speed skating,Men
4,10000m men,Athletics,Men
...,...,...,...
1583,Women's Uneven Bars,Artistic Gymnastics,Women
1584,Women's Vault,Artistic Gymnastics,Women
1585,Women's Welter (64-69kg),Boxing,Women
1586,Yngling - Keelboat women,Sailing,Women


In [24]:
from sqlalchemy import create_engine

olap_connection_url = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{olap_db_name}"

# Create the engine
olap_engine = create_engine(olap_connection_url)

In [25]:
dim_event_df.to_sql("dimevent", con=olap_engine, if_exists="append", index=False)

588

In [26]:
oltp_cursor.execute("""
SELECT DISTINCT 
    participant_title,
    participant_type
FROM 
    olympic_medals;
""")

In [27]:
dim_participant_data = oltp_cursor.fetchall()

In [28]:
dim_participant_data_columns = [desc[0] for desc in oltp_cursor.description]
dim_participant_data_columns


['participant_title', 'participant_type']

In [29]:
dim_participant_df = pd.DataFrame(dim_participant_data, columns = dim_participant_data_columns)

In [30]:
dim_participant_df.head()

Unnamed: 0,participant_title,participant_type
0,Romania team,GameTeam
1,Tan-Fe-Pah,GameTeam
2,Independent Rowing Club #3,GameTeam
3,Elsie,GameTeam
4,Sans Atout #1,GameTeam


In [31]:
dim_participant_df.to_sql("dimparticipant", con=olap_engine, if_exists="append", index=False)

494

In [32]:
oltp_cursor.execute("""
SELECT DISTINCT 
    athlete_full_name as athlete_name,
    athlete_url
FROM 
    olympic_medals
WHERE 
    athlete_full_name IS NOT NULL AND athlete_url IS NOT NULL
ORDER BY athlete_name;
""")

In [33]:
dim_athlete_data = oltp_cursor.fetchall()

In [34]:
dim_athlete_data_columns = [desc[0] for desc in oltp_cursor.description]

In [35]:
dim_athlete_data_columns

['athlete_name', 'athlete_url']

In [36]:
dim_athlete_df = pd.DataFrame(dim_athlete_data, columns = dim_athlete_data_columns)
dim_athlete_df.head()

Unnamed: 0,athlete_name,athlete_url
0,Aage Ernst LARSEN,https://olympics.com/en/athletes/aage-ernst-la...
1,Aage Ingvar ERIKSEN,https://olympics.com/en/athletes/aage-ingvar-e...
2,Aagje Ada KOK,https://olympics.com/en/athletes/aagje-ada-kok
3,Aarne Eemeli REINI,https://olympics.com/en/athletes/aarne-eemeli-...
4,Aaron CHIA,https://olympics.com/en/athletes/aaron-chia


In [37]:
dim_athlete_df.to_sql("dimathlete", con=olap_engine, if_exists="append", index=False)

116

In [38]:
oltp_cursor.execute("""
SELECT 
    h.game_slug, 
    h.game_name, 
    h.game_season, 
    h.game_year, 
    m.country_3_letter_code as country_code
FROM 
    olympic_hosts h
JOIN 
    olympic_medals m ON (h.game_slug = m.slug_game AND h.game_location = m.country_name)
GROUP BY 
    h.game_slug, h.game_name, h.game_season, h.game_year, m.country_3_letter_code
ORDER BY game_year;
""")

In [39]:
dim_game_data = oltp_cursor.fetchall()

In [40]:
dim_game_data_columns = [desc[0] for desc in oltp_cursor.description]

In [41]:
dim_game_data_columns

['game_slug', 'game_name', 'game_season', 'game_year', 'country_code']

In [42]:
dim_game_df = pd.DataFrame(dim_game_data, columns = dim_game_data_columns)
dim_game_df.head()

Unnamed: 0,game_slug,game_name,game_season,game_year,country_code
0,athens-1896,Athens 1896,Summer,1896,GRE
1,paris-1900,Paris 1900,Summer,1900,FRA
2,london-1908,London 1908,Summer,1908,GBR
3,stockholm-1912,Stockholm 1912,Summer,1912,SWE
4,antwerp-1920,Antwerp 1920,Summer,1920,BEL


In [43]:
dim_game_df.to_sql("dimgame", con=olap_engine, if_exists="append", index=False)

41

In [44]:
oltp_cursor.execute("""
SELECT * FROM countries;
""")

In [45]:
dim_location_data = oltp_cursor.fetchall()

In [47]:
dim_location_data_columns = [desc[0] for desc in oltp_cursor.description]

In [48]:
dim_location_data_columns

['country_code', 'country_name', 'continent']

In [49]:
dim_location_df = pd.DataFrame(dim_location_data, columns = dim_location_data_columns)
dim_location_df.head()

Unnamed: 0,country_code,country_name,continent
0,AFG,Afghanistan,Asia
1,ALA,Åland Islands,Europe
2,ALB,Albania,Europe
3,DZA,Algeria,Africa
4,ASM,American Samoa,Oceania


In [50]:
dim_location_df.to_sql("dimlocation", con=olap_engine, if_exists="append", index=False)

247

```sql
SELECT DISTINCT o.game_location
FROM olympic_hosts o
LEFT JOIN countries c ON o.game_location = c.country_name
WHERE c.country_name IS NULL;
```

Running above code we find that some countries are named differently in olympic_host file

"Australia, Sweden" -> "Australia" \
"Federal Republic of Germany" -> "Germany"
"Great Britain" -> United Kingdom of Great Britain and Northern Ireland \
"United States" -> United States of America \
"USSR" -> Russian Federation \
"Yugoslavia" -> Serbia

In [51]:
oltp_cursor.execute("""
CREATE TABLE olympic_hosts_backup AS
SELECT *
FROM olympic_hosts;
""")

In [52]:
oltp_connection.commit()

In [53]:
oltp_cursor.execute("""
UPDATE olympic_hosts
SET game_location = CASE
    WHEN game_location = 'Australia, Sweden' THEN 'Australia'
    WHEN game_location = 'Federal Republic of Germany' THEN 'Germany'
    WHEN game_location = 'Great Britain' THEN 'United Kingdom of Great Britain and Northern Ireland'
    WHEN game_location = 'United States' THEN 'United States of America'
    WHEN game_location = 'USSR' THEN 'Russian Federation'
    WHEN game_location = 'Yugoslavia' THEN 'Serbia'
    ELSE game_location
END;
""")

In [54]:
oltp_connection.commit()