## Extract

In [1]:
# Import Dependencies
import os

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [2]:
# Read alcohol-consumption-vs-gdp-per-capita.csv into a DataFrame.
# The path is relative to the notebook's working directory (the same
# 'Resources/' folder the cleaned CSVs are written to) rather than an
# absolute path inside one user's home directory, so the notebook does not
# depend on a specific machine's directory layout.
alcohol_consumption_vs_gdp = pd.read_csv('Resources/alcohol-consumption-vs-gdp-per-capita.csv')

# Display the first 5 rows of the DataFrame
alcohol_consumption_vs_gdp.head()

Unnamed: 0,Entity,Code,Year,"Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)","GDP per capita, PPP (constant 2017 international $)",Continent
0,Abkhazia,OWID_ABK,2015,,,Asia
1,Afghanistan,AFG,2000,0.00277,,
2,Afghanistan,AFG,2002,,1280.4631,
3,Afghanistan,AFG,2003,,1292.3335,
4,Afghanistan,AFG,2004,,1260.0605,


In [3]:
# Read alcohol-attributable-fraction-of-mortality.csv into a DataFrame.
# The path is relative to the notebook's working directory (the same
# 'Resources/' folder the cleaned CSVs are written to) rather than an
# absolute path inside one user's home directory, so the notebook does not
# depend on a specific machine's directory layout.
alcohol_related_mortality = pd.read_csv('Resources/alcohol-attributable-fraction-of-mortality.csv')

# Display the first 5 rows of the DataFrame
alcohol_related_mortality.head()

Unnamed: 0,Entity,Code,Year,"Alcohol-attributable fractions, all-cause deaths (%) - Sex: both sexes"
0,Afghanistan,AFG,2016,0.2
1,Albania,ALB,2016,6.4
2,Algeria,DZA,2016,1.0
3,Angola,AGO,2016,6.8
4,Antigua and Barbuda,ATG,2016,4.5


## Transform

In [4]:
# Clean alcohol_consumption_vs_gdp data
#
# The whole cleaning pipeline is one method chain so each step reads top to
# bottom and no half-cleaned intermediate frame is left in the namespace.
alcohol_consumption_vs_gdp = (
    alcohol_consumption_vs_gdp
    # 1. Remove the 'Code' and 'Continent' columns.
    #    errors='ignore' keeps this cell re-runnable: the result is assigned
    #    back to the same name, so on a second run the columns are already
    #    gone and a strict drop would raise KeyError.
    .drop(columns=['Code', 'Continent'], errors='ignore')
    # 2. Rename the columns with the long titles
    .rename(columns={
        'Total alcohol consumption per capita (liters of pure alcohol, projected estimates, 15+ years of age)': 'alcohol_consumption_per_capita',
        'GDP per capita, PPP (constant 2017 international $)': 'gdp_per_capita',
    })
    # 3. Drop rows where BOTH measures are NaN (rows with only one missing
    #    value are kept)
    .dropna(subset=['alcohol_consumption_per_capita', 'gdp_per_capita'], how='all')
    # 4. Sort by 'Entity' and 'Year' columns
    .sort_values(by=['Entity', 'Year'])
)

# 5. Save the cleaned data to a CSV, omitting the DataFrame index
alcohol_consumption_vs_gdp.to_csv('Resources/alcohol_consumption_vs_gdp.csv', index=False)

# 6. Preview the first 5 rows of the cleaned DataFrame
#    (the preview still shows the original row labels as the index)
alcohol_consumption_vs_gdp.head()


Unnamed: 0,Entity,Year,alcohol_consumption_per_capita,gdp_per_capita
1,Afghanistan,2000,0.00277,
2,Afghanistan,2002,,1280.4631
3,Afghanistan,2003,,1292.3335
4,Afghanistan,2004,,1260.0605
5,Afghanistan,2005,0.02684,1352.3207


In [5]:
# Clean alcohol_related_mortality data
#
# The whole cleaning pipeline is one method chain so each step reads top to
# bottom and no half-cleaned intermediate frame is left in the namespace.
alcohol_related_mortality = (
    alcohol_related_mortality
    # 1. Remove the 'Code' column.
    #    errors='ignore' keeps this cell re-runnable: the result is assigned
    #    back to the same name, so on a second run the column is already gone
    #    and a strict drop would raise KeyError.
    .drop(columns=['Code'], errors='ignore')
    # 2. Rename the column with the long title
    .rename(columns={
        'Alcohol-attributable fractions, all-cause deaths (%) - Sex: both sexes': 'alcohol_related_mortality',
    })
    # 3. Sort by 'Entity' and 'Year' columns
    .sort_values(by=['Entity', 'Year'])
)

# 4. Save the cleaned data to a CSV, omitting the DataFrame index
alcohol_related_mortality.to_csv('Resources/alcohol_related_mortality.csv', index=False)

# 5. Preview the first 5 rows of the cleaned DataFrame
#    (the preview still shows the row labels as the index)
alcohol_related_mortality.head()

Unnamed: 0,Entity,Year,alcohol_related_mortality
0,Afghanistan,2016,0.2
1,Albania,2016,6.4
2,Algeria,2016,1.0
3,Angola,2016,6.8
4,Antigua and Barbuda,2016,4.5


## Load

In [6]:
# Load environment variables
load_dotenv()

# Database connection parameters
db_host = os.getenv('DB_HOST')
db_name = os.getenv('DB_NAME')
db_user = os.getenv('DB_USER')
db_password = os.getenv('DB_PASSWORD')
db_port = os.getenv('DB_PORT')

# Fail early with a clear message instead of building a URL that contains the
# literal text "None" for an unset variable.
missing_vars = [
    var_name
    for var_name, var_value in (('DB_HOST', db_host), ('DB_NAME', db_name), ('DB_USER', db_user))
    if not var_value
]
if missing_vars:
    raise RuntimeError(f"Missing required environment variable(s): {', '.join(missing_vars)}")

# Build the connection URL from its components rather than by string
# formatting, so characters such as '@', ':' or '/' in the credentials are
# escaped correctly. An unset DB_PORT falls back to the driver's default port.
connection_url = URL.create(
    drivername='postgresql+psycopg2',
    username=db_user,
    password=db_password,
    host=db_host,
    port=int(db_port) if db_port else None,
    database=db_name,
)
engine = create_engine(connection_url)


# Load DataFrames into PostgreSQL tables using to_sql
try:
    # Load alcohol_consumption_vs_gdp into the corresponding table
    alcohol_consumption_vs_gdp.to_sql(
        'alcohol_consumption_vs_gdp',
        engine,
        if_exists='replace',  # Use 'append' if you want to add to existing data
        index=False
    )
    print("Data loaded successfully into alcohol_consumption_vs_gdp.")

    # Load alcohol_related_mortality into the corresponding table
    alcohol_related_mortality.to_sql(
        'alcohol_related_mortality',
        engine,
        if_exists='replace',  # Use 'append' if you want to add to existing data
        index=False
    )
    print("Data loaded successfully into alcohol_related_mortality.")

except Exception as e:
    # Report the failure, then re-raise so the cell is visibly marked as
    # failed instead of looking like a successful run.
    print(f"An error occurred: {e}")
    raise

finally:
    # Release the engine's pooled database connections
    engine.dispose()

Data loaded successfully into alcohol_consumption_vs_gdp.
Data loaded successfully into alcohol_related_mortality.
