In [1]:
# Imports
import pandas as pd

# SQLAlchemy Engine Configuration documentation: https://docs.sqlalchemy.org/en/14/core/engines.html#database-urls
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

# config.py file needed to hold PostgreSQL password
from config import db_password

# Default DBAPI for the PostgreSQL dialect
import psycopg2

In [2]:
# Load in the Arabica bean data
arabica_df = pd.read_csv(
    "arabica_cleaned_data.csv"
)

In [3]:
arabica_df.head()

Unnamed: 0,Quality_Score,Species,Owner,Country of Origin,Company,Altitude,Region,Producer,Number of Bags,In-Country Partner,...,Clean Cup,Sweetness,Cupper Points,Total Cup Points,Moisture,Category One Defects,Color,Category Two Defects,Certification Body,date_diff
0,90.58,Arabica,metad plc,Ethiopia,METAD Agricultural Developmet plc,1950-2200,GUJI-HAMBELA/GOYO,METAD PLC,300,METAD Agricultural Development plc,...,10.0,10.0,8.75,Sample 90.58,12%,0 full defects,Green,0 full defects,METAD Agricultural Development plc,365
1,89.92,Arabica,metad plc,Ethiopia,METAD Agricultural Developmet plc,1950-2200,GUJI-HAMBELA/ALAKA,METAD PLC,300,METAD Agricultural Development plc,...,10.0,10.0,8.58,Sample 89.92,12%,0 full defects,Green,1 full defects,METAD Agricultural Development plc,365
2,89.75,Arabica,Grounds for Health Admin,Guatemala,,1600 - 1800 m,Unknown,,5,Specialty Coffee Association,...,10.0,10.0,9.25,Sample 89.75,0%,0 full defects,,0 full defects,Specialty Coffee Association,365
3,89.0,Arabica,Yidnekachew Dabessa,Ethiopia,Yidnekachew Debessa Coffee Plantation,1800-2200,Oromia,Yidnekachew Dabessa Coffee Plantation,320,METAD Agricultural Development plc,...,10.0,10.0,8.67,Sample 89.00,11%,0 full defects,Green,2 full defects,METAD Agricultural Development plc,365
4,88.83,Arabica,metad plc,Ethiopia,METAD Agricultural Developmet plc,1950-2200,GUJI-HAMBELA/BISHAN FUGU,METAD PLC,300,METAD Agricultural Development plc,...,10.0,10.0,8.58,Sample 88.83,12%,0 full defects,Green,2 full defects,METAD Agricultural Development plc,365


In [4]:
# Drop unneeded company data
arabica_df.drop(columns=['Company', 'Producer', 'In-Country Partner', 'Harvest Year'], inplace=True)

In [5]:
# Drop unneeded production data
arabica_df.drop(columns=['Variety', 'Processing Method', 'Color', 'Number of Bags', 'Grading Date', 'Status', 'date_diff'], inplace=True)

In [6]:
# Drop unneeded certification data
arabica_df.drop(columns=['Quality_Score', 'Category One Defects', 'Category Two Defects', 'Certification Body'], inplace=True)

In [7]:
# Drop rows where Country of Origin is United States as incorrect
arabica_df.drop(arabica_df.index[arabica_df['Country of Origin'] == 'United States'], inplace = True)

In [8]:
# Convert Moisture and Total Cup Points to decimal (remove '%' and 'Sample')
arabica_df["Moisture"].replace(regex=True, inplace=True, to_replace=r'[%]', value=r'')
arabica_df["Moisture"] = pd.to_numeric(arabica_df["Moisture"], downcast='float')
arabica_df["Moisture"] = arabica_df["Moisture"]/100
arabica_df["Total Cup Points"].replace(regex=True, inplace=True, to_replace=r'[Sample]', value=r'')
arabica_df["Total Cup Points"] = pd.to_numeric(arabica_df["Total Cup Points"], downcast='float')
arabica_df.head()

Unnamed: 0,Species,Owner,Country of Origin,Altitude,Region,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,Clean Cup,Sweetness,Cupper Points,Total Cup Points,Moisture
0,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/GOYO,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,8.75,90.580002,0.12
1,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/ALAKA,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,8.58,89.919998,0.12
2,Arabica,Grounds for Health Admin,Guatemala,1600 - 1800 m,Unknown,8.42,8.5,8.42,8.42,8.33,8.42,10.0,10.0,10.0,9.25,89.75,0.0
3,Arabica,Yidnekachew Dabessa,Ethiopia,1800-2200,Oromia,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,8.67,89.0,0.11
4,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/BISHAN FUGU,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,8.58,88.830002,0.12


In [9]:
# Rename columns to remove spaces
arabica_df.rename(columns={ 
    'Country of Origin': 'countryOfOrigin',
    'Clean Cup': 'cleanCup',
    'Cupper Points': 'cupperPoints',
    'Total Cup Points': 'totalCupPoints'
}, inplace=True)
arabica_df.head()

Unnamed: 0,Species,Owner,countryOfOrigin,Altitude,Region,Aroma,Flavor,Aftertaste,Acidity,Body,Balance,Uniformity,cleanCup,Sweetness,cupperPoints,totalCupPoints,Moisture
0,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/GOYO,8.67,8.83,8.67,8.75,8.5,8.42,10.0,10.0,10.0,8.75,90.580002,0.12
1,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/ALAKA,8.75,8.67,8.5,8.58,8.42,8.42,10.0,10.0,10.0,8.58,89.919998,0.12
2,Arabica,Grounds for Health Admin,Guatemala,1600 - 1800 m,Unknown,8.42,8.5,8.42,8.42,8.33,8.42,10.0,10.0,10.0,9.25,89.75,0.0
3,Arabica,Yidnekachew Dabessa,Ethiopia,1800-2200,Oromia,8.17,8.58,8.42,8.42,8.5,8.25,10.0,10.0,10.0,8.67,89.0,0.11
4,Arabica,metad plc,Ethiopia,1950-2200,GUJI-HAMBELA/BISHAN FUGU,8.25,8.5,8.25,8.5,8.42,8.33,10.0,10.0,10.0,8.58,88.830002,0.12


In [10]:
origin_df = arabica_df.filter(['countryOfOrigin', 'Region', 'Altitude'], axis=1)
origin_df.drop_duplicates(subset=['countryOfOrigin', 'Region'], keep='last')
origin_df.rename(columns={'countryOfOrigin': 'Country'}, inplace=True)
origin_df.head()

Unnamed: 0,Country,Region,Altitude
0,Ethiopia,GUJI-HAMBELA/GOYO,1950-2200
1,Ethiopia,GUJI-HAMBELA/ALAKA,1950-2200
2,Guatemala,Unknown,1600 - 1800 m
3,Ethiopia,Oromia,1800-2200
4,Ethiopia,GUJI-HAMBELA/BISHAN FUGU,1950-2200


In [11]:
# Load in the Robusta bean data
robusta_df = pd.read_csv(
    "robusta_cleaned_data.csv"
)

In [12]:
robusta_df.head()

Unnamed: 0,Quality_Score,Species,Owner,Country of Origin,Farm Name,Company,Altitude,Region,Producer,Number of Bags,...,Clean Cup,Balance,Cupper Points,Total Cup Points,Moisture,Category One Defects,Color,Category Two Defects,Certification Body,date_diff
0,83.75,Robusta,Ankole coffee producers coop,Uganda,Kyangundu cooperative society,Ankole Coffee Producers Coop,1488,Sheema South Western,Ankole coffee producers coop,300,...,10.0,7.92,8.0,Sample 83.75,12 %,0 full defects,Green,2 full defects,Uganda Coffee Development Authority,365
1,83.5,Robusta,Nishant Gurjer,India,Sethuraman Estate Kaapi Royale,Kaapi Royale,3170,Chikmagalur Karnataka Indua,Nishant Gurjer Kaapi Royale,320,...,10.0,7.92,8.0,Sample 83.50,0 %,0 full defects,,2 full defects,Specialty Coffee Association,365
2,83.25,Robusta,Andrew Hetzel,India,Sethuraman Estate,Sethuraman Estate,1000m,Chikmagalur,Nishant Gurjer,300,...,10.0,7.92,8.0,Sample 83.25,0 %,0 full defects,Green,0 full defects,Specialty Coffee Association,365
3,83.0,Robusta,UGACOF,Uganda,UGACOF project area,UGACOF Ltd,1212,Central,UGACOF,320,...,10.0,7.75,8.08,Sample 83.00,12 %,0 full defects,Green,7 full defects,Uganda Coffee Development Authority,365
4,83.0,Robusta,Katuka Development Trust Ltd,Uganda,Katikamu capca farmers association,Katuka Development Trust Ltd,1200-1300,Luwero central region,Katuka Development Trust Ltd,1,...,10.0,7.75,7.67,Sample 83.00,12 %,0 full defects,Green,3 full defects,Uganda Coffee Development Authority,365


In [13]:
# Drop unneeded company data
robusta_df.drop(columns=['Company', 'Producer', 'In-Country Partner', 'Harvest Year', 'Farm Name'], inplace=True)

In [14]:
# Drop unneeded production data
robusta_df.drop(columns=['Variety', 'Processing Method', 'Color', 'Number of Bags', 'Grading Date', 'Status', 'date_diff'], inplace=True)

In [15]:
# Drop unneeded certification data
robusta_df.drop(columns=['Quality_Score', 'Category One Defects', 'Category Two Defects', 'Certification Body'], inplace=True)

In [16]:
# Drop rows where Country of Origin is United States as incorrect
robusta_df.drop(robusta_df.index[robusta_df['Country of Origin'] == 'United States'], inplace = True)

In [17]:
# Convert Moisture and Total Cup Points to decimal (remove '%' and 'Sample')
robusta_df["Moisture"].replace(regex=True, inplace=True, to_replace=r'[%]', value=r'')
robusta_df["Moisture"] = pd.to_numeric(robusta_df["Moisture"], downcast='float')
robusta_df["Moisture"] = robusta_df["Moisture"]/100
robusta_df["Total Cup Points"].replace(regex=True, inplace=True, to_replace=r'[Sample]', value=r'')
robusta_df["Total Cup Points"] = pd.to_numeric(robusta_df["Total Cup Points"], downcast='float')
robusta_df.head()

Unnamed: 0,Species,Owner,Country of Origin,Altitude,Region,Aroma,Flavor,Aftertaste,Acidity,Sweetness,Body,Uniformity,Clean Cup,Balance,Cupper Points,Total Cup Points,Moisture
0,Robusta,Ankole coffee producers coop,Uganda,1488,Sheema South Western,7.83,8.08,7.75,7.92,8.0,8.25,10.0,10.0,7.92,8.0,83.75,0.12
1,Robusta,Nishant Gurjer,India,3170,Chikmagalur Karnataka Indua,8.0,7.75,7.92,8.0,8.0,7.92,10.0,10.0,7.92,8.0,83.5,0.0
2,Robusta,Andrew Hetzel,India,1000m,Chikmagalur,7.92,7.83,7.92,8.0,7.83,7.83,10.0,10.0,7.92,8.0,83.25,0.0
3,Robusta,UGACOF,Uganda,1212,Central,8.0,7.92,7.92,7.75,7.75,7.83,10.0,10.0,7.75,8.08,83.0,0.12
4,Robusta,Katuka Development Trust Ltd,Uganda,1200-1300,Luwero central region,8.33,7.83,7.83,7.75,7.58,8.25,10.0,10.0,7.75,7.67,83.0,0.12


In [18]:
# Rename columns to remove spaces
robusta_df.rename(columns={  
    'Country of Origin': 'countryOfOrigin',
    'Clean Cup': 'cleanCup',
    'Cupper Points': 'cupperPoints',
    'Total Cup Points': 'totalCupPoints'
}, inplace=True)
robusta_df.head()

Unnamed: 0,Species,Owner,countryOfOrigin,Altitude,Region,Aroma,Flavor,Aftertaste,Acidity,Sweetness,Body,Uniformity,cleanCup,Balance,cupperPoints,totalCupPoints,Moisture
0,Robusta,Ankole coffee producers coop,Uganda,1488,Sheema South Western,7.83,8.08,7.75,7.92,8.0,8.25,10.0,10.0,7.92,8.0,83.75,0.12
1,Robusta,Nishant Gurjer,India,3170,Chikmagalur Karnataka Indua,8.0,7.75,7.92,8.0,8.0,7.92,10.0,10.0,7.92,8.0,83.5,0.0
2,Robusta,Andrew Hetzel,India,1000m,Chikmagalur,7.92,7.83,7.92,8.0,7.83,7.83,10.0,10.0,7.92,8.0,83.25,0.0
3,Robusta,UGACOF,Uganda,1212,Central,8.0,7.92,7.92,7.75,7.75,7.83,10.0,10.0,7.75,8.08,83.0,0.12
4,Robusta,Katuka Development Trust Ltd,Uganda,1200-1300,Luwero central region,8.33,7.83,7.83,7.75,7.58,8.25,10.0,10.0,7.75,7.67,83.0,0.12


In [19]:
origin_r_df = robusta_df.filter(['countryOfOrigin', 'Region', 'Altitude'], axis=1)
origin_r_df.drop_duplicates(subset=['countryOfOrigin', 'Region'], keep='last')
origin_r_df.rename(columns={'countryOfOrigin': 'Country'}, inplace=True)

origin_all_df = pd.concat([origin_df, origin_r_df])
origin_all_df.head()

Unnamed: 0,Country,Region,Altitude
0,Ethiopia,GUJI-HAMBELA/GOYO,1950-2200
1,Ethiopia,GUJI-HAMBELA/ALAKA,1950-2200
2,Guatemala,Unknown,1600 - 1800 m
3,Ethiopia,Oromia,1800-2200
4,Ethiopia,GUJI-HAMBELA/BISHAN FUGU,1950-2200


In [20]:
# Create connection to database
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/CoffeeDB"
    
# Create the database engine
engine = create_engine(db_string)
conn = engine.connect()

In [21]:
# Save the data from dataframe to
# postgres table "ArabicaRatings"
arabica_df.to_sql(
    'ArabicaRatings', con=conn,
    if_exists='replace',
    index=True
)

robusta_df.to_sql(
    'RobustaRatings', con=conn,
    if_exists='replace',
    index=True
)

origin_all_df.to_sql(
    'Regions', con=conn,
    if_exists='replace',
    index=True
)
conn = psycopg2.connect(db_string
                        )
conn.autocommit = True
cursor = conn.cursor()

# conn.commit()
conn.close()