# ETL Project (16-Oct-2019)
## Michael Bell
## Rob Chesser
## Adam Durar

In [6]:
import os
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

## Note: "winemag-data-130k.csv" originally scraped from Wine Enthusiast magazine.
## Pull in wine data from "winemag-data-130k.csv".  
## Clean the data by removing entries whose "province" is "America" (for US wines the province should be a US state).

In [7]:
# Path to the scraped Wine Enthusiast reviews.
winemag = "../csv/winemag-data-130k.csv"

# Read the raw reviews, then keep every row whose province is not the
# placeholder value 'America'. Rows with a missing province compare as
# "not equal" and are therefore kept.
winemag_raw = pd.read_csv(winemag)
not_america = winemag_raw["province"].ne('America')
winemag_data = winemag_raw.loc[not_america]

## Remove unnecessary/unused columns from "winemag_data" dataframe.
## Note: The columns listed were not required for analysis/plotting.

In [8]:
# Columns that are not needed for the analysis/plotting.
unused_columns = [
    "Unnamed: 0",
    "description",
    "region_1",
    "region_2",
    "taster_name",
    "taster_twitter_handle",
    "title",
]

# Build a new frame without those columns and preview it.
winemag_df = winemag_data.drop(columns=unused_columns)
winemag_df.head()

Unnamed: 0,country,designation,points,price,province,variety,winery
0,Italy,Vulkà Bianco,87,,Sicily & Sardinia,White Blend,Nicosia
1,Portugal,Avidagos,87,15.0,Douro,Portuguese Red,Quinta dos Avidagos
2,US,,87,14.0,Oregon,Pinot Gris,Rainstorm
3,US,Reserve Late Harvest,87,13.0,Michigan,Riesling,St. Julian
4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Pinot Noir,Sweet Cheeks


## Check row counts to ensure that we have the correct data.

In [9]:
# `winemag_all_rows` is a second name for the same frame as `winemag_df`
# (no copy is made).
winemag_all_rows = winemag_df

# Non-null entries per column.
winemag_all_rows.notna().sum()

country        129813
designation     92437
points         129876
price          120880
province       129813
variety        129875
winery         129876
dtype: int64

## Look at the "country" counts and averages.
## Use "reset_index" so that "country" appears as a regular column instead of the index.

In [10]:
# Non-null entries per column for each country.
by_country_count = winemag_all_rows.groupby("country").count()

# Per-country averages of the numeric columns. The numeric columns are
# selected explicitly: with pandas >= 2.0, `.mean()` over the string columns
# (designation, province, ...) raises a TypeError instead of silently
# dropping them. `reset_index()` turns "country" back into a regular column.
by_country_mean = (
    winemag_all_rows
    .groupby("country")[["points", "price"]]
    .mean()
    .reset_index()
)
by_country_mean.head()

Unnamed: 0,country,points,price
0,Argentina,86.710263,24.510117
1,Armenia,87.5,14.5
2,Australia,88.580507,35.437663
3,Austria,90.101345,30.762772
4,Bosnia and Herzegovina,86.5,12.5


## Export to CSV and load in Microsoft Excel (or equivalent) to view for additional verification.

In [11]:
# Write the cleaned frame (index included) next to the source CSV.
cleaned_csv_path = "../csv/winemag_cleaned.csv"
winemag_all_rows.to_csv(cleaned_csv_path)

## Connect to PostgreSQL relational database.

In [17]:
# Read the connection settings from the standard PostgreSQL environment
# variables instead of hard-coding credentials in the notebook. The fallbacks
# reproduce the previous local-development values, so with nothing set the
# resulting connection string is unchanged.
db_user = quote_plus(os.environ.get("PGUSER", "postgres"))
# URL-escape user and password so characters such as '@' or ':' cannot break the URL.
db_password = quote_plus(os.environ.get("PGPASSWORD", "postgres"))
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "etl_project")

connection_string = f"{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

## Create table for data import using sqlalchemy.

In [19]:
# Create the target table. IF NOT EXISTS keeps this cell re-runnable: without
# it, a second run fails because the relation already exists.
# The column list mirrors the dataframe: "id" receives the dataframe index,
# the remaining columns match the dataframe columns by name.
engine.execute('CREATE TABLE IF NOT EXISTS "winedata" ('
'id INTEGER PRIMARY KEY,'
'country varchar,'
'designation varchar,'
'points int,'
'price float,'
'province varchar,'
'variety varchar,'
'winery varchar'
');')

<sqlalchemy.engine.result.ResultProxy at 0x1cc8f34f048>

## Verify that a table was created.

In [21]:
# List the tables in the default schema. `Engine.table_names()` is deprecated
# (removed in SQLAlchemy 2.0); the inspector call returns the same list.
inspect(engine).get_table_names()

['winedata']

## Name the dataframe index "id" (it is written to the table's "id" primary-key column) and view the first five rows.

In [23]:
# Label the index "id" so that `to_sql(..., index=True)` writes it to the
# table's "id" column. `rename_axis` returns a new frame instead of mutating
# the Index object in place; `winemag_all_rows` was created as an alias of
# `winemag_df`, so an in-place rename would silently change that frame too.
winemag_all_rows = winemag_all_rows.rename_axis('id')
winemag_all_rows.head()

Unnamed: 0_level_0,country,designation,points,price,province,variety,winery
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Italy,Vulkà Bianco,87,,Sicily & Sardinia,White Blend,Nicosia
1,Portugal,Avidagos,87,15.0,Douro,Portuguese Red,Quinta dos Avidagos
2,US,,87,14.0,Oregon,Pinot Gris,Rainstorm
3,US,Reserve Late Harvest,87,13.0,Michigan,Riesling,St. Julian
4,US,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Pinot Noir,Sweet Cheeks


## Import the data from the pandas dataframe into the PostgreSQL table we created above.

In [25]:
# Append the dataframe (index -> "id") to the winedata table.
# The table's primary key is the dataframe index, so appending the same rows
# a second time fails with a duplicate-key error. To keep the cell
# re-runnable, the load is skipped (and reported) when the table already
# holds rows. A missing or empty table is loaded exactly as before.
table_exists = 'winedata' in inspect(engine).get_table_names()
existing_rows = (
    engine.execute('SELECT COUNT(*) FROM winedata').scalar() if table_exists else 0
)

if existing_rows:
    print(f"winedata already holds {existing_rows} rows; skipping load.")
else:
    winemag_all_rows.to_sql(name='winedata', con=engine, if_exists='append', index=True)

## View the first five rows of imported data directly from the PostgreSQL table.

In [27]:
# Fetch the five rows with the smallest ids. Without ORDER BY the database is
# free to return any five rows, so "first five" would not be reproducible.
engine.execute('SELECT * FROM winedata ORDER BY id LIMIT 5').fetchall()

[(0, 'Italy', 'Vulkà Bianco', 87, None, 'Sicily & Sardinia', 'White Blend', 'Nicosia'),
 (1, 'Portugal', 'Avidagos', 87, 15.0, 'Douro', 'Portuguese Red', 'Quinta dos Avidagos'),
 (2, 'US', None, 87, 14.0, 'Oregon', 'Pinot Gris', 'Rainstorm'),
 (3, 'US', 'Reserve Late Harvest', 87, 13.0, 'Michigan', 'Riesling', 'St. Julian'),
 (4, 'US', "Vintner's Reserve Wild Child Block", 87, 65.0, 'Oregon', 'Pinot Noir', 'Sweet Cheeks')]