## Import libraries and configure pandas display

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
# Render up to 25 columns and 100 rows when a DataFrame is displayed
pd.set_option("display.max_columns", 25)
pd.set_option("display.max_rows", 100)

## Import Dataset

#### Read CSV

In [20]:
# Load the winery order export; the path is relative to the notebook's working directory
winery = pd.read_csv(filepath_or_buffer="../data/Winery_Data_csv.csv")

# Preview the first five rows
winery.head(n=5)

Unnamed: 0,Customer ID,Order ID,Customer Segment,Date,Zip Code,State,Sales 2008,Sales 2009,Sales 2010,Sale Amount,Orders 2008,Orders 2009,Orders 2010,Year Acquired,Email Subscr,Newsletter Subscr,Winemaker call,Email Sales,Newsletter Sales,Tasting Room Sales,Winemaker Call Sales
0,1,1532,High Roller,08-Jul-08,33467,FL,213.0,30903.1,13340.94,44.0,4.0,8.0,4.0,2008,1,1,1,0.0,0.0,44.0,0.0
1,1,14378,High Roller,05-Oct-08,33467,FL,213.0,30903.1,13340.94,47.0,4.0,8.0,4.0,2008,1,1,1,0.0,0.0,47.0,0.0
2,1,17690,High Roller,26-Oct-08,33467,FL,213.0,30903.1,13340.94,57.0,4.0,8.0,4.0,2008,1,1,1,0.0,57.0,0.0,0.0
3,1,19808,High Roller,08-Nov-08,33467,FL,213.0,30903.1,13340.94,65.0,4.0,8.0,4.0,2008,1,1,1,0.0,0.0,65.0,0.0
4,1,25406,High Roller,02-Jan-09,33467,FL,213.0,30903.1,13340.94,3889.0,4.0,8.0,4.0,2008,1,1,1,0.0,0.0,3889.0,0.0


#### Print basic attributes

In [21]:
# Column dtypes and overall dimensions of the freshly loaded frame
print(winery.dtypes, "\n")
print("Dataframe shape:", winery.shape, "\n")

# DataFrame.info() writes its summary to stdout and returns None, so wrapping it
# in print() only appended a stray "None" line to the output. Call it directly.
winery.info()

Customer ID               int64
Order ID                  int64
Customer Segment         object
Date                     object
Zip Code                  int64
State                    object
Sales 2008              float64
Sales 2009              float64
Sales 2010              float64
Sale Amount             float64
 Orders 2008            float64
Orders 2009             float64
Orders 2010             float64
Year Acquired             int64
Email Subscr              int64
Newsletter Subscr         int64
Winemaker call            int64
Email Sales             float64
Newsletter Sales        float64
Tasting Room Sales      float64
Winemaker Call Sales    float64
dtype: object 

Dataframe shape: (65534, 21) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65534 entries, 0 to 65533
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Customer ID           65534 non-null  int64  
 1   Order ID      

## Data cleaning & transformation

#### Normalize column names and check for NAs

In [22]:
# Strip every space out of the column labels ("Customer ID" -> "CustomerID"),
# then give the winemaker-call flag a name that matches the other *Subscr columns.
winery = (
    winery
    .rename(columns=lambda label: label.replace(" ", ""))
    .rename(columns={"Winemakercall": "WinemakerCallSubscr"})
)

# Number of missing values per column
print(winery.isna().sum())

CustomerID               0
OrderID                  0
CustomerSegment          0
Date                     0
ZipCode                  0
State                    6
Sales2008                0
Sales2009                0
Sales2010                0
SaleAmount               0
Orders2008             683
Orders2009             683
Orders2010             683
YearAcquired             0
EmailSubscr              0
NewsletterSubscr         0
WinemakerCallSubscr      0
EmailSales               0
NewsletterSales          0
TastingRoomSales         0
WinemakerCallSales       0
dtype: int64


#### Investigate NA Orders

In [23]:
# Flag rows where at least one of the yearly order counts is missing
order_count_columns = ["Orders2008", "Orders2009", "Orders2010"]
orders_na_condition = winery[order_count_columns].isna().any(axis=1)

# Keep just those rows and look at a random handful of them
winery_order_nas = winery[orders_na_condition]
winery_order_nas.sample(5)

Unnamed: 0,CustomerID,OrderID,CustomerSegment,Date,ZipCode,State,Sales2008,Sales2009,Sales2010,SaleAmount,Orders2008,Orders2009,Orders2010,YearAcquired,EmailSubscr,NewsletterSubscr,WinemakerCallSubscr,EmailSales,NewsletterSales,TastingRoomSales,WinemakerCallSales
56966,18807,34998,Casual Visitor,09-Apr-09,71303,LA,0.0,73.0,0.0,73.0,,,,2009,0,1,0,0.0,0.0,73.0,0.0
47603,14506,70931,Luxury Estate,31-Dec-09,10007,NY,0.0,92.0,0.0,92.0,,,,2008,1,1,0,0.0,0.0,92.0,0.0
56418,18502,43424,Casual Visitor,30-Jun-09,91316,CA,0.0,73.0,0.0,73.0,,,,2008,0,1,0,0.0,0.0,73.0,0.0
12118,2626,36125,Wine Enthusiast,21-Apr-09,98101,WA,0.0,0.0,0.0,334.0,,,,2008,0,0,0,0.0,0.0,334.0,0.0
4192,918,69305,High Roller,23-Dec-09,83401,ID,0.0,671.0,0.0,671.0,,,,2005,0,1,0,0.0,0.0,671.0,0.0


#### Investigate NA States

In [None]:
# Rows whose State value is missing
state_na_condition = winery["State"].isna()
winery_state_nas = winery.loc[state_na_condition, :]

# Only a handful of rows have no State, so cap the sample size at the number of
# matching rows: a fixed sample(5) raises ValueError whenever fewer than 5 rows
# match (for example when this cell is re-run after the NA rows were dropped).
winery_state_nas.sample(min(5, len(winery_state_nas)))

#### Drop NAs

In [24]:
# Discard every row that has a missing value, then renumber the row labels 0 -> n-1
winery = winery.dropna().reset_index(drop=True)

# Confirm that no missing values remain and report the remaining size
remaining_nas = winery.isna().sum()
print(remaining_nas, "\n")
print("Shape", winery.shape)

CustomerID             0
OrderID                0
CustomerSegment        0
Date                   0
ZipCode                0
State                  0
Sales2008              0
Sales2009              0
Sales2010              0
SaleAmount             0
Orders2008             0
Orders2009             0
Orders2010             0
YearAcquired           0
EmailSubscr            0
NewsletterSubscr       0
WinemakerCallSubscr    0
EmailSales             0
NewsletterSales        0
TastingRoomSales       0
WinemakerCallSales     0
dtype: int64 

Shape (64845, 21)


#### Configure column types

In [25]:
# Parse day-month-year strings such as "08-Jul-08" into timestamps
winery["Date"] = pd.to_datetime(winery["Date"], format="%d-%b-%y")

# Identifier-like columns become plain objects, order counts become integers
# (the builtin int resolves to the platform default, int32 in the recorded output),
# subscription flags become booleans and low-cardinality labels become categoricals.
categorical_columns = ["CustomerSegment", "State"]
column_types = {
    **dict.fromkeys(["CustomerID", "OrderID", "ZipCode"], object),
    **dict.fromkeys(["Orders2008", "Orders2009", "Orders2010"], int),
    **dict.fromkeys(["EmailSubscr", "NewsletterSubscr", "WinemakerCallSubscr"], bool),
    **dict.fromkeys(categorical_columns, "category"),
}
winery = winery.astype(column_types)

winery.dtypes

CustomerID                     object
OrderID                        object
CustomerSegment              category
Date                   datetime64[ns]
ZipCode                        object
State                        category
Sales2008                     float64
Sales2009                     float64
Sales2010                     float64
SaleAmount                    float64
Orders2008                      int32
Orders2009                      int32
Orders2010                      int32
YearAcquired                    int64
EmailSubscr                      bool
NewsletterSubscr                 bool
WinemakerCallSubscr              bool
EmailSales                    float64
NewsletterSales               float64
TastingRoomSales              float64
WinemakerCallSales            float64
dtype: object

#### Filter for valid US states

In [32]:
# Two-letter postal codes of the 50 US states.
# NOTE(review): 'DC' is not in this list, so any rows with State == 'DC' are
# filtered out as well; confirm that is intended.
us_states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]

# Keep only rows whose State is one of the codes above. The explicit .copy()
# makes the result an independent frame, so the column assignments in the
# following cells (Division, Region) do not raise SettingWithCopyWarning.
winery = winery[winery["State"].isin(us_states)].copy()

#### Map states to division

In [33]:
# States grouped by division; flattened below into a state -> division lookup
division_members = {
    'New England': ['CT', 'ME', 'MA', 'NH', 'RI', 'VT'],
    'Middle Atlantic': ['NJ', 'NY', 'PA'],
    'East North Central': ['IL', 'IN', 'MI', 'OH', 'WI'],
    'West North Central': ['IA', 'KS', 'MN', 'MO', 'NE', 'ND', 'SD'],
    'South Atlantic': ['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'WV'],
    'East South Central': ['AL', 'KY', 'MS', 'TN'],
    'West South Central': ['AR', 'LA', 'OK', 'TX'],
    'Mountain': ['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY'],
    'Pacific': ['AK', 'CA', 'HI', 'OR', 'WA'],
}

state_to_division = {
    state: division
    for division, states in division_members.items()
    for state in states
}

# Look up each order's division from its state code
winery["Division"] = winery["State"].map(state_to_division)

# Spot-check the mapping on a few random rows
division_preview_columns = ["CustomerID", "OrderID", "State", "Division"]
winery[division_preview_columns].sample(5)

Unnamed: 0,CustomerID,OrderID,State,Division
4535,1081,61260,MA,New England
16626,3960,65203,AZ,Mountain
3763,853,61954,NH,New England
14869,3481,6399,ND,West North Central
15272,3566,70250,TX,West South Central


#### Map states to region

In [42]:
# States grouped by region; flattened below into a state -> region lookup
region_members = {
    'Northeast': ['CT', 'ME', 'MA', 'NH', 'NJ', 'NY', 'PA', 'RI', 'VT'],
    'Midwest': ['IL', 'IN', 'IA', 'KS', 'MI', 'MN', 'MO', 'NE', 'ND', 'OH', 'SD', 'WI'],
    'South': ['AL', 'AR', 'DE', 'FL', 'GA', 'KY', 'LA', 'MD', 'MS', 'NC',
              'OK', 'SC', 'TN', 'TX', 'VA', 'WV'],
    'West': ['AK', 'AZ', 'CA', 'CO', 'HI', 'ID', 'MT', 'NV', 'NM', 'OR',
             'UT', 'WA', 'WY'],
}

state_to_region = {
    state: region
    for region, states in region_members.items()
    for state in states
}

# Look up each order's region from its state code
winery["Region"] = winery["State"].map(state_to_region)

# Spot-check region and division side by side on a few random rows
region_preview_columns = ["State", "Region", "Division"]
winery[region_preview_columns].sample(5)


Unnamed: 0,State,Region,Division
39862,NY,Northeast,Middle Atlantic
5144,VA,South,South Atlantic
46206,IL,Midwest,East North Central
58749,VA,South,South Atlantic
60922,FL,South,South Atlantic


In [51]:
# Orders whose date falls in a calendar year earlier than the customer's YearAcquired
year_acquired_condition = winery["Date"].dt.year < winery["YearAcquired"]

inspect_columns = ["CustomerID", "OrderID", "Date", "YearAcquired",
                   "EmailSubscr", "NewsletterSubscr", "WinemakerCallSubscr"]

# A cell only renders its last expression, so a bare `winery[year_acquired_condition]`
# in the middle of the cell is evaluated and thrown away. Report the number of
# matching orders and display a few of them explicitly instead.
print("Orders dated before YearAcquired:", int(year_acquired_condition.sum()))
display(winery.loc[year_acquired_condition, inspect_columns].head(10))

# Last 50 orders, restricted to the columns relevant to this check
winery.tail(50)[inspect_columns]

Unnamed: 0,CustomerID,OrderID,Date,YearAcquired,EmailSubscr,NewsletterSubscr,WinemakerCallSubscr
64795,22834,68929,2009-12-22,2009,True,True,False
64796,22836,72683,2010-01-10,2005,False,False,False
64797,22836,86411,2010-04-14,2005,False,False,False
64798,22837,14498,2008-10-06,2010,False,False,False
64799,22837,57472,2009-10-06,2010,False,True,False
64800,22838,37086,2009-04-30,2008,False,False,False
64801,22838,48708,2009-08-06,2008,False,True,False
64802,22839,31234,2009-03-03,2010,False,False,False
64803,22839,40883,2009-06-08,2010,False,False,False
64804,22841,64395,2009-11-19,2008,False,False,False


In [52]:
# First 50 rows of the cleaned frame, including the new Division and Region columns
winery.iloc[:50]

Unnamed: 0,CustomerID,OrderID,CustomerSegment,Date,ZipCode,State,Sales2008,Sales2009,Sales2010,SaleAmount,Orders2008,Orders2009,Orders2010,YearAcquired,EmailSubscr,NewsletterSubscr,WinemakerCallSubscr,EmailSales,NewsletterSales,TastingRoomSales,WinemakerCallSales,Division,Region
0,1,1532,High Roller,2008-07-08,33467,FL,213.0,30903.1,13340.94,44.0,4,8,4,2008,True,True,True,0.0,0.0,44.0,0.0,South Atlantic,South
1,1,14378,High Roller,2008-10-05,33467,FL,213.0,30903.1,13340.94,47.0,4,8,4,2008,True,True,True,0.0,0.0,47.0,0.0,South Atlantic,South
2,1,17690,High Roller,2008-10-26,33467,FL,213.0,30903.1,13340.94,57.0,4,8,4,2008,True,True,True,0.0,57.0,0.0,0.0,South Atlantic,South
3,1,19808,High Roller,2008-11-08,33467,FL,213.0,30903.1,13340.94,65.0,4,8,4,2008,True,True,True,0.0,0.0,65.0,0.0,South Atlantic,South
4,1,25406,High Roller,2009-01-02,33467,FL,213.0,30903.1,13340.94,3889.0,4,8,4,2008,True,True,True,0.0,0.0,3889.0,0.0,South Atlantic,South
5,1,26019,High Roller,2009-01-09,33467,FL,213.0,30903.1,13340.94,5410.83,4,8,4,2008,True,True,True,0.0,5410.83,0.0,0.0,South Atlantic,South
6,1,39765,High Roller,2009-05-27,33467,FL,213.0,30903.1,13340.94,1573.88,4,8,4,2008,True,True,True,0.0,0.0,1573.88,0.0,South Atlantic,South
7,1,40916,High Roller,2009-06-08,33467,FL,213.0,30903.1,13340.94,1928.73,4,8,4,2008,True,True,True,0.0,0.0,1928.73,0.0,South Atlantic,South
8,1,47301,High Roller,2009-07-27,33467,FL,213.0,30903.1,13340.94,2069.86,4,8,4,2008,True,True,True,0.0,0.0,2069.86,0.0,South Atlantic,South
9,1,57351,High Roller,2009-10-05,33467,FL,213.0,30903.1,13340.94,5747.28,4,8,4,2008,True,True,True,0.0,0.0,5747.28,0.0,South Atlantic,South
