In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initial imports
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder
import os
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# import tensorflow as tf
# from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report




In [3]:
# Load the master wine table from the project's GitHub repository.
# The CSV is read with latin1 decoding.
MASTER_TABLE_URL = "https://raw.githubusercontent.com/saraegregg/Mod20_Group_Challenge/main/data_cleaning/ml_master_table.csv"
wine_df = pd.read_csv(MASTER_TABLE_URL, encoding='latin1')


In [4]:
# Pull in top keywords
# Each keyword becomes a 0/1 indicator column: 1 when the keyword occurs as a
# literal, case-sensitive substring of the description, 0 otherwise.
keywords = ['ripe', 'crisp', 'bright', 'dry', 'full', 'sweet', 'fresh', 'earthy', 'bitter', 'aftertaste']
for k in keywords:
    # Vectorised literal match (regex=False keeps plain-substring semantics);
    # na=False scores a missing description as 0 instead of raising on NaN.
    wine_df[k] = wine_df.description.str.contains(k, regex=False, na=False).astype(int)


In [5]:
# Remove the free-text description and the ID columns before modelling.
id_and_text_cols = ['description', 'wine_id', 'province_id', 'country_id', 'winery_id']
wine_df = wine_df.drop(columns=id_and_text_cols)


In [6]:
# Frequency of each raw review score, most common first.
point_counts = wine_df['points'].value_counts()
point_counts

88     13407
90     12071
87     12036
89      9878
91      9087
86      8931
92      7201
85      6646
93      4909
84      4091
94      2746
83      2015
95      1106
82      1004
81       426
96       364
80       274
97       145
98        53
99        16
100       14
Name: points, dtype: int64

In [7]:
def getPoints(points):
    """Map a numeric wine score to its rating-band label.

    Bands (upper bounds inclusive):
        '1' : points <= 85
        '2' : 85 < points <= 90
        '3' : 90 < points <= 95
        '4' : 95 < points <= 100

    Parameters
    ----------
    points : int or float
        Raw review score.

    Returns
    -------
    str
        One of '1', '2', '3', '4'.

    Raises
    ------
    ValueError
        If ``points`` is greater than 100 or is NaN. Failing here pinpoints
        the bad score instead of returning a sentence that would masquerade
        as a band label.
    """
    upper_bounds = ((85, '1'), (90, '2'), (95, '3'), (100, '4'))
    for bound, label in upper_bounds:
        if points <= bound:
            return label
    # Reached only for scores above 100 or NaN (every comparison with NaN is False).
    raise ValueError(f"points must be a number no greater than 100, got {points!r}")

In [8]:
# Derive the rating-band column 'Points' from the raw 'points' score.
wine_df['Points'] = wine_df['points'].map(getPoints)

In [9]:
# Frequency of each distinct price, most common first.
price_counts = wine_df['price'].value_counts()
price_counts

20     5729
15     5078
25     4737
30     3951
18     3853
       ... 
973       1
757       1
247       1
322       1
672       1
Name: price, Length: 377, dtype: int64

In [10]:
# Bin rare prices: every price seen fewer than 2500 times is relabelled "Other".
replace_price = list(price_counts[price_counts < 2500].index)

# Replace in dataframe with one vectorised pass, instead of one full-column
# .replace() call per rare value.
wine_df["price"] = wine_df["price"].mask(wine_df["price"].isin(replace_price), "Other")

# Check to make sure binning was successful
wine_df.price.value_counts()

Other    44716
20        5729
15        5078
25        4737
30        3951
18        3853
12        3317
13        3037
35        2955
40        2917
16        2861
10        2832
22        2689
14        2661
50        2559
17        2528
Name: price, dtype: int64

In [11]:
# Frequency of each grape variety, most common first.
variety_counts = wine_df['variety'].value_counts()
variety_counts

Pinot Noir                      9831
Chardonnay                      8298
Red Blend                       6844
Cabernet Sauvignon              6199
Bordeaux-style Red Blend        4812
                                ... 
Athiri                             1
Cercial                            1
Fruburgunder                       1
Muscat Blanc a Petits Grains       1
Bobal-Cabernet Sauvignon           1
Name: variety, Length: 654, dtype: int64

In [12]:
# Bin rare varieties: every variety seen fewer than 2000 times is relabelled "Other".
replace_variety = list(variety_counts[variety_counts < 2000].index)

# Replace in dataframe with one vectorised pass, instead of one full-column
# .replace() call per rare value.
wine_df["variety"] = wine_df["variety"].mask(wine_df["variety"].isin(replace_variety), "Other")

# Check to make sure binning was successful
wine_df.variety.value_counts()

Other                       38950
Pinot Noir                   9831
Chardonnay                   8298
Red Blend                    6844
Cabernet Sauvignon           6199
Bordeaux-style Red Blend     4812
Riesling                     4722
Sauvignon Blanc              3841
Syrah                        3103
Rose                         3033
Malbec                       2524
Portuguese Red               2196
Merlot                       2067
Name: variety, dtype: int64

In [13]:
# Frequency of each country, most common first.
country_counts = wine_df['country_name'].value_counts()
country_counts

US                        37503
France                    17439
Italy                     10113
Spain                      6528
Portugal                   4864
Chile                      4300
Argentina                  3835
Austria                    2810
Germany                    2093
Australia                  2001
New Zealand                1264
South Africa               1236
Israel                      484
Greece                      460
Canada                      253
Bulgaria                    141
Hungary                     138
Romania                     119
Uruguay                     109
Turkey                       90
Georgia                      84
Slovenia                     79
Croatia                      73
England                      69
Mexico                       65
Moldova                      59
Brazil                       47
Lebanon                      34
Morocco                      25
Peru                         16
Ukraine                      14
Macedoni

In [14]:
# Bin rare countries: every country seen fewer than 2000 times is relabelled "Other".
replace_country = list(country_counts[country_counts < 2000].index)

# Replace in dataframe with one vectorised pass, instead of one full-column
# .replace() call per rare value.
wine_df["country_name"] = wine_df["country_name"].mask(
    wine_df["country_name"].isin(replace_country), "Other"
)

# Check to make sure binning was successful
wine_df.country_name.value_counts()

US           37503
France       17439
Italy        10113
Spain         6528
Other         4934
Portugal      4864
Chile         4300
Argentina     3835
Austria       2810
Germany       2093
Australia     2001
Name: country_name, dtype: int64

In [15]:
# The band labels are produced as strings; store them as integers.
wine_df = wine_df.astype({'Points': int})

In [16]:
# Price now mixes numbers with the "Other" bin; cast everything to str so the
# column is handled as a single categorical feature.
wine_df = wine_df.astype({'price': str})

In [17]:
# wine_df = wine_df.drop(columns=['province', 'title', 'winery', 'taster_name'], axis=1) 

In [18]:
# Collect the names of every object-dtype column; these are the categorical
# features handed to the one-hot encoder.
wine_cat = [col for col, dtype in wine_df.dtypes.items() if dtype == "object"]

In [19]:
# Number of distinct levels per categorical column (one encoded column each).
wine_df.loc[:, wine_cat].nunique()

variety         13
price           16
country_name    11
dtype: int64

In [20]:
# Create a OneHotEncoder instance.
# The encoder is left at its default sparse output and densified with
# .toarray() below: the dense-output keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, so neither spelling works on every version.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(wine_df[wine_cat]).toarray()

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(enc, "get_feature_names_out"):
    encoded_names = enc.get_feature_names_out(wine_cat)
else:
    encoded_names = enc.get_feature_names(wine_cat)

# Reuse wine_df's index so a later index-aligned merge keeps every row even if
# wine_df's index is not a plain 0..n-1 range.
wine_encode_df = pd.DataFrame(encoded_values, index=wine_df.index, columns=encoded_names)



In [21]:
# Add the encoded columns to the dataframe, aligning rows on the index.
wine_df = wine_df.merge(wine_encode_df, left_index=True, right_index=True)
# Drop the original categorical columns now that their encoded versions exist.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
wine_df = wine_df.drop(columns=wine_cat)


In [22]:
# Remove the raw 'points' score; the banded 'Points' column derived from it
# stays in the frame.
wine_df = wine_df.drop(columns='points')

In [23]:
#wine_df.to_csv(r'../data_cleaning/ML_data2.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data_cleaning/ML_data2.csv'

In [None]:
# Inspect the dtype of every remaining column in the preprocessed frame.
wine_df.dtypes

In [None]:
# Split our preprocessed data into our features and target arrays
y = wine_df["Points"].values
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
X = wine_df.drop(columns=["Points"]).values

# Split the preprocessed data into a training and testing dataset
# NOTE(review): the rating bands are heavily imbalanced; consider stratify=y.
# Not applied here because it would change the split and the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [None]:
# Create a StandardScaler instances
# scaler = StandardScaler()

# # Fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Baseline random forest with default hyper-parameters; report accuracy on
# both splits to see how far training and testing scores diverge.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(f'Training Score: {train_score}')
print(f'Testing Score: {test_score}')

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Random forest with 128 trees and a fixed seed.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Train on the training split (fit() returns the estimator itself).
rf_model.fit(X_train, y_train)

# Predict the held-out split and report plain accuracy.
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

In [None]:
# Calculating the confusion matrix
# The class order is pinned explicitly so the matrix is always 4x4 and each
# row/column lines up with its band name, even if a band is absent from this
# test split (otherwise the 4-name index would not fit the matrix).
band_codes = [1, 2, 3, 4]
band_names = ["FINE", "GOOD", "VERY GOOD", "EXCEPTIONAL"]
cm = confusion_matrix(y_test, y_pred, labels=band_codes)
# scikit-learn puts true classes on rows and predicted classes on columns.
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(band_names, name="Actual"),
    columns=pd.Index(band_names, name="Predicted"),
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, y_pred)

In [None]:
# Show the confusion matrix, overall accuracy and the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
import pickle

In [None]:
# Persist the fitted random forest. The context manager closes the file handle
# (flushing the pickle to disk) even if serialisation raises.
with open("../model.p", "wb") as model_file:
    pickle.dump(rf_model, model_file)