## Models

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [181]:
df = pd.read_csv('wineReview_top100.csv')  # top 100 (90) wine varieties with more than 100 reviews
df_10 = pd.read_csv('wineReview_top10.csv')  # top 10 wine varieties

In [183]:
df.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [196]:
# drop the na in df_10
df_10.dropna( axis=0, inplace = True)
#df_10.drop(['Unnamed: 0'], axis=1, inplace=True)


In [197]:
df_10.isna().count()


description    67024
points         67024
price          67024
variety        67024
acidity        67024
               ...  
wood           67024
years          67024
cluster        67024
rating_bin     67024
price_bin      67024
Length: 104, dtype: int64

In [211]:
df_10.describe


<bound method NDFrame.describe of                                              description  points  price  \
0      Pineapple rind, lemon pith and orange blossom ...      87   13.0   
1      Much like the regular bottling from 2012, this...      87   65.0   
2      Soft, supple plum envelopes an oaky structure ...      87   19.0   
3      Slightly reduced, this wine offers a chalky, t...      87   34.0   
4      Building on 150 years and six generations of w...      87   12.0   
...                                                  ...     ...    ...   
71317  This blend of Cabernet Sauvignon-Merlot and Ca...      90   35.0   
71318  Fresh and fruity, this is full of red cherry f...      90   48.0   
71319  A bouquet of black cherry, tart cranberry and ...      90   20.0   
71320  Notes of honeysuckle and cantaloupe sweeten th...      90   28.0   
71321  Citation is given as much as a decade of bottl...      90   75.0   

                        variety   acidity       age  along  along

In [123]:
# limit variety to a winelist
wineList = df_10.variety.unique().tolist()

In [198]:
desc = df_10.description.values
variety = df_10.variety.values
rate = df_10.rating_bin.values
price = df_10.price_bin.values

train_reviews = desc[:45000]
train_sentiments = variety[:45000]
train_sentiments_rate = rate[:45000]
train_sentiments_price = price[:45000]

test_reviews = desc[45001:]
test_sentiments = variety[45001:]
test_sentiments_rate = rate[45001:]
test_sentiments_price = price[45001:]

In [212]:
len(train_sentiments)
len(train_reviews)
len(test_reviews)

22023

In [243]:
#Feature Engineering

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.05, max_df=1.0, ngram_range=(1,3))
cv_train_features = cv.fit_transform(train_reviews)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.05, max_df=1.0, ngram_range=(1,3),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(train_reviews)


In [244]:
# transform test reviews into features

cv_test_features = cv.transform(test_reviews)
tv_test_features = tv.transform(test_reviews)

In [245]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (45000, 145)  Test features shape: (22023, 145)
TFIDF model:> Train features shape: (45000, 145)  Test features shape: (22023, 145)


In [246]:
# logistic Regression
#%%time

# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(penalty='l2', max_iter=750, C=1, solver='lbfgs', random_state=42)

# train model
lr.fit(cv_train_features, train_sentiments)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

In [247]:
from sklearn.metrics import confusion_matrix, classification_report

labels = wineList
print(classification_report(test_sentiments, lr_bow_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, lr_bow_predictions), index=labels, columns=labels)

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.64      0.62      0.63      1746
      Cabernet Sauvignon       0.57      0.63      0.60      3050
              Chardonnay       0.63      0.83      0.71      3645
                  Merlot       0.75      0.38      0.50      1027
              Pinot Noir       0.66      0.73      0.70      4237
               Red Blend       0.67      0.65      0.66      2667
                Riesling       0.66      0.66      0.66      1640
                    Rosé       0.65      0.46      0.54      1102
         Sauvignon Blanc       0.70      0.47      0.56      1531
                   Syrah       0.69      0.46      0.55      1378

                accuracy                           0.64     22023
               macro avg       0.66      0.59      0.61     22023
            weighted avg       0.65      0.64      0.64     22023



Unnamed: 0,Riesling,Pinot Noir,Cabernet Sauvignon,Chardonnay,Red Blend,Merlot,Sauvignon Blanc,Bordeaux-style Red Blend,Rosé,Syrah
Riesling,1077,181,56,36,128,231,6,23,1,7
Pinot Noir,121,1935,97,49,480,256,12,15,11,74
Cabernet Sauvignon,8,15,3038,2,90,11,293,30,155,3
Chardonnay,37,274,28,386,178,74,5,9,4,32
Red Blend,221,330,255,3,3094,122,48,83,24,57
Merlot,187,326,44,31,208,1721,8,49,5,88
Sauvignon Blanc,2,9,428,0,25,3,1087,32,54,0
Bordeaux-style Red Blend,18,32,198,5,184,47,65,509,31,13
Rosé,1,6,632,0,34,5,111,13,720,9
Syrah,16,275,77,4,233,84,10,24,22,633


In [248]:
# logistic Regression
#%%time

# Logistic Regression model on TF_IDF features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr_tv = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr_tv.fit(tv_train_features, train_sentiments)

# predict on test data
lr_tv_tfidf_predictions = lr_tv.predict(tv_test_features)

In [216]:
labels = wineList
print(classification_report(test_sentiments, lr_tv_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, lr_tv_tfidf_predictions), index=labels, columns=labels)

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.63      0.63      0.63      1746
      Cabernet Sauvignon       0.57      0.63      0.60      3050
              Chardonnay       0.63      0.83      0.72      3645
                  Merlot       0.76      0.37      0.50      1027
              Pinot Noir       0.66      0.73      0.69      4237
               Red Blend       0.66      0.65      0.66      2667
                Riesling       0.66      0.66      0.66      1640
                    Rosé       0.66      0.43      0.52      1102
         Sauvignon Blanc       0.70      0.47      0.56      1531
                   Syrah       0.70      0.45      0.55      1378

                accuracy                           0.64     22023
               macro avg       0.66      0.59      0.61     22023
            weighted avg       0.65      0.64      0.64     22023



Unnamed: 0,Riesling,Pinot Noir,Cabernet Sauvignon,Chardonnay,Red Blend,Merlot,Sauvignon Blanc,Bordeaux-style Red Blend,Rosé,Syrah
Riesling,1099,180,48,32,110,243,5,21,4,4
Pinot Noir,128,1924,89,49,484,261,15,16,12,72
Cabernet Sauvignon,14,15,3035,1,107,11,290,23,146,3
Chardonnay,43,265,29,381,187,74,7,10,3,28
Red Blend,240,319,239,5,3095,135,43,80,30,51
Merlot,174,312,46,28,225,1741,9,42,7,83
Sauvignon Blanc,3,8,427,0,39,2,1081,25,55,0
Bordeaux-style Red Blend,24,35,201,5,197,53,67,478,26,16
Rosé,3,9,622,0,36,9,107,14,722,9
Syrah,17,282,76,2,239,91,10,17,23,621


In [249]:
# Logistic Rating

# Logistic Regression model on TF_IDF features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr_tv = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr_tv.fit(tv_train_features, train_sentiments_rate)

# predict on test data
lr_tv_tfidf_predictions = lr_tv.predict(tv_test_features)

In [250]:
labels = [0,1,2,3]
print(classification_report(test_sentiments_rate, lr_tv_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments_rate, lr_tv_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

         0.0       0.75      0.12      0.21      1191
         1.0       0.65      0.72      0.68     10220
         2.0       0.62      0.69      0.65      9379
         3.0       0.68      0.02      0.03      1233

    accuracy                           0.64     22023
   macro avg       0.67      0.39      0.39     22023
weighted avg       0.64      0.64      0.61     22023



Unnamed: 0,0,1,2,3
0,143,971,77,0
1,42,7320,2853,5
2,6,2863,6506,4
3,0,118,1096,19


## Logistic - Price

In [220]:
# Logistic Rating

# Logistic Regression model on TF_IDF features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr_tv = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr_tv.fit(tv_train_features, train_sentiments_price)

# predict on test data
lr_tv_tfidf_predictions = lr_tv.predict(tv_test_features)

In [221]:
labels = [0,1,2,3,4,5]
print(classification_report(test_sentiments_price, lr_tv_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments_price, lr_tv_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

         0.0       0.45      0.59      0.51      5073
         1.0       0.30      0.18      0.23      4889
         2.0       0.33      0.40      0.36      5635
         3.0       0.41      0.47      0.44      4882
         4.0       0.00      0.00      0.00       970
         5.0       0.42      0.03      0.05       574

    accuracy                           0.38     22023
   macro avg       0.32      0.28      0.26     22023
weighted avg       0.36      0.38      0.36     22023



Unnamed: 0,0,1,2,3,4,5
0,2985,790,946,350,0,2
1,1811,897,1551,626,1,3
2,1151,760,2264,1455,0,5
3,543,387,1659,2288,0,5
4,78,88,263,535,0,6
5,27,28,133,370,1,15


## Random Forest Model


In [251]:
# Random Forest model on BOW features
from sklearn.ensemble import RandomForestClassifier

# instantiate model
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

# train model
rf.fit(cv_train_features, cv_train_sentiments)

# predict on test data
rf_bow_predictions = rf.predict(cv_test_features)

NameError: name 'cv_train_sentiments' is not defined

In [126]:
labels = wineList
print(classification_report(test_sentiments, rf_bow_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, rf_bow_predictions), index=labels, columns=labels)

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.84      0.76      0.80      4135
      Cabernet Sauvignon       0.75      0.81      0.78      7274
              Chardonnay       0.73      0.94      0.82      8655
                  Merlot       0.93      0.64      0.76      2379
              Pinot Noir       0.78      0.86      0.82      9945
               Red Blend       0.83      0.82      0.82      6580
                Riesling       0.85      0.77      0.81      3822
                    Rosé       0.92      0.60      0.73      2603
         Sauvignon Blanc       0.89      0.67      0.76      3655
                   Syrah       0.89      0.66      0.76      3186

                accuracy                           0.80     52234
               macro avg       0.84      0.75      0.79     52234
            weighted avg       0.81      0.80      0.80     52234



Unnamed: 0,Riesling,Pinot Noir,Cabernet Sauvignon,Chardonnay,Red Blend,Merlot,Sauvignon Blanc,Bordeaux-style Red Blend,Rosé,Syrah
Riesling,3142,248,86,25,220,369,5,29,2,9
Pinot Noir,101,5898,165,44,694,275,16,4,16,61
Cabernet Sauvignon,10,18,8145,1,81,22,226,13,135,4
Chardonnay,50,358,60,1518,265,91,7,6,6,18
Red Blend,222,409,439,3,8572,141,38,62,28,31
Merlot,166,469,70,30,361,5367,2,13,6,96
Sauvignon Blanc,1,8,743,0,64,4,2953,5,44,0
Bordeaux-style Red Blend,23,59,338,7,339,115,87,1567,27,41
Rosé,2,7,1040,0,25,10,126,7,2437,1
Syrah,19,424,127,10,348,109,10,4,23,2112


In [179]:
# Random Forest model on TF-IDF features

# train model
rf.fit(tv_train_features, train_sentiments)

# predict on test data
rf_tfidf_predictions = rf.predict(tv_test_features)

In [180]:
labels = wineList
print(classification_report(test_sentiments, rf_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments, rf_tfidf_predictions), index=labels, columns=labels)

                          precision    recall  f1-score   support

Bordeaux-style Red Blend       0.73      0.64      0.68      1759
      Cabernet Sauvignon       0.63      0.70      0.66      3092
              Chardonnay       0.61      0.91      0.73      3678
                  Merlot       0.85      0.44      0.58      1035
              Pinot Noir       0.67      0.78      0.72      4267
               Red Blend       0.72      0.71      0.71      2690
                Riesling       0.75      0.64      0.69      1651
                    Rosé       0.80      0.37      0.51      1118
         Sauvignon Blanc       0.80      0.47      0.59      1551
                   Syrah       0.80      0.48      0.60      1392

                accuracy                           0.68     22233
               macro avg       0.74      0.61      0.65     22233
            weighted avg       0.70      0.68      0.67     22233



Unnamed: 0,Riesling,Pinot Noir,Cabernet Sauvignon,Chardonnay,Red Blend,Merlot,Sauvignon Blanc,Bordeaux-style Red Blend,Rosé,Syrah
Riesling,1121,167,54,21,115,256,2,16,3,4
Pinot Noir,73,2179,115,29,468,168,13,5,10,32
Cabernet Sauvignon,4,12,3331,1,60,16,162,7,82,3
Chardonnay,37,224,40,457,198,56,4,3,4,12
Red Blend,178,255,309,5,3313,91,20,52,17,27
Merlot,101,309,50,19,235,1901,4,8,1,62
Sauvignon Blanc,2,7,515,0,42,4,1049,4,28,0
Bordeaux-style Red Blend,14,34,243,5,232,73,61,414,18,24
Rosé,2,4,692,0,27,9,86,4,727,0
Syrah,11,295,86,2,238,61,5,4,18,672


### RandomForest - Predict Rating

In [224]:
# train model
rf.fit(tv_train_features, train_sentiments)

# predict on test data
rf_tfidf_predictions = rf.predict(tv_test_features)

In [226]:
labels = [0,1,2,3,4,5]
print(classification_report(test_sentiments_rate, lr_tv_tfidf_predictions))
pd.DataFrame(confusion_matrix(test_sentiments_rate, lr_tv_tfidf_predictions), index=labels, columns=labels)

              precision    recall  f1-score   support

         0.0       0.12      0.65      0.20      1191
         1.0       0.54      0.16      0.24     10220
         2.0       0.48      0.35      0.40      9379
         3.0       0.15      0.66      0.24      1233
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

    accuracy                           0.29     22023
   macro avg       0.21      0.30      0.18     22023
weighted avg       0.47      0.29      0.31     22023



Unnamed: 0,0,1,2,3,4,5
0,770,142,229,50,0,0
1,4270,1589,3026,1326,1,8
2,1500,1167,3272,3430,0,10
3,55,52,289,818,1,18
4,0,0,0,0,0,0
5,0,0,0,0,0,0


## Decision Tree Classifiers - Estimate Ratings

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics

In [143]:
#df.drop(['points'], axis=1)
#print(df.columns.tolist())
X = df.drop(['points'], axis = 1)
y = df['points']

In [144]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)



In [145]:
class_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = None, random_state = 1)
class_tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

In [146]:
# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = class_tree.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
pd.Series(y_pred)

0        85
1        82
2        90
3        92
4        85
         ..
34124    85
34125    88
34126    90
34127    90
34128    90
Length: 34129, dtype: int64

In [148]:
# Run this block for model evaluation metrics 
print("Model Gini - no max depth")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Yes"' , metrics.precision_score(y_test,y_pred, average = 'macro'))
#print('Precision score for "No"' , metrics.precision_score(y_test,y_pred, average = 'macro'))
print('Recall score for "Yes"' , metrics.recall_score(y_test,y_pred, average = 'macro'))
#print('Recall score for "No"' , metrics.recall_score(y_test,y_pred, average = 'macro'))

Model Gini - no max depth
Accuracy: 0.24046998154062527
Balanced accuracy: 0.15013028373861115
Precision score for "Yes" 0.152697536290423
Recall score for "Yes" 0.15013028373861115


## Decision Tree Classifiers - Estimate Price - 100 different varieties.

In [227]:
print(df.columns.tolist())

['description', 'points', 'price', 'variety', 'acidity', 'age', 'along', 'alongside', 'also', 'apple', 'aromas', 'balance', 'balanced', 'berry', 'bit', 'black', 'blackberry', 'blend', 'bright', 'cabernet', 'character', 'cherry', 'chocolate', 'citrus', 'clean', 'concentrated', 'creamy', 'crisp', 'currant', 'dark', 'dense', 'dried', 'drink', 'dry', 'fine', 'finish', 'firm', 'flavor', 'flavors', 'fresh', 'fruit', 'fruits', 'fruity', 'full', 'fullbodied', 'give', 'good', 'green', 'herb', 'herbal', 'hint', 'juicy', 'leather', 'lemon', 'licorice', 'light', 'like', 'long', 'made', 'merlot', 'mineral', 'mouth', 'nose', 'note', 'notes', 'oak', 'offers', 'opens', 'palate', 'peach', 'pear', 'pepper', 'pinot', 'plum', 'raspberry', 'red', 'rich', 'ripe', 'sauvignon', 'shows', 'smooth', 'soft', 'spice', 'spicy', 'still', 'structure', 'style', 'sweet', 'tannic', 'tannins', 'tart', 'texture', 'theres', 'toast', 'touch', 'vanilla', 'well', 'white', 'wine', 'wood', 'years', 'cluster', 'rating_bin', 'pri

In [242]:
# will need to drop the records that have no Price point.

#df.dropna(axis=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122213 entries, 0 to 122212
Columns: 104 entries, description to price_bin
dtypes: float64(100), int64(2), object(2)
memory usage: 97.0+ MB


In [238]:
X = df.drop(['price','description','variety'], axis = 1)
y = df['price']

In [239]:
#features = ['points', 'acidity', 'age', 'along', 'alongside', 'also', 'apple', 'aromas', 'balance', 'balanced', 'berry', 'bit', 'black', 'blackberry', 'blend', 'bright', 'cabernet', 'character', 'cherry', 'chocolate', 'citrus', 'clean', 'concentrated', 'creamy', 'crisp', 'currant', 'dark', 'dense', 'dried', 'drink', 'dry', 'fine', 'finish', 'firm', 'flavor', 'flavors', 'fresh', 'fruit', 'fruits', 'fruity', 'full', 'fullbodied', 'give', 'good', 'green', 'herb', 'herbal', 'hint', 'juicy', 'leather', 'lemon', 'licorice', 'light', 'like', 'long', 'made', 'merlot', 'mineral', 'mouth', 'nose', 'note', 'notes', 'oak', 'offers', 'opens', 'palate', 'peach', 'pear', 'pepper', 'pinot', 'plum', 'raspberry', 'red', 'rich', 'ripe', 'sauvignon', 'shows', 'smooth', 'soft', 'spice', 'spicy', 'still', 'structure', 'style', 'sweet', 'tannic', 'tannins', 'tart', 'texture', 'theres', 'toast', 'touch', 'vanilla', 'well', 'white', 'wine', 'wood', 'years', 'variety_Aglianico', 'variety_Albariño', 'variety_Alvarinho', 'variety_Barbera', 'variety_Blaufränkisch', 'variety_Bonarda', 'variety_Bordeaux-style Red Blend', 'variety_Bordeaux-style White Blend', 'variety_Cabernet Franc', 'variety_Cabernet Sauvignon', 'variety_Cabernet Sauvignon-Merlot', 'variety_Cabernet Sauvignon-Syrah', 'variety_Carmenère', 'variety_Champagne Blend', 'variety_Chardonnay', 'variety_Chenin Blanc', 'variety_Corvina, Rondinella, Molinara', 'variety_Dolcetto', 'variety_Fiano', 'variety_Friulano', 'variety_G-S-M', 'variety_Gamay', 'variety_Garganega', 'variety_Garnacha', 'variety_Gewürztraminer', 'variety_Glera', 'variety_Greco', 'variety_Grenache', 'variety_Grenache Blanc', 'variety_Grillo', 'variety_Grüner Veltliner', 'variety_Malbec', 'variety_Melon', 'variety_Mencía', 'variety_Meritage', 'variety_Merlot', 'variety_Monastrell', 'variety_Montepulciano', 'variety_Moscato', 'variety_Mourvèdre', 'variety_Muscat', 'variety_Nebbiolo', 'variety_Nerello Mascalese', "variety_Nero d'Avola", 'variety_Petit Verdot', 'variety_Petite Sirah', 'variety_Pinot Bianco', 'variety_Pinot Blanc', 'variety_Pinot Grigio', 'variety_Pinot Gris', 'variety_Pinot Nero', 'variety_Pinot Noir', 'variety_Pinotage', 'variety_Port', 'variety_Portuguese Red', 'variety_Portuguese White', 'variety_Primitivo', 'variety_Prosecco', 'variety_Red Blend', 'variety_Rhône-style Red Blend', 'variety_Rhône-style White Blend', 'variety_Riesling', 'variety_Rosato', 'variety_Rosé', 'variety_Roussanne', 'variety_Sagrantino', 'variety_Sangiovese', 'variety_Sangiovese Grosso', 'variety_Sauvignon', 'variety_Sauvignon Blanc', 'variety_Shiraz', 'variety_Sparkling Blend', 'variety_Syrah', 'variety_Sémillon', 'variety_Tannat', 'variety_Tempranillo', 'variety_Tempranillo Blend', 'variety_Tinta de Toro', 'variety_Torrontés', 'variety_Touriga Nacional', 'variety_Turbiana', 'variety_Verdejo', 'variety_Verdicchio', 'variety_Vermentino', 'variety_Vernaccia', 'variety_Viognier', 'variety_Viura', 'variety_White Blend', 'variety_Zinfandel', 'variety_Zweigelt']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)



In [240]:
class_tree = DecisionTreeClassifier(criterion = 'gini', max_depth = None, random_state = 1)
class_tree.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [31]:
# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = class_tree.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
pd.Series(y_pred)

0        80.0
1        38.0
2        60.0
3        40.0
4        14.0
         ... 
34124    12.0
34125    13.0
34126    15.0
34127    25.0
34128    38.0
Length: 34129, dtype: float64

In [32]:
# Run this block for model evaluation metrics 
print("Model Entropy - no max depth")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))
print('Precision score for "Yes"' , metrics.precision_score(y_test,y_pred, average = 'macro'))
#print('Precision score for "No"' , metrics.precision_score(y_test,y_pred, average = 'macro'))
print('Recall score for "Yes"' , metrics.recall_score(y_test,y_pred, average = 'macro'))
#print('Recall score for "No"' , metrics.recall_score(y_test,y_pred, average = 'macro'))

Model Entropy - no max depth
Accuracy: 0.1503120513346421
Balanced accuracy: 0.0752606285055364
Precision score for "Yes" 0.06313683422227907
Recall score for "Yes" 0.06422894028811249


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
