In [1]:
# Math 
from math import sqrt
from scipy import stats
import statistics
import os

# General
import numpy as np
import pandas as pd

# Sklearn Modeling
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

# Visuals
import matplotlib.pyplot as plt
import seaborn as sns
from graphviz import Graph
from tabulate import tabulate

# Custom Module's
from wrangle import wrangle


import warnings
warnings.filterwarnings("ignore")

# Modeling

In [2]:
# Unpack the three objects returned by the project's wrangle() helper into the
# train / validate / test splits that every cell below reads from.
train, validate, test = wrangle()

### Threshold allocation

In [3]:
# Upper bound on the absolute train-vs-validate accuracy gap: the "Passes
# Treshold Allocation" check printed after each fit below is True only when
# the (rounded) gap is strictly smaller than this value.
threshold = .05

# Baseline

In [39]:
# Class balance of the target in train: which value of excellent_rating is
# the most common one (that class becomes the baseline prediction).
train['excellent_rating'].value_counts()

0    395
1     74
Name: excellent_rating, dtype: int64

In [42]:
# Baseline = accuracy of always predicting class 0, the more frequent value of
# excellent_rating in train (see the value counts above).
baseline_accuracy = (train.excellent_rating == 0).mean()

# Let the format spec do the percent conversion and rounding; multiplying a
# rounded float by 100 can print artefacts such as 84.30000000000001.
print(f"Baseline Accuracy is: {baseline_accuracy:.1%}")

Baseline Accuracy is: 84.2%


# Decision Tree Classifier

### All Features just to see what happens

In [15]:
# Candidate predictors: every column of train except the target itself.
features = train.columns.tolist()
features.remove('excellent_rating')

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [16]:
# Depth-limited decision tree with a fixed random_state so reruns fit the same tree.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = clf.score(X_train, y_train)
validate_accuracy = clf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = clf.predict(X_validate)

print('Accuracy of Decision Tree classifier on training set: {:.3f}'
      .format(train_accuracy))
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of Decision Tree classifier on training set: 0.949
Accuracy of Decision Tree classifier on validate set: 0.965
Passes Treshold Allocation: True


#### All Features
    2 Depth
        - Accuracy of Decision Tree classifier on training set: 0.928
        - Accuracy of Decision Tree classifier on validate set: 0.915
    3 Depth
        - Accuracy of Decision Tree classifier on training set: 0.949
        - Accuracy of Decision Tree classifier on validate set: 0.965
    4 Depth
        - Accuracy of Decision Tree classifier on training set: 0.970
        - Accuracy of Decision Tree classifier on validate set: 0.945
    5 Depth
        - Accuracy of Decision Tree classifier on training set: 0.983
        - Accuracy of Decision Tree classifier on validate set: 0.915

In [17]:
# Display the fitted tree's importance scores, one value per column of X_train
# in column order (match positions against X_train.columns to name them).
clf.feature_importances_

array([0.        , 0.        , 0.07543937, 0.03267865, 0.64380653,
       0.        , 0.        , 0.13662215, 0.0622291 , 0.        ,
       0.        , 0.04922419, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

    Feature Importance
    1. aftertaste_scaled
    2. balance_scaled
    3. aroma_scaled
    4. uniformity_scaled

### All Features Take Aways
    - These results make sense since these features are directly correlated with the total score. Let's dig deeper.

## Removing any feature directly involved in the scoring of each coffee

In [147]:
# Start from every column of train, then strip the target and every feature
# directly involved in the scoring of each coffee.
features = train.columns.tolist()
for column in (
    'excellent_rating',  # target
    'aroma_scaled',
    'flavor_scaled',
    'aftertaste_scaled',
    'acidity_scaled',
    'body_scaled',
    'balance_scaled',
    'uniformity_scaled',
    'clean_cup_scaled',
    'sweetness_scaled',
    'cupper_points_scaled',
    'moisture_scaled',
    'category_one_defects_scaled',
    'quakers_scaled',
    'category_two_defects_scaled',
):
    # list.remove raises ValueError if a name is missing, same as before.
    features.remove(column)

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [148]:
# Depth-limited decision tree with a fixed random_state so reruns fit the same tree.
clf = DecisionTreeClassifier(max_depth=2, random_state=123)
clf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = clf.score(X_train, y_train)
validate_accuracy = clf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = clf.predict(X_validate)

print('Accuracy of Decision Tree classifier on training set: {:.3f}'
      .format(train_accuracy))
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of Decision Tree classifier on training set: 0.855
Accuracy of Decision Tree classifier on validate set: 0.841
Passes Treshold Allocation: True


#### All features Minus Scoring 
    - Best Depth 2 @ .84 on validate

In [152]:
# Display the fitted tree's importance scores, one value per column of X_train
# in column order (match positions against X_train.columns to name them).
clf.feature_importances_

array([0.        , 0.26276386, 0.57054195, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.16669419, 0.        ])

In [153]:
# Column names of the current feature matrix, shown so the importance array
# above can be matched to feature names by position.
X_train.columns

Index(['number_of_bags_scaled', 'harvest_year_scaled',
       'altitude_mean_meters_scaled', 'bag_weight_scaled',
       'variety_Bourbon_scaled', 'variety_Catuai_scaled',
       'variety_Caturra_scaled', 'variety_Mundo Novo_scaled',
       'variety_Other_scaled', 'variety_Typica_scaled',
       'variety_Yellow Bourbon_scaled',
       'processing_method_Natural / Dry_scaled',
       'processing_method_Semi-washed / Semi-pulped_scaled',
       'processing_method_Washed / Wet_scaled', 'color_Green_scaled',
       'grading_month_scaled', 'grading_year_scaled'],
      dtype='object')

    Feature Importance (From depth 2)
    1. 'altitude_mean_meters_scaled'
    2. 'harvest_year_scaled'
    3. 'processing_method_Natural / Dry_scaled'

### All Features minus any feature directly involved in the scoring of each coffee takeaways
    - Altitude was top feature
    - Did not beat baseline

## Only including features from exploration findings

In [154]:
# Hand-picked predictors carried over from the exploration findings.
features = [
    # coffee variety indicators
    'variety_Bourbon_scaled',
    'variety_Catuai_scaled',
    'variety_Caturra_scaled',
    'variety_Mundo Novo_scaled',
    'variety_Other_scaled',
    'variety_Typica_scaled',
    'variety_Yellow Bourbon_scaled',
    'harvest_year_scaled',
    # processing method indicators
    'processing_method_Natural / Dry_scaled',
    'processing_method_Semi-washed / Semi-pulped_scaled',
    'processing_method_Washed / Wet_scaled',
    'altitude_mean_meters_scaled',
    'bag_weight_scaled',
    'grading_month_scaled',
    'grading_year_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [155]:
# Depth-limited decision tree with a fixed random_state so reruns fit the same tree.
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
clf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = clf.score(X_train, y_train)
validate_accuracy = clf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = clf.predict(X_validate)

print('Accuracy of Decision Tree classifier on training set: {:.3f}'
      .format(train_accuracy))
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of Decision Tree classifier on training set: 0.866
Accuracy of Decision Tree classifier on validate set: 0.851
Passes Treshold Allocation: True


#### Only features from Exploration
    2 Depth
        - Accuracy of Decision Tree classifier on training set: 0.855
        - Accuracy of Decision Tree classifier on validate set: 0.841
        - Passes Treshold Allocation: True
    3 Depth
        - Accuracy of Decision Tree classifier on training set: 0.861
        - Accuracy of Decision Tree classifier on validate set: 0.851
        - Passes Treshold Allocation: True
    4 Depth
        - Accuracy of Decision Tree classifier on training set: 0.866
        - Accuracy of Decision Tree classifier on validate set: 0.851
        - Passes Treshold Allocation: True
    5 Depth
        - Accuracy of Decision Tree classifier on training set: 0.878
        - Accuracy of Decision Tree classifier on validate set: 0.821
        - Passes Treshold Allocation: False

In [34]:
# Display the fitted tree's importance scores, one value per entry of
# `features` in list order.
clf.feature_importances_

array([0.05533528, 0.03320117, 0.04618256, 0.        , 0.12469773,
       0.        , 0.        , 0.20406877, 0.        , 0.        ,
       0.        , 0.41310227, 0.12341224, 0.        , 0.        ])

    Feature Importance (from depth 4)
    1. altitude_mean_meters_scaled
    2. harvest_year_scaled
    3. variety_Other_scaled
    4. bag_weight_scaled

### Features from exploration takeaways
    - Excited to see altitude again
    - Not too happy with harvest year being a driving feature considering it had missing values and outliers
    - Going to run this again without harvest year
    - Depth 4 beating baseline and had a small fallout rate

## Only including features from exploration findings minus harvest year

In [156]:
# Exploration predictors again, this time without harvest_year_scaled.
features = [
    # coffee variety indicators
    'variety_Bourbon_scaled',
    'variety_Catuai_scaled',
    'variety_Caturra_scaled',
    'variety_Mundo Novo_scaled',
    'variety_Other_scaled',
    'variety_Typica_scaled',
    'variety_Yellow Bourbon_scaled',
    # processing method indicators
    'processing_method_Natural / Dry_scaled',
    'processing_method_Semi-washed / Semi-pulped_scaled',
    'processing_method_Washed / Wet_scaled',
    'altitude_mean_meters_scaled',
    'bag_weight_scaled',
    'grading_month_scaled',
    'grading_year_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [157]:
# Depth-limited decision tree with a fixed random_state so reruns fit the same tree.
clf = DecisionTreeClassifier(max_depth=4, random_state=123)
clf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = clf.score(X_train, y_train)
validate_accuracy = clf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = clf.predict(X_validate)

print('Accuracy of Decision Tree classifier on training set: {:.3f}'
      .format(train_accuracy))
print('Accuracy of Decision Tree classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of Decision Tree classifier on training set: 0.866
Accuracy of Decision Tree classifier on validate set: 0.841
Passes Treshold Allocation: True


#### Only features from Exploration (minus harvest year)
    2 Depth
        - Accuracy of Decision Tree classifier on training set: 0.855
        - Accuracy of Decision Tree classifier on validate set: 0.841
        - Passes Treshold Allocation: True
    3 Depth
        - Accuracy of Decision Tree classifier on training set: 0.861
        - Accuracy of Decision Tree classifier on validate set: 0.841
        - Passes Treshold Allocation: True
    4 Depth
        - Accuracy of Decision Tree classifier on training set: 0.866
        - Accuracy of Decision Tree classifier on validate set: 0.841
        - Passes Treshold Allocation: True
    5 Depth
        - Accuracy of Decision Tree classifier on training set: 0.878
        - Accuracy of Decision Tree classifier on validate set: 0.821
        - Passes Treshold Allocation: False

In [34]:
# Display the fitted tree's importance scores, one value per entry of
# `features` in list order.
# NOTE(review): the output recorded under this cell has 15 values and the same
# execution count (34) as the previous section's importances, while `features`
# in this section has 14 entries -- the output looks stale; re-run this cell.
clf.feature_importances_

array([0.05533528, 0.03320117, 0.04618256, 0.        , 0.12469773,
       0.        , 0.        , 0.20406877, 0.        , 0.        ,
       0.        , 0.41310227, 0.12341224, 0.        , 0.        ])

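    Feature Importance (from depth 4)
    NOTE: the array above is the unchanged 15-value output of the previous section (cell In [34]); this section's model has only 14 features, so re-run the cell and confirm this ranking before relying on it.
    1. bag_weight_scaled
    2. processing_method_Natural / Dry_scaled
    3. variety_Other_scaled
    4. grading_month_scaled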

### Features from exploration takeaways
    - Excited to see altitude again
    - Not too happy with harvest year being a driving feature considering it had missing values and outliers
    - Going to run this again without harvest year
    - Best Depth is 4 and it did not beat baseline

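# Decision Tree Classifier Takeaways
    - Highest validate accuracy was 0.965 (depth 3), but only with all features, including those directly involved in the scoring of each coffee
    - Best model without the scoring features used only the features from exploration (depth 3 or 4)
    - Accuracy of that Decision Tree classifier on validate set: 0.851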

# Random Forest Classifier

### Starting with all features minus those directly involved in the scoring of each coffee 

In [158]:
# Start from every column of train, then strip the target and every feature
# directly involved in the scoring of each coffee.
features = train.columns.tolist()
for column in (
    'excellent_rating',  # target
    'aroma_scaled',
    'flavor_scaled',
    'aftertaste_scaled',
    'acidity_scaled',
    'body_scaled',
    'balance_scaled',
    'uniformity_scaled',
    'clean_cup_scaled',
    'sweetness_scaled',
    'cupper_points_scaled',
    'moisture_scaled',
    'category_one_defects_scaled',
    'quakers_scaled',
    'category_two_defects_scaled',
):
    # list.remove raises ValueError if a name is missing, same as before.
    features.remove(column)

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))



In [194]:
# Random forest; min_samples_leaf and max_depth are the two knobs varied in the
# notes below, random_state is fixed so reruns build the same forest.
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=4,
                            random_state=123)
rf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = rf.score(X_train, y_train)
validate_accuracy = rf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = rf.predict(X_validate)

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of random forest classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of random forest classifier on training set: 0.86
Accuracy of random forest classifier on validate set: 0.856
Passes Treshold Allocation: True


### Leaf 1

    leaf = 1
    depth = 2
    Accuracy of random forest classifier on training set: 0.84
    Accuracy of random forest classifier on validate set: 0.841
    Passes Treshold Allocation: True
    
    
    leaf = 1
    depth = 3
    Accuracy of random forest classifier on training set: 0.86
    Accuracy of random forest classifier on validate set: 0.851
    Passes Treshold Allocation: True
    
    leaf = 1
    depth = 4
    Accuracy of random forest classifier on training set: 0.87
    Accuracy of random forest classifier on validate set: 0.856
    Passes Treshold Allocation: True

    leaf =1
    depth = 5
    Accuracy of random forest classifier on training set: 0.87
    Accuracy of random forest classifier on validate set: 0.856
    Passes Treshold Allocation: True
    
    leaf =1
    depth = 6
    
    Accuracy of random forest classifier on training set: .88
    Accuracy of random forest classifier on validate set: .851
    Passes Treshold Allocation: True

### Leaf 1 Takeaways
    - Best Depth accuracy was 4 and 5 at .856
    - Same drop off

### Leaf 2

    leaf = 2
    depth = 2
    Accuracy of random forest classifier on training set: 0.84
    Accuracy of random forest classifier on validate set: 0.841
    Passes Treshold Allocation: True
        
        
    leaf = 2
    depth = 3
    Accuracy of random forest classifier on training set: 0.86
    Accuracy of random forest classifier on validate set: 0.846
        
    leaf = 2
    depth = 4
    Accuracy of random forest classifier on training set: 0.86
    Accuracy of random forest classifier on validate set: 0.856
    Passes Treshold Allocation: True
    
    leaf = 2
    depth = 5
    Accuracy of random forest classifier on training set: 0.87
    Accuracy of random forest classifier on validate set: 0.856
    Passes Treshold Allocation: True
        
    leaf = 2
    depth = 6
    Accuracy of random forest classifier on training set: 0.87
    Accuracy of random forest classifier on validate set: 0.856
    Passes Treshold Allocation: True

### Leaf 2 Takeaways
    - best depth accuracy is at .856 at depth 4 with the smallest drop off 

### Leaf 3 Takeaways
    - Highest Accuracy of .856 Depth = 5 and 6
    - Same drop off at 5 and 6

### Leaf 4 Takeaways
    - Nothing was able to beat .856 validate with a .86 train

# Feature importance

In [195]:
# Display the fitted forest's importance scores, one value per column of
# X_train in column order.
rf.feature_importances_

array([0.10780214, 0.10374829, 0.27621181, 0.08795352, 0.02130283,
       0.00471277, 0.03538992, 0.00373342, 0.0467681 , 0.02528065,
       0.00462756, 0.01830175, 0.01368255, 0.01464527, 0.03429909,
       0.11370608, 0.08783422])

In [196]:
# Print "<position> =  <importance>" for every feature so each score can be
# matched to its column position in X_train.
for i, value in enumerate(rf.feature_importances_):
    print(f"{i} =  {float(value)}")

0 =  0.10780214483396387
1 =  0.10374828847048027
2 =  0.276211813317326
3 =  0.08795352168208638
4 =  0.02130283345530789
5 =  0.004712773027360678
6 =  0.03538992459790825
7 =  0.0037334202925443745
8 =  0.046768103809999115
9 =  0.02528065175060813
10 =  0.0046275641010602355
11 =  0.018301745868925176
12 =  0.013682552208828936
13 =  0.014645270108920472
14 =  0.03429909494323926
15 =  0.11370607840060698
16 =  0.08783421913083393


In [None]:
# The five features with the highest random-forest importances printed above,
# in descending order. Collected into one list: as bare string statements
# (with the commas missing after the last two) the cell would only have
# displayed the final string.
top_rf_features = [
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'number_of_bags_scaled',
    'harvest_year_scaled',
    'bag_weight_scaled',
]
top_rf_features

### Only features that were deemed important

In [254]:
# Restrict the model to the five highest-importance features from the forest above.
features = [
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'number_of_bags_scaled',
    'harvest_year_scaled',
    'bag_weight_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [256]:
# Random forest on the reduced feature set; random_state is fixed so reruns
# build the same forest.
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=5,
                            random_state=123)
rf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = rf.score(X_train, y_train)
validate_accuracy = rf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = rf.predict(X_validate)

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of random forest classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of random forest classifier on training set: 0.87
Accuracy of random forest classifier on validate set: 0.861
Passes Treshold Allocation: True


    - Goal is finding a validate over .856 with less than .004 drop off

leaf = 2, depth = 5 beat it (0.861 on validate)

In [252]:
### Features from exploration, without harvest year
# (not the five "important" features used in the cell above)

features = [
    'variety_Bourbon_scaled',
    'variety_Catuai_scaled',
    'variety_Caturra_scaled',
    'variety_Mundo Novo_scaled',
    'variety_Other_scaled',
    'variety_Typica_scaled',
    'variety_Yellow Bourbon_scaled',
    'processing_method_Natural / Dry_scaled',
    'processing_method_Semi-washed / Semi-pulped_scaled',
    'processing_method_Washed / Wet_scaled',
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'grading_year_scaled',
    'bag_weight_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train = train[features]
X_validate = validate[features]
X_test = test[features]

# Matching target vectors.
y_train = train['excellent_rating']
y_validate = validate['excellent_rating']
y_test = test['excellent_rating']

# Random forest with a fixed random_state so reruns build the same forest.
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=1,
                            n_estimators=100,
                            max_depth=4,
                            random_state=123)
rf.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = rf.score(X_train, y_train)
validate_accuracy = rf.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = rf.predict(X_validate)

print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of random forest classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of random forest classifier on training set: 0.86
Accuracy of random forest classifier on validate set: 0.856
Passes Treshold Allocation: True


### Forest Takeaways
    - leaf 2, depth 5  .861 on validate
    - using only these features
        - 'altitude_mean_meters_scaled',
        - 'grading_month_scaled',
        - 'number_of_bags_scaled',
        - 'harvest_year_scaled'
        - 'bag_weight_scaled'

# K-Nearest Neighbors

### Start with all features minus those that directly apply to score

In [292]:
### Features from exploration, without harvest year

features = [
    # coffee variety indicators
    'variety_Bourbon_scaled',
    'variety_Catuai_scaled',
    'variety_Caturra_scaled',
    'variety_Mundo Novo_scaled',
    'variety_Other_scaled',
    'variety_Typica_scaled',
    'variety_Yellow Bourbon_scaled',
    # processing method indicators
    'processing_method_Natural / Dry_scaled',
    'processing_method_Semi-washed / Semi-pulped_scaled',
    'processing_method_Washed / Wet_scaled',
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'grading_year_scaled',
    'bag_weight_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))



In [309]:
# K-nearest-neighbours classifier: 12 neighbours, each with an equal vote.
knn = KNeighborsClassifier(n_neighbors=12, weights='uniform')
knn.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = knn.score(X_train, y_train)
validate_accuracy = knn.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = knn.predict(X_validate)

print('Accuracy of K Neighbors classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of K Neighbors classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of K Neighbors classifier on training set: 0.84
Accuracy of K Neighbors classifier on validate set: 0.841
Passes Treshold Allocation: True


### KNN Takeaway notes
    - No luck beating .861

# Logistic Regression

In [321]:
# Same five-feature set that the best random forest above was fitted on.
features = [
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'number_of_bags_scaled',
    'harvest_year_scaled',
    'bag_weight_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [332]:
# Logistic regression with C=.01 and a fixed random_state.
logit = LogisticRegression(C=.01, random_state=123)
logit.fit(X_train, y_train)

# Score each split once and reuse the values for both the report and the
# train-vs-validate gap check (previously every score was computed twice).
train_accuracy = logit.score(X_train, y_train)
validate_accuracy = logit.score(X_validate, y_validate)

# Only the validate predictions are kept; the training-set predictions were
# overwritten before anything read them.
y_pred = logit.predict(X_validate)

print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(train_accuracy))
print('Accuracy of Logistic Regression classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print(f"Passes Treshold Allocation: {threshold > round(abs(train_accuracy-validate_accuracy), 3)}")

Accuracy of Logistic Regression classifier on training set: 0.84
Accuracy of Logistic Regression classifier on validate set: 0.841
Passes Treshold Allocation: True


### Logistic Regression Takeaway
    -  no luck with beating .861
    - I did try different features and c level. I just changed the values in place due to time

# Best Validate Model Takeaway

### Forest Takeaways
    - leaf 2, depth 5  .861 on validate
    - using only these features
        - 'altitude_mean_meters_scaled',
        - 'grading_month_scaled',
        - 'number_of_bags_scaled',
        - 'harvest_year_scaled'
        - 'bag_weight_scaled'

# Running Test

In [333]:
# Feature set of the best validate model (random forest, leaf 2, depth 5).
features = [
    'altitude_mean_meters_scaled',
    'grading_month_scaled',
    'number_of_bags_scaled',
    'harvest_year_scaled',
    'bag_weight_scaled',
]

# Feature matrices for the three splits, all restricted to the same columns.
X_train, X_validate, X_test = (split[features] for split in (train, validate, test))

# Matching target vectors.
y_train, y_validate, y_test = (split['excellent_rating'] for split in (train, validate, test))

In [334]:
# Refit the selected random forest (leaf 2, depth 5) and evaluate it on all
# three splits, including the held-out test set.
rf = RandomForestClassifier(bootstrap=True,
                            class_weight=None,
                            criterion='gini',
                            min_samples_leaf=2,
                            n_estimators=100,
                            max_depth=5,
                            random_state=123)
rf.fit(X_train, y_train)

# Score each split once and reuse the stored values in the report
# (previously every score was computed twice).
train_accuracy = rf.score(X_train, y_train)
validate_accuracy = rf.score(X_validate, y_validate)
test_accuracy = rf.score(X_test, y_test)

# Only the test predictions are kept; the train and validate predictions were
# overwritten before anything read them.
y_pred = rf.predict(X_test)

print('Accuracy of Random Forest classifier on training set: {:.3f}'
      .format(train_accuracy))
print('Accuracy of Random Forest classifier on validate set: {:.3f}'
      .format(validate_accuracy))
print('Accuracy of Random Forest classifier on test set: {:.3f}'
      .format(test_accuracy))

Accuracy of Random Forest classifier on training set: 0.874
Accuracy of Random Forest classifier on validate set: 0.861
Accuracy of Random Forest classifier on test set: 0.851
