In [None]:
This is being used to try out and visualize different methods of classification.

In [124]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import GradientBoostingClassifier as GB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.neighbors import KNeighborsClassifier as KN
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier as GPC
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [115]:
pkl_file = open('../data/df.pkl', 'rb')
df = pickle.load(pkl_file)
pkl_file.close() 

In [116]:
features = ['top_elev_(ft)', 
            'bottom_elev_(ft)', 
            'vert_rise_(ft)', 
            'slope_length_(ft)', 
            'avg_width_(ft)', 
            'slope_area_(acres)', 
            'avg_grade_(%)', 
            'max_grade_(%)', 
            'groomed']

X = df[features].values

In [117]:
y = df['colors'].values

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=True)

In [119]:
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [120]:
rf = RF()
lr = LR()
gb = GB()
mlp = MLP()
kn = KN()
svc = SVC()
gpc = GPC()
rbf = RBF()
dt = DT()
abc = ABC()
gnb = GNB()

In [121]:
def check_model(name, model,X_train,y_train,X_test,y_test):
    model.fit(X_train,y_train)
    print("{} train score: {}".format(name, model.score(X_train,y_train)))
    print("{} test score: {}".format(name, model.score(X_test,y_test)))
    print('\n')

In [122]:
names = ['Random Forest',
         'Logistic Regression',
         'Gradient Boosting Classifier',
         'MLP Classifier',
         'KNeighbors Classifier',
         'SVC',
         'Gaussian Process Classifier',
         'Decision Tree Classifier',
         'AdaBoost Classifier',
         'Gaussian Naive Bayes']
models = [rf,lr,gb,mlp,kn,svc,gpc,dt,abc,gnb]

In [123]:
for name, model in zip(names,models):
    check_model(name, model,X_train,y_train,X_test,y_test)

Random Forest train score: 0.9814077025232404
Random Forest test score: 0.728494623655914


Logistic Regression train score: 0.6812749003984063
Logistic Regression test score: 0.6801075268817204


Gradient Boosting Classifier train score: 0.9654714475431607
Gradient Boosting Classifier test score: 0.7123655913978495


MLP Classifier train score: 0.8100929614873837
MLP Classifier test score: 0.7231182795698925


KNeighbors Classifier train score: 0.8193891102257637
KNeighbors Classifier test score: 0.7043010752688172


SVC train score: 0.7835325365205843
SVC test score: 0.717741935483871


Gaussian Process Classifier train score: 0.8300132802124834
Gaussian Process Classifier test score: 0.696236559139785


Decision Tree Classifier train score: 1.0
Decision Tree Classifier test score: 0.6666666666666666


AdaBoost Classifier train score: 0.6042496679946879
AdaBoost Classifier test score: 0.5456989247311828


Gaussian Naive Bayes train score: 0.7184594953519257
Gaussian Naive Bayes test 

# Trying w/ resort AND color as target

In [107]:
y_resortcolor = zip(df.resort.values,df.colors.values)

In [108]:
y2 = [' '.join(x) for x in list(y_resortcolor)]

In [110]:
y2;

In [111]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(df[features].values, y2, test_size=0.33, random_state=42, shuffle=True)

In [112]:
ss2 = StandardScaler()
ss2.fit(X_train2)
X_train2 = ss2.transform(X_train2)
X_test2 = ss2.transform(X_test2)

In [113]:
for name, model in zip(names,models):
    check_model(name, model,X_train2,y_train2,X_test2,y_test2)

Random Forest train score: 0.9893758300132802
Random Forest test score: 0.3467741935483871


Logistic Regression train score: 0.3545816733067729
Logistic Regression test score: 0.2903225806451613


Gradient Boosting Classifier train score: 1.0
Gradient Boosting Classifier test score: 0.3387096774193548


MLP Classifier train score: 0.5245683930942895
MLP Classifier test score: 0.3575268817204301


KNeighbors Classifier train score: 0.5391766268260292
KNeighbors Classifier test score: 0.3118279569892473


SVC train score: 0.4395750332005312
SVC test score: 0.30913978494623656


Gaussian Process Classifier train score: 0.6321381142098274
Gaussian Process Classifier test score: 0.3521505376344086


Decision Tree Classifier train score: 1.0
Decision Tree Classifier test score: 0.2903225806451613


AdaBoost Classifier train score: 0.18725099601593626
AdaBoost Classifier test score: 0.16129032258064516


Gaussian Naive Bayes train score: 0.31208499335989376
Gaussian Naive Bayes test score: 0

# Trying to correct for overfitting in RF and GB

In [133]:
def param_search(names, models, params, X_train, X_test, y_train, y_test):
    best_models = []
    for name, model, param in zip(names, models, params):
        print("\n########## {} model ##########".format(name))
        clf = GridSearchCV(model, param, cv=5, scoring='accuracy', n_jobs=-1)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print('Best score: {:.4f}'.format(clf.best_score_))
        best_models.append(clf.best_estimator_)
    return best_models

In [134]:
names2 = ['Random Forest', 'Gradient Boosting Classifier']
models2 = [RF(),GB()]
RFparams = [{'n_estimators': [10, 20, 30], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 3, 10], 'min_samples_split': [5, 10, 20], 'min_samples_leaf': [5, 10, 20], 'max_features': ['sqrt', 'log2'], 'max_leaf_nodes': [50, 100, 500], 'bootstrap': [True, False], 'warm_start': [False, True], 'class_weight': [None, 'balanced']}]
GBCparams = [{'learning_rate': np.logspace(-2, 0, num=3), 'max_depth': [1, 3, 10], 'min_samples_leaf': [1, 3, 10], 'subsample': [1.0, 0.5], 'max_features': [None, 'sqrt'], 'n_estimators': [100]}]
params2 = [RFparams,GBCparams]

In [135]:
param_search(names2,models2,params2,X_train, X_test, y_train, y_test)


########## Random Forest model ##########
Best parameters set found on development set:
{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 10, 'max_features': 'log2', 'max_leaf_nodes': 100, 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 30, 'warm_start': True}
Best score: 0.7570

########## Gradient Boosting Classifier model ##########
Best parameters set found on development set:
{'learning_rate': 0.01, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'n_estimators': 100, 'subsample': 0.5}
Best score: 0.7517


[RandomForestClassifier(bootstrap=True, class_weight='balanced',
             criterion='gini', max_depth=10, max_features='log2',
             max_leaf_nodes=100, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=5,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=True),
 GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.01, loss='deviance', max_depth=10,
               max_features='sqrt', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=10, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               presort='auto', random_state=None, subsample=0.5, verbose=0,
               warm_start=False)]

In [137]:
best_RF = RF(bootstrap=True, class_weight='balanced',
             criterion='gini', max_depth=10, max_features='log2',
             max_leaf_nodes=100, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=5,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=30, n_jobs=1, oob_score=False, random_state=None,
             verbose=0, warm_start=True)
best_GB =  GB(criterion='friedman_mse', init=None,
               learning_rate=0.01, loss='deviance', max_depth=10,
               max_features='sqrt', max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=10, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               presort='auto', random_state=None, subsample=0.5, verbose=0,
               warm_start=False)

In [140]:
check_model('Best RF', best_RF,X_train,y_train,X_test,y_test)

Best RF train score: 0.8778220451527224
Best RF test score: 0.7123655913978495




In [141]:
check_model('Best GB', best_GB,X_train,y_train,X_test,y_test)

Best GB train score: 0.8326693227091634
Best GB test score: 0.7123655913978495




# Clearly it is hard to classify runs by their color - since different resorts have different criteria