# Ensembling

In [1]:
pwd

'/Users/kelly/metis_v3/Project_3'

In [111]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [4]:
import pickle
with open('beer_df_formodel.pickle','rb') as read_file:
    beer_df = pickle.load(read_file)
beer_df.head(1)

Unnamed: 0,beerid,name,style,styleid,size,og,fg,abv,ibu,color,...,lager,stout,porter,saison,kolsch,ale,apa,wheat,pilsner,style_num
0,1,Vanilla Cream Ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,0,0,0,0,0,7,0,0,0,7


In [5]:
beer_df.columns

Index(['beerid', 'name', 'style', 'styleid', 'size', 'og', 'fg', 'abv', 'ibu',
       'color', 'boilsize', 'boiltime', 'boilgravity', 'efficiency',
       'brewmethod', 'allgrain', 'biab', 'partialmash', 'extract', 'style',
       'styleid', 'ipa', 'lager', 'stout', 'porter', 'saison', 'kolsch', 'ale',
       'apa', 'wheat', 'pilsner', 'style_num'],
      dtype='object')

In [7]:
X = beer_df.iloc[:,4:-13].drop(columns = 'brewmethod')

In [8]:
y = beer_df.style_num

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 4444)

**Gradient Boost**

In [293]:
gbdt_model = GradientBoostingClassifier(n_estimators = 500, random_state = 4444) #Max Depth Default = 3

In [294]:
gbdt_model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=4444, subsample=1.0, verbose=0,
              warm_start=False)

In [295]:
gbdt_model.score(X_train, y_train)

0.714044900773887

In [296]:
gbdt_model.score(X_test, y_test )

0.6210369487485101

**Random Forest**

In [289]:
rf_model = RandomForestClassifier(n_estimators = 500, max_depth = 10, random_state=4444)

In [290]:
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,
            oob_score=False, random_state=4444, verbose=0,
            warm_start=False)

In [291]:
rf_model.score(X_test, y_test)

0.6122765196662694

**Ensemble Model - Voting Classifer**

In [297]:
ensemble_classifier = VotingClassifier(estimators = [('Gradient Boost Decision Tree', gbdt_model), ('Random Forest Model', rf_model)],voting = 'soft') 

In [298]:
ensemble_classifier = ensemble_classifier.fit(X_train, y_train)

In [299]:
ensemble_pred = ensemble_classifier.predict(X_test)

In [300]:
ensemble_pred.shape

(16780,)

In [301]:
accuracy_score(y_test, ensemble_pred)

0.6244338498212157

In [305]:
with open ('ensemble_model.pickle', 'wb') as to_write:
    pickle.dump(ensemble_classifier, to_write)

**Ensemble Model - Parent Model**

In [95]:
#IPA, Stout, Ale, APA, Wheat, Pilsner performed best using Gradient Boosted Decision Trees
#Style numbers 1, 3, 7, 8, 9, 10
GBDT_styles = [1,3,7,8,9,10]

#Lager, Porter, Saison, Kolsch performed better using Random Forest than GBDT
#Style numbers 2, 4, 5, 6

In [106]:
beer_df['GBDT_styles'] = 1 * beer_df.style_num.isin(GBDT_styles)

In [109]:
beer_df.head(5)

Unnamed: 0,beerid,name,style,styleid,size,og,fg,abv,ibu,color,...,stout,porter,saison,kolsch,ale,apa,wheat,pilsner,style_num,GBDT_styles
0,1,Vanilla Cream Ale,Cream Ale,45,21.77,1.055,1.013,5.48,17.65,4.83,...,0,0,0,0,7,0,0,0,7,1
2,5,Bakke Brygg Belgisk Blonde 50 l,Belgian Blond Ale,20,50.0,1.06,1.01,6.48,17.84,4.57,...,0,0,0,0,7,0,0,0,7,1
3,6,Sierra Nevada Pale Ale Clone,American Pale Ale,10,24.61,1.055,1.013,5.58,40.12,8.0,...,0,0,0,0,0,8,0,0,8,1
4,8,Spotted Clown (New Glarus Spotted Cow clone),Cream Ale,45,20.82,1.054,1.014,5.36,19.97,5.94,...,0,0,0,0,7,0,0,0,7,1
5,9,Chocolate Vanilla Porter,Robust Porter,129,22.71,1.06,1.016,5.77,31.63,34.76,...,0,4,0,0,0,0,0,0,4,0


In [132]:
beer_df.shape

(55933, 33)

In [133]:
beer_df.GBDT_styles.sum()

45912

In [279]:
beer_df.GBDT_styles.sum()/beer_df.shape[0]

0.8208392183505265

Logistic Regression Model

In [229]:
'''

Using Logistic Regression to determine if I want to use Gradient Boosted Decision Trees
or Random Forest.

Add classifiers to training and testing sets.

Split on classifiers to determine which model to perform.

Train each model on its own training set and test on its own testing set.

'''

log_model = LogisticRegression(C = 10, random_state = 4444)


In [230]:
y_parentmodel = beer_df.GBDT_styles

In [231]:
X_parentmodel = X

In [232]:
X_parentmodel.head(1)

Unnamed: 0,size,og,fg,abv,ibu,color,boilsize,boiltime,boilgravity,efficiency,allgrain,biab,partialmash,extract
0,21.77,1.055,1.013,5.48,17.65,4.83,28.39,75,1.038,70.0,1,0,0,0


In [233]:
X_train_parent, X_test_parent, y_train_parent, y_test_parent = train_test_split(X_parentmodel, y_parentmodel, test_size = 0.3, random_state = 4444)

In [234]:
log_model.fit(X_train_parent, y_train_parent)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=4444, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [235]:
log_model.score(X_test_parent, y_test_parent)

0.8227651966626937

In [252]:
classifier_preds_train = log_model.predict(X_train_parent)
classifier_preds_test = log_model.predict(X_test_parent)

In [253]:
classifier_preds_train.shape

(39153,)

In [254]:
classifier_preds_test.shape

(16780,)

In [255]:
classifier_preds_train.sum()

39080

In [276]:
classifier_preds_train.sum()/classifier_preds_train.shape

array([0.99813552])

In [256]:
classifier_preds_test.sum()

16750

In [277]:
classifier_preds_test.sum()/classifier_preds_test.shape

array([0.99821216])

In [257]:
y_train.shape

(39153,)

In [258]:
beer_training_data = X_train.join(y_train)
beer_testing_data = X_test.join(y_test)

In [259]:
beer_training_data['classifier_preds_train'] = classifier_preds_train
beer_testing_data['classifier_preds_test'] = classifier_preds_test

In [260]:
beer_training_data.head(1)

Unnamed: 0,size,og,fg,abv,ibu,color,boilsize,boiltime,boilgravity,efficiency,allgrain,biab,partialmash,extract,style_num,classifier_preds_train
56199,20.82,1.06,1.02,5.28,40.33,8.74,23.47,60,1.053,90.0,1,0,0,0,8,1


In [261]:
beer_testing_data.head(1)

Unnamed: 0,size,og,fg,abv,ibu,color,boilsize,boiltime,boilgravity,efficiency,allgrain,biab,partialmash,extract,style_num,classifier_preds_test
15882,41.64,1.048,1.013,4.55,15.91,5.25,49.21,60,1.041,72.0,1,0,0,0,7,1


Gradient Boost

In [262]:
gbdt_train = beer_training_data[beer_training_data['classifier_preds_train'] == 1]
gbdt_test = beer_testing_data[beer_testing_data['classifier_preds_test'] == 1]

In [263]:
gbdt_y_train = gbdt_train['style_num']
gbdt_y_test = gbdt_test['style_num']

In [264]:
gbdt_X_train = gbdt_train.iloc[:,:14]
gbdt_X_test = gbdt_test.iloc[:,:14]

In [265]:
gbdt_model.fit(gbdt_X_train, gbdt_y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=4444, subsample=1.0, verbose=0,
              warm_start=False)

In [266]:
gbdt_model.score(gbdt_X_train, gbdt_y_train)

0.6469549641760491

In [267]:
gbdt_model.score(gbdt_X_test, gbdt_y_test)

0.6194029850746269

Random Forest

In [268]:
rf_model_parent = RandomForestClassifier(max_depth = 10, random_state=4444)

In [269]:
rf_train = beer_training_data[beer_training_data['classifier_preds_train'] == 0]
rf_test = beer_testing_data[beer_testing_data['classifier_preds_test'] == 0]

In [270]:
rf_y_train = rf_train['style_num']
rf_y_test = rf_test['style_num']

In [271]:
rf_X_train = rf_train.iloc[:,:14]
rf_X_test = rf_test.iloc[:,:14]

In [272]:
rf_model_parent.fit(rf_X_train, rf_y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=4444, verbose=0,
            warm_start=False)

In [273]:
rf_model.score(rf_X_train, rf_y_train)

0.9863013698630136

In [274]:
rf_model.score(rf_X_test, rf_y_test)

0.4666666666666667