# Model Development

## Load prerequisite packages and data

In [1]:
import pandas as pd
import requests
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import string, re
import itertools 
import pickle
import xgboost as xgb
# import local_modules.slack as slack
from local_modules.Pickling import pickle_item

In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

In [19]:
test_features = pickle.load(open('data/20191107-3_test_features.pkl', 'rb'))
test_labels = pickle.load(open('data/20191107-3_test_labels.pkl', 'rb'))
train_features = pickle.load(open('data/20191107-3_train_features.pkl', 'rb'))
train_labels = pickle.load(open('data/20191107-3_train_labels.pkl', 'rb'))

In [20]:
print(train_labels.shape)
print(train_features.shape)
print(test_labels.shape)
print(test_features.shape)

(11666,)
(11666, 13)
(3889,)
(3889, 13)


## Model #1: Random Forest

Testing a random forest model on predicting fake news

In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 1000, criterion = 'entropy', random_state = 0)
classifier.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [22]:
pickle_item("models/20191107-3_RandomForestClassifier.pkl", classifier)

Pickling completed


## Model #2: Voting Classifier

In [23]:
# Import the model we are using
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(solver='lbfgs', multi_class='multinomial',
                          random_state=1)
clf2 = RandomForestClassifier(n_estimators=500, random_state=1)
clf3 = GaussianNB()

eclf1 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
eclf1 = eclf1.fit(train_features, train_labels)
print(eclf1.predict(train_features))

[0 0 1 ... 1 0 1]


In [24]:
print(eclf1.predict(train_features)[1:10])
print(test_labels[1:10])

[0 1 0 0 0 0 0 2 0]
12766    2
6461     0
9896     0
13496    0
5099     1
9394     1
1688     1
8447     0
720      1
Name: label, dtype: int64


In [28]:
pickle_item("models/20191108-2_VotingClassifier.pkl", eclf1)

Pickling completed


In [None]:
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
y = np.array([1, 1, 1, 2, 2, 2])



np.array_equal(eclf1.named_estimators_.lr.predict(X),
               eclf1.named_estimators_['lr'].predict(X))

eclf2 = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft')
eclf2 = eclf2.fit(X, y)
print(eclf2.predict(X))

eclf3 = VotingClassifier(estimators=[
       ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
       voting='soft', weights=[2,1,1],
       flatten_transform=True)
eclf3 = eclf3.fit(X, y)
print(eclf3.predict(X))

print(eclf3.transform(X).shape)


## Model #3: XGBoost

- learning_rate: step size shrinkage used to prevent overfitting. Range is [0,1]
- max_depth: determines how deeply each tree is allowed to grow during any boosting round.
- subsample: percentage of samples used per tree. Low value can lead to underfitting.
- colsample_bytree: percentage of features used per tree. High value can lead to overfitting.
- n_estimators: number of trees you want to build.
- objective: determines the loss function to be used like reg:linear for regression problems, reg:logistic for classification problems with only decision, binary:logistic for classification problems with probability.


In [26]:
model = xgb.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
model.fit(train_features, train_labels)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [27]:
pickle_item("models/20191107-3_XGBClassifier.pkl", model)

Pickling completed


## Model #4: SGDClassifier

In [13]:
from sklearn.linear_model import SGDClassifier
SGD_model = SGDClassifier(alpha=0.8164183673469387, class_weight=None, eta0=0.01,
           fit_intercept=True, l1_ratio=0.36734693877551017,
           learning_rate='constant', loss='modified_huber', max_iter=1000,
           n_jobs=-1, penalty='none', power_t=0.8888888888888888,
           random_state=None, tol=0.01)
SGD_model.fit(train_features, train_labels)

SGDClassifier(alpha=0.8164183673469387, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.01, fit_intercept=True,
              l1_ratio=0.36734693877551017, learning_rate='constant',
              loss='modified_huber', max_iter=1000, n_iter_no_change=5,
              n_jobs=-1, penalty='none', power_t=0.8888888888888888,
              random_state=None, shuffle=True, tol=0.01,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [16]:
pickle_item("models/20191104_SGDClassifier.pkl", SGD_model)

Pickling completed


## 4. Neural network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(8, input_dim=4, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

## Brute force models using Hunga Bunga

In [4]:
from local_modules.hunga_bunga import HungaBungaClassifier

In [None]:
clf = HungaBungaClassifier(brain=True)
clf.fit(train_features, train_labels)

Scoring criteria: accuracy
--------------- model 1/15 ---------------
SGDClassifier
--------------- model 2/15 ---------------
LogisticRegression
--------------- model 3/15 ---------------
Perceptron
--------------- model 4/15 ---------------
PassiveAggressiveClassifier
--------------- model 5/15 ---------------
MLPClassifier
best score: 0.5143958868894601 time/clf: 2.819 seconds
best params:
{'activation': 'tanh',
 'batch_size': 50,
 'early_stopping': True,
 'hidden_layer_sizes': (64,),
 'learning_rate': 'invscaling',
 'max_iter': 500}
--------------- model 6/15 ---------------
KMeans
best score: 0.407369323050557 time/clf: 0.149 seconds
best params:
{'algorithm': 'elkan', 'init': 'random', 'n_clusters': 3}
--------------- model 7/15 ---------------
KNeighborsClassifier
--------------- model 8/15 ---------------
NearestCentroid
best score: 0.4011139674378749 time/clf: 0.004 seconds
best params:
{'metric': 'manhattan', 'shrink_threshold': 2}
--------------- model 9/15 ---------------
R

In [4]:
loaded_model = pickle.load(open("models/model.pkl", 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

ModuleNotFoundError: No module named 'azureml'