# Kaggle Sentiment Analysis Competition

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import RidgeClassifierCV
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.cross_validation import cross_val_score
import time
import os
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [2]:
# Load training and test data (80/20 Split)
data = np.loadtxt("training_data.txt", delimiter="|", skiprows=1)
dataX = data[:, 0:-1]
dataY = data[:, -1]
training_size = int(data.shape[0] * 0.8)

trainingX = data[0:training_size, 0:-1]
trainingY = data[0:training_size, -1]

testX = data[training_size:, 0:-1]
testY = data[training_size:, -1]

# For testing for submission

test_data = np.loadtxt("testing_data.txt", delimiter="|", skiprows=1)

# Ridge Regression

In [None]:
# Fit Model

# lambdas 
lambdas_arr = np.arange(500, 1000, 0.1)

#clf = RidgeClassifierCV(alphas=lambdas_arr, cv=5)
#clf.fit(dataX, dataY)

In [None]:
#print clf.score(trainingX, trainingY)
#print clf.alpha_

In [None]:
# Ridge Prediction
prediction = clf.predict(test_data)

f = open("RidgeCV.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(prediction)):
    f.write(str(x+1) + "," + str(int(prediction[x])) + "\n")
f.close()

# AdaBoost (Decision Tree)

In [None]:
training_arr = []
test_arr = []
estimator_arr = range(200, 1000, 10)

for i in estimator_arr:
    ada = AdaBoostClassifier(n_estimators=i, learning_rate=1)
    ada.fit(trainingX, trainingY)
    training_arr.append(ada.score(trainingX, trainingY))
    test_arr.append(ada.score(testX, testY))

In [None]:
plt.plot(estimator_arr, training_arr)
plt.plot(estimator_arr, test_arr)
plt.show()

In [None]:
print np.argmax(np.array(test_arr))
print estimator_arr[np.argmax(np.array(test_arr))]

In [None]:
ada1 = AdaBoostClassifier(n_estimators=estimator_arr[np.argmax(np.array(test_arr))], learning_rate=1)
ada1.fit(trainingX, trainingY)

In [None]:
prediction1 = ada1.predict(test_data)

f = open("AdaBoostDecisionTree.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(prediction)):
    f.write(str(x+1) + "," + str(int(prediction1[x])) + "\n")
f.close()

# Random Forest

In [None]:
# Set up lists to store training errors and test errors.
num_estimators_train_error = []
num_estimators_tests_error = []

# Train models with different minimum leaf sizes and record
# scores on the models.
min_n_estimators = 1
max_n_estimators = 200
step_size = 10
num_trials = 5
for n_estimator in np.arange(min_n_estimators, max_n_estimators, step_size):
    # Run trials for each minimum leaf size and average the values.
    train_trial_errors = []
    tests_trial_errors = []
    for i in range(0, num_trials):
        clf = RandomForestClassifier(n_estimators=n_estimator, n_jobs=-1)
        clf.fit(trainingX, trainingY)
        train_trial_errors.append(1 - clf.score(trainingX, trainingY))
        tests_trial_errors.append(1 - clf.score(testX, testY))
    num_estimators_train_error.append(np.mean(train_trial_errors))
    num_estimators_tests_error.append(np.mean(tests_trial_errors))
print num_estimators_train_error
print num_estimators_tests_error

In [None]:
# Plot the data
plt.figure(4, figsize=(8, 6))
plt.plot(np.arange(min_n_estimators, max_n_estimators, step_size), num_estimators_train_error, label='Training Error')
plt.plot(np.arange(min_n_estimators, max_n_estimators, step_size), num_estimators_tests_error, label='Test Error')
plt.legend(loc='best')
plt.title('Error Versus Number of Estimators')
plt.xlabel('Number of Estimators')
plt.ylabel('Error')
plt.show()

In [None]:
# Make a model based on best results from above
randfor_model1 = RandomForestClassifier(n_estimators=100)
randfor_model1.fit(dataX, dataY)

# Predict data and write to file.
randfor_predict1 = randfor_model1.predict(test_data)

f = open("RandomForestJoon.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(randfor_predict1)):
    f.write(str(x+1) + "," + str(int(randfor_predict1[x])) + "\n")
f.close()

In [None]:
# Set up lists to store training errors and test errors.
num_estimators_train_error = []
num_estimators_tests_error = []

# Train models with different minimum leaf sizes and record
# scores on the models.
for i in range(1, 60, 1):
    # Run trials for each minimum leaf size and average the values.
    clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=i, max_features=62, n_jobs=-1)
    clf.fit(trainingX, trainingY)
    num_estimators_train_error.append(1 - clf.score(trainingX, trainingY))
    num_estimators_tests_error.append(1 - clf.score(testX, testY))

In [None]:
# Plot the data
plt.figure(4, figsize=(8, 6))
plt.plot(range(1, 60, 1), num_estimators_train_error, label='Training Error')
plt.plot(range(1, 60, 1), num_estimators_tests_error, label='Test Error')
plt.legend(loc='best')
plt.xlabel('Max Feature')
plt.ylabel('Error')
plt.show()

In [None]:
randfor_model1 = RandomForestClassifier(n_estimators=10000000, min_samples_leaf=5, max_features=62, n_jobs=-1)
randfor_model1.fit(dataX, dataY)

In [None]:
# Predict data and write to file.
randfor_predict1 = randfor_model1.predict(test_data)

f = open("RandomForestJoon4.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(randfor_predict1)):
    f.write(str(x+1) + "," + str(int(randfor_predict1[x])) + "\n")
f.close()

# VotingClassifier

In [5]:
rf1 = RandomForestClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='gini')
rf2 = RandomForestClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='entropy')
et1 = ExtraTreesClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='gini')
et2 = ExtraTreesClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='entropy')

In [6]:
eclf1 = VotingClassifier(estimators=[('rf1', rf1), ('rf2', rf2), ('et1', et1), ('et2', et2)], voting='hard')
eclf1.fit(dataX, dataY)

VotingClassifier(estimators=[('rf1', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=62, max_leaf_nodes=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5000, n_jobs=-1,
            o...ators=5000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))],
         voting='hard', weights=None)

In [8]:
test = np.genfromtxt("VotingTest.csv", delimiter=',', dtype=int)[:, 1]
one = np.genfromtxt("Joon/Voting1.csv", delimiter=',', dtype=int)[:, 1]

In [12]:
right = 0
wrong = 0
for i in range(test.shape[0]):
    if test[i] == one[i]:
        right +=1
    else:
        wrong += 1
print right
print wrong
print wrong*1.0/(right + wrong)

1349
7
0.00516224188791


---

# XGBoost

In [None]:
dtrain = xgb.DMatrix( trainingX, label=trainingY)
dtest = xgb.DMatrix(testX, label=testY)

In [None]:
param = {'max_depth': 3, 'colsample_bytree': 0.5, 'silent':1, 'objective':'binary:logistic'}
param['eval_metric'] = 'error'
watchlist  = [(dtest,'test'), (dtrain,'train')]
bst = xgb.train(param, dtrain, 260)

In [None]:
xgpreds = np.around(bst.predict(dtest))
labels = dtest.get_label()
print ('score=%f' % (1- sum(1 for i in range(len(xgpreds)) if int(xgpreds[i]>0.5)!=labels[i]) /float(len(xgpreds))))

# GridSearchCV

In [1]:
from sklearn.grid_search import GridSearchCV

In [6]:
# Random Forest (Gini)
rf_gini = RandomForestClassifier(n_estimators=100, criterion='gini')
params1 = {'min_samples_leaf': range(1, 30, 4), 'max_features': range(22, 93, 5)}
rf_gini_grid = GridSearchCV(estimator=rf_gini, param_grid = params1, cv = 5, n_jobs=-1)
rf_gini_grid.fit(dataX, dataY)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92], 'min_samples_leaf': [1, 5, 9, 13, 17, 21, 25, 29]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [7]:
# Random Forest (Entropy)
rf_entropy = RandomForestClassifier(n_estimators=100, criterion='entropy')
rf_entropy_grid = GridSearchCV(estimator=rf_entropy, param_grid = params1, cv = 5, n_jobs=-1)
rf_entropy_grid.fit(dataX, dataY)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92], 'min_samples_leaf': [1, 5, 9, 13, 17, 21, 25, 29]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
# Extra Trees (Gini)
et_gini = ExtraTreesClassifier(n_estimators=100, criterion='gini')
params1 = {'min_samples_leaf': range(1, 30, 4), 'max_features': range(22, 93, 5)}
et_gini_grid = GridSearchCV(estimator=et_gini, param_grid = params1, cv = 5, n_jobs=-1)
et_gini_grid.fit(dataX, dataY)

GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92], 'min_samples_leaf': [1, 5, 9, 13, 17, 21, 25, 29]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [11]:
# Extra Trees (Entropy)
et_entropy = ExtraTreesClassifier(n_estimators=100, criterion='entropy')
et_entropy_grid = GridSearchCV(estimator=et_entropy, param_grid = params1, cv = 5, n_jobs=-1)
et_entropy_grid.fit(dataX, dataY)

GridSearchCV(cv=5, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [22, 27, 32, 37, 42, 47, 52, 57, 62, 67, 72, 77, 82, 87, 92], 'min_samples_leaf': [1, 5, 9, 13, 17, 21, 25, 29]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [None]:
from xgboost.sklearn import XGBClassifier
params2 = {
    'max_depth': range(4, 12),
    'learning_rate': [0.3, 0.2, 0.15, 0.1, 0.05, 0.01, 0.005, 0.001],
    'colsample_bytree': np.arange(0.3, 0.55, 0.05),
    'min_child_weight': range(4, 15, 2)
}

xg = XGBClassifier(n_estimators=100)
xg_grid = GridSearchCV(estimator=xg, param_grid = params2, cv = 5, n_jobs=-1)
xg_grid.fit(dataX, dataY)



In [None]:
rf_gini_predict = rf_gini_grid.predict(test_data)
rf_entropy_predict = rf_entropy_grid.predict(test_data)
et_gini_predict = et_gini_grid.predict(test_data)
et_entropy_predict = et_entropy_grid.predict(test_data)
xg_predict = xg_grid.predict(test_data)



In [None]:
grid_vote = np.around(np.average(np.vstack(rf_gini_predict, rf_entropy_predict, et_gini_predict, et_entropy_predict, xg_predict), axis=0))

f = open("GridVoting.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(vote_predict1)):
    f.write(str(x+1) + "," + str(int(vote_predict1[x])) + "\n")
f.close()

In [24]:
print rf_gini_grid.best_estimator_
print rf_entropy_grid.best_estimator_
print et_gini_grid.best_estimator_
print rf_entropy_grid.best_estimator_



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=42, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=27, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=67, max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, 

In [25]:
print rf_gini_grid.best_params_
print rf_entropy_grid.best_params_
print et_gini_grid.best_params_
print rf_entropy_grid.best_params_

{'max_features': 42, 'min_samples_leaf': 1}
{'max_features': 27, 'min_samples_leaf': 1}
{'max_features': 67, 'min_samples_leaf': 1}
{'max_features': 27, 'min_samples_leaf': 1}


In [147]:
print rf_gini_grid.best_score_
print rf_entropy_grid.best_score_
print et_gini_grid.best_score_
print et_entropy_grid.best_score_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [80]:
rf_gini = RandomForestClassifier(n_estimators=10000, min_samples_leaf=5, max_features=42, criterion='gini')
rf_entropy = RandomForestClassifier(n_estimators=10000, min_samples_leaf=5, max_features=27, criterion='entropy')
et_gini = ExtraTreesClassifier(n_estimators=10000, min_samples_leaf=5, max_features=67, criterion='gini')
et_entropy = RandomForestClassifier(n_estimators=10000, min_samples_leaf=5, max_features=27, criterion='entropy')

voting1 = VotingClassifier(estimators=[('rf_gini', rf_gini), ('rf_entropy', rf_entropy), ('et_gini', et_gini), ('et_entropy', et_entropy)], weights=[1, 1, 2, 1], voting='hard')

In [86]:
et_gini2 = ExtraTreesClassifier(n_estimators=10, min_samples_leaf=5, max_features=62, criterion='gini')

In [75]:
a = cross_val_score(voting1, dataX, dataY, cv=5, n_jobs=-1)
a

array([ 0.70167064,  0.7052506 ,  0.7052506 ,  0.67899761,  0.72998805])

In [76]:
np.average(a)

0.70423150072853669

In [92]:
b = cross_val_score(et_gini2, dataX, dataY, cv=5, n_jobs=-1)
np.average(b)

0.68274808028445721

# Test Models

## 2nd Best Model Ensemble

In [93]:
rf1 = RandomForestClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='gini')
rf2 = RandomForestClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='entropy')
et1 = ExtraTreesClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='gini')
et2 = ExtraTreesClassifier(n_estimators=5000, min_samples_leaf=5, max_features=62, n_jobs=-1, criterion='entropy')

In [108]:
start_time = time.time()
ensemble1 = VotingClassifier(estimators=[('rf1', rf1), ('rf2', rf2), ('et1', et1), ('et2', et2)], voting='hard')
ensemble1_cv_score = cross_val_score(ensemble1, dataX, dataY, cv=5, n_jobs=-1)
print ensemble1_cv_score
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
# below time should be 25.982 mins
print np.average(ensemble1_cv_score)
os.system('say "your program has finished"')

[ 0.69689737  0.70883055  0.71002387  0.66348449  0.73715651]
--- 259.82 mins ---


In [109]:
np.average(ensemble1_cv_score)

0.70327855763994029

## Random Forest RandomizedSearchCV

In [139]:
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
param_dist = {"max_features": sp_randint(20, 85),
              "min_samples_leaf": sp_randint(1, 11),
              "criterion": ["gini", "entropy"]}
n_iter_search = 60
rf_grid = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=n_iter_search, cv = 5, n_jobs=-1)

In [140]:
start_time = time.time()
rf_grid.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
print rf_grid.best_score_
print rf_grid.best_params_
os.system('say "your first program has finished"')

--- 21.00 mins ---
0.715445213655
{'max_features': 65, 'criterion': 'entropy', 'min_samples_leaf': 2}


0

## ExtraTrees RandomizedSearchCV

In [141]:
et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1)
et_grid = RandomizedSearchCV(estimator=et, param_distributions=param_dist, n_iter=n_iter_search, cv = 5, n_jobs=-1)

In [142]:
start_time = time.time()
et_grid.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
print et_grid.best_score_
print et_grid.best_params_
os.system('say "your second program has finished"')

--- 43.19 mins ---
0.715445213655
{'max_features': 81, 'criterion': 'entropy', 'min_samples_leaf': 2}


0

## XGBoost RandomizedSearchCV

In [None]:
param_dist2 = {
    'max_depth': sp_randint(3, 15),
    'learning_rate': [0.3, 0.2, 0.15, 0.1, 0.05, 0.01],
    'colsample_bytree': sp_uniform(0.4, 0.55),
    'min_child_weight': sp_randint(1, 10)
}

xg = XGBClassifier(n_estimators=500)
xg_grid = RandomizedSearchCV(estimator=xg, param_distributions=param_dist2, n_iter=n_iter_search, cv = 5, n_jobs=-1, verbose=10)

In [None]:
import datetime
datetime.datetime.now()

In [None]:
start_time = time.time()
xg_grid.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
print xg_grid.best_score_
print xg_grid.best_params_
os.system('say "your x g boost program has finished"')

In [None]:
# Try with more estimators
start_time = time.time()
xg2 = XGBClassifier(n_estimators=10000, learning_rate=0.15, colsample_bytree=0.473923, max_depth=9, min_child_weight=5)
xg2_score = cross_val_score(xg2, dataX, dataY, cv=5, n_jobs=-1)
print np.average(xg2_score)
print xg2_score
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
os.system('say "your x g boost c v has finished"')

## AdaBoosted Random Forest

In [176]:
#param_dist_adrf = {
#    'n_estimators': sp_randint(5, 15),
#    'learning_rate': sp_uniform(0, 1)
#}
adarf = RandomForestClassifier(n_estimators=500, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada = AdaBoostClassifier(base_estimator=adarf, n_estimators=10)

#ada_grid = RandomizedSearchCV(estimator=ada, param_distributions=param_dist_adrf, n_iter=30, cv = 5, n_jobs=-1)

In [177]:
start_time = time.time()
ada_cv_score = cross_val_score(ada, dataX, dataY, cv=5, n_jobs=-1)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
print np.average(ada_cv_score)
print ada_cv_score
os.system('say "your adaboosted random forest program has finished"')

--- 4.28 mins ---
0.723328286328
[ 0.71718377  0.72911695  0.73150358  0.69331742  0.74551971]


0

## AdaBoost

In [170]:
param_dist3 = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__max_depth" : sp_randint(1, 11),
              "base_estimator__max_features" : sp_randint(10, 100),
              "n_estimators": sp_randint(1000, 10000)
             }
dt = DecisionTreeClassifier()
ada2 = AdaBoostClassifier(base_estimator=dt, learning_rate=1)
ada2_grid = RandomizedSearchCV(estimator=ada2, param_distributions=param_dist3, cv = 5, n_jobs=-1)

In [171]:
start_time = time.time()
ada2_grid.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
print ada2_grid.best_score_
print ada2_grid.best_params_
os.system('say "your adaboost program has finished"')

--- 14.68 mins ---
0.657197421819
{'n_estimators': 3449, 'base_estimator__criterion': 'entropy', 'base_estimator__max_depth': 1, 'base_estimator__max_features': 32}


0

## Models and Ensembles for Submission

In [None]:
rf_sub = RandomForestClassifier(n_estimators=5000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
et_sub = ExtraTreesClassifier(n_estimators=5000, max_features=81, criterion='entropy', min_samples_leaf=2, n_jobs=-1)

In [291]:
# RF with 100000 trees
start_time = time.time()
rf_sub1 = RandomForestClassifier(n_estimators=100000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
rf_sub1.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))
os.system('say "Master, your random forest program has finished"')

--- 24.11 mins ---


0

In [292]:
# Predict data and write to file.
rf_sub1_predict = rf_sub1.predict(test_data)

f = open("RandomForest5.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(rf_sub1_predict)):
    f.write(str(x+1) + "," + str(int(rf_sub1_predict[x])) + "\n")
f.close()

os.system('say "Master, your file has been created."')

0

# CURRENT BEST MODEL

In [224]:
# Adaboosted Random Tree
start_time = time.time()
adarf_sub = RandomForestClassifier(n_estimators=10000, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada_sub = AdaBoostClassifier(base_estimator=adarf_sub, n_estimators=10)
ada_sub.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))

os.system('say "Your adaboost program has finished"')

--- 30.91 mins ---


0

In [225]:
# Predict data and write to file.
ada_sub_predict = ada_sub.predict(test_data)

f = open("AdaBoostRF2.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(ada_sub_predict)):
    f.write(str(x+1) + "," + str(int(ada_sub_predict[x])) + "\n")
f.close()

os.system('say "Your file has been created."')

0

## Try to Improve

In [None]:
params_ada = {'n_estimators': sp_randint(1, 30), 'learning_rate': [0.1, 0.5, 0.8, 0.01, 0.05]}

# Adaboosted Random Tree
start_time = time.time()
adarf_sub2 = RandomForestClassifier(n_estimators=100, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada_sub2 = AdaBoostClassifier(base_estimator=adarf_sub2)

ada_grid = RandomizedSearchCV(estimator=ada_sub2, param_distributions = params_ada, n_iter=60, cv = 5, n_jobs=-1)


ada_grid.fit(dataX, dataY)
print ada_grid.best_params_
print ada_grid.best_score_
print("--- %.2f mins ---" % ((time.time() - start_time)/60))

os.system('say "Your adaboost search has finished"')

# Run on Wing IDE

In [6]:
# Adaboosted Random Tree
start_time = time.time()
adarf_sub2 = RandomForestClassifier(n_estimators=100, max_features=65, criterion='entropy', min_samples_leaf=2, n_jobs=-1)
ada_sub2 = AdaBoostClassifier(base_estimator=adarf_sub2, n_estimators=30)
ada_sub2.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))

os.system('say "Your adaboost program has finished"')

--- 0.59 mins ---


0

In [None]:
# Predict data and write to file.
ada_sub2_predict = ada_sub2.predict(test_data)

f = open("AdaBoostRF3.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(ada_sub2_predict)):
    f.write(str(x+1) + "," + str(int(ada_sub2_predict[x])) + "\n")
f.close()

os.system('say "Your file has been created."')

In [None]:
import datetime
datetime.datetime.now()

### XGBoost

In [4]:
# Try with more estimators
start_time = time.time()
xg2 = XGBClassifier(n_estimators=10000, learning_rate=0.15, colsample_bytree=0.473923, max_depth=9, min_child_weight=5)
xg2.fit(dataX, dataY)
print("--- %.2f mins ---" % ((time.time() - start_time)/60))

os.system('say "Master, your adaboost program has finished"')

--- 19.30 mins ---


0

In [None]:
# Predict data and write to file.
xg2_predict = xg2.predict(test_data)

f = open("XGB1.csv", "w")
f.write("Id,Prediction\n")
for x in range(len(xg2_predict)):
    f.write(str(x+1) + "," + str(int(xg2_predict[x])) + "\n")
f.close()

os.system('say "Master, your file has been created."')

0