In [44]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath("../utils"))

from transformdata import X_train , X_test , y_train , y_test

print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 115782 stored elements and shape (5282, 8514)>
  Coords	Values
  (0, 8232)	0.25040301540124993
  (0, 4942)	0.3020279644797659
  (0, 8490)	0.35033272125468834
  (0, 4441)	0.3234652478407264
  (0, 8488)	0.36312218487299586
  (0, 4105)	0.38340866342101343
  (0, 131)	0.33611694335583137
  (0, 2844)	0.2517338974471896
  (0, 1880)	0.40436391696063495
  (1, 7601)	0.2784559751372677
  (1, 7729)	0.09738084451627431
  (1, 5232)	0.09808615400744047
  (1, 4703)	0.1430758576048859
  (1, 749)	0.12250483887130896
  (1, 3058)	0.23850803657884237
  (1, 295)	0.13710348759118834
  (1, 3780)	0.19010422491120965
  (1, 5160)	0.14617333031576848
  (1, 4827)	0.13934484340556888
  (1, 8103)	0.13230030210340882
  (1, 6646)	0.2278493798821864
  (1, 4739)	0.18793978521385618
  (1, 4879)	0.1632334529054089
  (1, 4875)	0.15457500539157024
  (1, 1764)	0.31126502871675993
  :	:
  (5280, 3547)	0.2253921362512485
  (5280, 999)	0.31897367003444643
  (5280, 13

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import time
from xgboost import XGBClassifier

# Notes
## Interpreting Results of these tuners
The tuners both output data in dictionary format with two items:
- n_estimators
- accuracy

The number of estimators will be equal to the model number + 1

In [46]:
def rfTuner(X_train, y_train, X_test, y_test, epochs=100):
    print('Initializing Tuner...')
    output = {} # saving info on models
    best_acc = 0 # tracking highest accuracy
    start = time.time() # timing
    for i in range(epochs): # user-defined loop length
        model = RandomForestClassifier(n_estimators=i+1) # changing estimators
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc = accuracy_score(y_test, preds)
        output[i] = {'estimators':i+1, # output appending
                     'acc':acc}
        print(f'model complete: {i}', end='\r')
        if acc > best_acc: # updating highest accuracy
            print(f'New best Accuracy! Model {i}')
            print(f'{round(acc*100,2)}%')
            best_acc = acc
    print(f'Total runtime: {(time.time() - start)/60} minutes')
    return output


In [47]:
saved = rfTuner(X_train, y_train, X_test, y_test, epochs=200)

Initializing Tuner...
New best Accuracy! Model 0
80.69%
New best Accuracy! Model 1
81.89%
New best Accuracy! Model 2
83.42%
New best Accuracy! Model 3
83.48%
New best Accuracy! Model 4
84.21%
New best Accuracy! Model 5
84.67%
New best Accuracy! Model 7
85.12%
New best Accuracy! Model 10
85.52%
New best Accuracy! Model 11
85.75%
New best Accuracy! Model 12
85.97%
New best Accuracy! Model 13
86.48%
New best Accuracy! Model 18
86.66%
New best Accuracy! Model 19
87.68%
New best Accuracy! Model 39
87.79%
New best Accuracy! Model 51
88.42%
Total runtime: 9.164245422681173 minutes


In [48]:
def xgbTuner(X_train, y_train, X_test, y_test, epochs=100):
    output = {} # this one is basically the same as the other 
    best_acc = 0
    start = time.time()
    for i in range(epochs):
        model = XGBClassifier(objective='multi:softmax', num_class=5, n_estimators=i+1, learning_rate=0.1)
        ytrain_mod = [item-1 for item in y_train] # 1-5 in data, xgb likes starting from 0
        ytest_mod = [item-1 for item in y_test] # these lines just make a new list thats the same as before but -1
        model.fit(X_train, ytrain_mod)
        preds = model.predict(X_test)
        acc = accuracy_score(ytest_mod, preds)
        output[i] = {'estimators':i+1,
                     'acc':acc}
        print(f'model complete: {i}', end='\r')
        if acc > best_acc:
            print(f'New best Accuracy! Model {i}')
            print(f'{round(acc*100,2)}%')
            best_acc = acc
    print(f'Total runtime: {(time.time() - start)/60} minutes')
        

In [49]:
saved2 = xgbTuner(X_train, y_train, X_test, y_test, epochs=200)

New best Accuracy! Model 0
75.07%
New best Accuracy! Model 1
75.87%
New best Accuracy! Model 2
76.66%
New best Accuracy! Model 3
77.06%
New best Accuracy! Model 4
77.8%
New best Accuracy! Model 5
78.76%
New best Accuracy! Model 6
78.82%
New best Accuracy! Model 7
78.88%
New best Accuracy! Model 8
79.16%
New best Accuracy! Model 9
79.22%
New best Accuracy! Model 12
79.44%
New best Accuracy! Model 13
79.67%
New best Accuracy! Model 14
79.9%
New best Accuracy! Model 15
80.01%
New best Accuracy! Model 16
80.07%
New best Accuracy! Model 17
80.35%
New best Accuracy! Model 18
80.58%
New best Accuracy! Model 19
80.69%
New best Accuracy! Model 20
81.26%
New best Accuracy! Model 23
81.32%
New best Accuracy! Model 24
81.49%
New best Accuracy! Model 26
81.6%
New best Accuracy! Model 27
81.66%
New best Accuracy! Model 28
81.83%
New best Accuracy! Model 30
82.17%
New best Accuracy! Model 31
82.34%
New best Accuracy! Model 32
82.62%
New best Accuracy! Model 34
82.79%
New best Accuracy! Model 35
83.08

In [52]:
estimator1 = []
estimator2 = []
acc1 = []
acc2 = []

for key in saved.keys():
    estimator1.append(saved[key]['estimators'])
    acc1.append(saved[key]['acc'])

#for key in saved2.keys():
#    estimator2.append(saved2[key]['estimators'])
#    acc2.append(saved2[key]['acc'])

acc_last = 0
acc_last2 = 0
i = 0
peak1 = 0
peak2 = 0
for acc in acc1:
    if acc > acc_last:
        peak1 = i
    acc_last = acc
    i += 1
#i = 0
#for acc in acc2:
#    if acc > acc_last2:
#        peak2 = i
#    acc_last2 = acc
#    i += 1
print(peak1)
print('Highest Scoring Models:')
print(f'Random Forest: {estimator1[peak1]} estimators, {round(acc1[peak1],2)}% accuracy ')
#print(f'XGBoost: {estimator2[peak2]} estimators, {round(acc2[peak2],2)}% accuracy')

198
Highest Scoring Models:
Random Forest: 199 estimators, 0.88% accuracy 
