# Non-linear Model 2 - Random Forest

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd
# Extra imports
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix,\
        accuracy_score, classification_report, f1_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from auxiliars import *

In [28]:
df = pd.read_csv("./data/HTRU2/HTRU_2.csv", names = ['Profile_mean', 'Profile_stdev', 'Profile_skewness', 
                                                      'Profile_kurtosis', 'DM_mean', 'DM_stdev', 'DM_skewness',
                                                      'DM_kurtosis', 'Class'])

train, test = train_test_split(data, test_size = 0.2)
print(train)

       Profile_mean  Profile_stdev  Profile_skewness  Profile_kurtosis  \
15004      0.212812       0.837665         -0.378328         -0.357311   
5941       0.920597       1.796574         -0.317900         -0.410943   
12232      0.673603       5.760833         -0.447990         -0.523949   
9179      -0.165750       0.866723          0.026090         -0.317847   
17313      0.082767       0.775691         -0.160143         -0.264681   
...             ...            ...               ...               ...   
5855      -2.152054      -1.880344          2.120566          1.668052   
3091       0.314533       0.866854         -0.474416         -0.356059   
12165     -0.569284      -1.119227          0.096137         -0.065330   
4757       0.177179       1.002912         -0.233831         -0.317512   
16457      0.023379       0.221304         -0.202338         -0.237919   

        DM_mean  DM_stdev  DM_skewness  DM_kurtosis  Class  
15004 -0.355896 -0.485684     0.165454    -0.12995

## Decision Tree

In [29]:
model_tree = DecisionTreeClassifier().fit(train.loc[:,:df.columns[-2]], 
                                          train.Class)

In [34]:
classes = ['no_pulsar','pulsar']

In [35]:
pred = model_tree.predict(test.loc[:,:df.columns[-2]])

confusionMatrix(test.Class, pred, ['no_pulsar','pulsar'])

Predicted,no_pulsar,pulsar
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
no_pulsar,3209,51
pulsar,53,267


In [36]:
print((1-accuracy_score(test.Class,pred))*100)
print(f1_score(test.Class, pred))

2.905027932960891
0.8369905956112853


## Random Forest

In [39]:
model_rf1 = RandomForestClassifier(oob_score=True).fit(train.loc[:,:df.columns[-2]],
                                                       train.Class)

pred = model_rf1.predict(train.loc[:,:df.columns[-2]])

print(confusionMatrix(train.Class, pred, classes))

print(classification_report(train.Class,
                            pred,
                            target_names=classes,))

print('OOB error =', 1-model_rf1.oob_score_)

Predicted  no_pulsar  pulsar
Real                        
no_pulsar      12999       0
pulsar             2    1317
              precision    recall  f1-score   support

   no_pulsar       1.00      1.00      1.00     12999
      pulsar       1.00      1.00      1.00      1319

    accuracy                           1.00     14318
   macro avg       1.00      1.00      1.00     14318
weighted avg       1.00      1.00      1.00     14318

OOB error= 0.02032406760720773


Now the real error with the test df

In [41]:
pred = model_rf1.predict(test.loc[:,:df.columns[-2]])

confusionMatrix(test.Class,pred, classes)

Predicted,no_pulsar,pulsar
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
no_pulsar,3240,20
pulsar,50,270


In [43]:
(1-accuracy_score(test.Class,pred))*100

1.9553072625698276

In [45]:
f1_score(test.Class, pred)

0.8852459016393444

## Random Forest with class weights

In [54]:
model_rf2 = RandomForestClassifier(n_estimators=100, 
                                   oob_score=True, 
                                   class_weight={0:10,1:1}).fit(train.loc[:,:df.columns[-2]],
                                                       train.Class)

pred = model_rf2.predict(train.loc[:,:df.columns[-2]])

print(confusionMatrix(train.Class, pred, classes))

print(classification_report(train.Class,
                            pred,
                            target_names=classes,))

print('OOB error =', 1-model_rf2.oob_score_)

Predicted  no_pulsar  pulsar
Real                        
no_pulsar      12999       0
pulsar             0    1319
              precision    recall  f1-score   support

   no_pulsar       1.00      1.00      1.00     12999
      pulsar       1.00      1.00      1.00      1319

    accuracy                           1.00     14318
   macro avg       1.00      1.00      1.00     14318
weighted avg       1.00      1.00      1.00     14318

OOB error = 0.020254225450481944


In [55]:
pred = model_rf2.predict(test.loc[:,:df.columns[-2]])

confusionMatrix(test.Class,pred, classes)

Predicted,no_pulsar,pulsar
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
no_pulsar,3237,23
pulsar,50,270


In [56]:
(1-accuracy_score(test.Class,pred))*100

2.0391061452514014

In [57]:
f1_score(test.Class, pred)

0.8809135399673736

### Resampling 

In [58]:
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_sample(train.loc[:,:df.columns[-2]], 
                                          train.Class)
X_resampled.shape

(2638, 8)

In [61]:
model_rf3 = RandomForestClassifier(n_estimators=100, 
                                   oob_score=True).fit(X_resampled, 
                                                       y_resampled)

pred = model_rf3.predict(train.loc[:,:df.columns[-2]])

confusionMatrix(train.Class, pred, classes)

print(classification_report(train.Class,
                            pred,
                            target_names=classes,))

print('OOB error=', 1- model_rf3.oob_score_)

              precision    recall  f1-score   support

   no_pulsar       1.00      0.97      0.99     12999
      pulsar       0.78      1.00      0.88      1319

    accuracy                           0.97     14318
   macro avg       0.89      0.99      0.93     14318
weighted avg       0.98      0.97      0.98     14318

OOB error= 0.056103108415466285


In [62]:
pred = model_rf3.predict(test.loc[:,:df.columns[-2]])

confusionMatrix(test.Class,pred, classes)

Predicted,no_pulsar,pulsar
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
no_pulsar,3161,99
pulsar,25,295


## Optimitzation

In [92]:
ntrees = np.array(np.round(13**np.arange(1,3.0,0.2)),dtype=int)
print(ntrees)

[  13   22   36   61  101  169  282  471  788 1315]


In [94]:
rf_results= []
b_options= [True, False]
for nt in ntrees:
    for b in b_options:
        model_rf = RandomForestClassifier(n_estimators=nt, 
                                          oob_score=True).fit(train.loc[:,:df.columns[-2]],
                                                           train.Class)
        rf_results.append((1-model_rf.oob_score_, b))

rf_results = pd.DataFrame({'ntrees':ntrees, 'b':rf_results[1], 'OOB':rf_results[0]})

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


ValueError: arrays must all be same length

In [105]:
rf_results0 = [a for a, b in rf_results]
rf_results1 = [b for a, b in rf_results]
treesn = []
for n in ntrees:
    treesn.append(n)
    treesn.append(n)


rf_results = pd.DataFrame({'ntrees':treesn, 'b':rf_results1, 'OOB':rf_results0})

print(rf_results)

    ntrees      b       OOB
0       13   True  0.024165
1       13  False  0.022769
2       22   True  0.021092
3       22  False  0.021302
4       36   True  0.020603
5       36  False  0.021302
6       61   True  0.020953
7       61  False  0.019835
8      101   True  0.020115
9      101  False  0.020184
10     169   True  0.020603
11     169  False  0.019835
12     282   True  0.020115
13     282  False  0.019905
14     471   True  0.019975
15     471  False  0.020045
16     788   True  0.019975
17     788  False  0.019905
18    1315   True  0.020045
19    1315  False  0.019765


In [106]:
model_rf = RandomForestClassifier(n_estimators=rf_results.ntrees.loc[rf_results.OOB.idxmin], 
                                      oob_score=True).fit(train.loc[:,:df.columns[-2]],
                                                       train.Class)

pred = model_rf.predict(train.loc[:,:df.columns[-2]])

print(confusionMatrix(train.Class, pred, classes))

print(classification_report(train.Class,
                            pred,
                            target_names=classes,))

print('OOB error =', 1-model_rf.oob_score_)

Predicted  no_pulsar  pulsar
Real                        
no_pulsar      12999       0
pulsar             0    1319
              precision    recall  f1-score   support

   no_pulsar       1.00      1.00      1.00     12999
      pulsar       1.00      1.00      1.00      1319

    accuracy                           1.00     14318
   macro avg       1.00      1.00      1.00     14318
weighted avg       1.00      1.00      1.00     14318

OOB error = 0.01997485682357869


In [107]:
pred = model_rf.predict(test.loc[:,:df.columns[-2]])

confusionMatrix(test.Class,pred, classes)

Predicted,no_pulsar,pulsar
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
no_pulsar,3239,21
pulsar,50,270


In [108]:
print((1-accuracy_score(test.Class,pred))*100)
print(f1_score(test.Class, pred))

1.9832402234636892
0.8837970540098199
