# Random Forest


Datasets:
- Australian
- German
- Iris
- Tic-tac-toe
- Zoo
- Monks
- Messidor 
- Seeds

In [480]:
seedno = [1,12,123,1234,12345,123456, 1234567, 12345678, 123456789, 1234567890]

In [535]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from sklearn.metrics import f1_score

In [664]:
australian = pd.read_csv('australia.txt', sep=",")
german = pd.read_csv('german.txt', sep=",")
iris = pd.read_csv('iris.txt', sep=",")
tictactoe = pd.read_csv('tictactoe.txt', sep=",")
zoo = pd.read_csv('zoo.txt', sep=",")
monks = pd.read_csv('monks.txt', sep=",")
messidor = pd.read_csv('messidor.txt', sep=",")
seeds = pd.read_csv('seeds.txt', sep=",")

In [537]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    error = 0
    for i in range(0, len(predictions)):
        if test_labels.reset_index().iloc[i, 1] != predictions[i]:
            error = error + 1
    accuracy = 100-(error/len(predictions))*100
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    f1score = f1_score(test_labels, predictions, average='weighted') 
    print('F1 Score = {:0.2f}.'.format(f1score))
    
    return accuracy, f1score

In [484]:
def RandomForestBaseModel(X_train, X_test, y_train, y_test, index_col, seed):
    clf=RandomForestClassifier(n_estimators=100, random_state=seed)
    clf.fit(X_train,y_train)
    print('Parameters currently in use:\n')
    print(clf.get_params())
    base_accuracy.append(evaluate(clf, X_test, y_test))
    feature_imp = pd.Series(clf.feature_importances_, index=index_col).sort_values(ascending=False)
    feature_importance.append(feature_imp)
    

In [485]:
def RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, seed):
    clf = RandomForestClassifier(random_state = seed)
    grid_search = GridSearchCV(estimator = clf, param_grid = param_grid, 
                              cv = 3, n_jobs = -1, verbose = 2, error_score='raise')
    grid_search.fit(X_train, y_train)
    grid_cv_best_param.append(grid_search.best_params_)
    b = evaluate(grid_search.best_estimator_, X_test, y_test)
    grid_cv_accuracy.append(b)

In [486]:
def EliminationFeatures(X, y, clf_features):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)
    clf_features.fit(X_train, y_train)
    b = evaluate(clf_features, X_test, y_test)
    elimination_features_accuracy.append(b)
    
    

In [487]:
def labelEncoded(columns, dataset):
    le = preprocessing.LabelEncoder()
    for i in range(0, len(columns)):
        fit = le.fit(dataset[columns[i]].unique())
        print(fit.classes_)
        dataset[columns[i]] = le.transform(dataset[columns[i]])

# Australian

In [538]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [573]:
base_accuracy = []

In [574]:
X_train, X_test, y_train, y_test = train_test_split(australian.iloc[:, 0:(len(australian.columns)-1)], australian.iloc[:, -1], test_size=0.3, random_state = 42)

In [575]:
for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, australian.columns[0:len(australian.columns)-1], i)

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 85.99%.
F1 Score = 0.86.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 86.96%.
F1 Score = 0.87.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [576]:
base_accuracy

[(85.9903381642512, 0.8590353513421484),
 (86.95652173913044, 0.8697054938421825),
 (87.43961352657004, 0.8734411863835524),
 (86.47342995169082, 0.8644214310167752),
 (87.43961352657004, 0.8737874931205284),
 (86.95652173913044, 0.8687570512495865),
 (86.47342995169082, 0.8644214310167752),
 (87.43961352657004, 0.8743961352657005),
 (86.95652173913044, 0.8683825480459421),
 (87.43961352657004, 0.8741056145155769)]

In [558]:
print("Average of the base accuracy for Australian dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10
    

Average of the base accuracy for Australian dataset


86.95652173913044

In [559]:
print("Average of the f1 score for Australian dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the f1 score for Australian dataset


0.8690453735798769

In [560]:
param_grid = {'n_estimators': [500, 600, 700], 
              'min_samples_split': [2, 5, 10], 
              'min_samples_leaf': [1,2,4], 
              'max_depth': [10, 20, 30],
}

In [561]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 87.44%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 86.96%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 86.96%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 85.51%.
F1 Score = 0.85.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 87.44%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 86.96%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 87.92%.
F1 Score = 0.88.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 87.44%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 86.96%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 86.96%.
F1 Score = 0.87.


In [562]:
grid_cv_accuracy

[(87.43961352657004, 0.8737874931205284),
 (86.95652173913044, 0.8687570512495865),
 (86.95652173913044, 0.8683825480459421),
 (85.5072463768116, 0.8543701843698405),
 (87.43961352657004, 0.8737874931205284),
 (86.95652173913044, 0.8691019640013511),
 (87.92270531400966, 0.878798114816066),
 (87.43961352657004, 0.8743961352657005),
 (86.95652173913044, 0.8687570512495865),
 (86.95652173913044, 0.8691019640013511)]

In [563]:
print("Average of the grid cv accuracy for Australian dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the grid cv accuracy for Australian dataset


87.05314009661838

In [564]:
print("Average of the grid cv f1score for Australian dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the grid cv f1score for Australian dataset


0.8699239999240481

In [496]:
grid_cv_best_param

[{'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 700},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 700}]

## Elimination of less important features

In [498]:
feature_importance

[A8     0.264822
 A10    0.098073
 A7     0.097632
 A14    0.086956
 A3     0.085105
 A5     0.074160
 A2     0.072572
 A13    0.070018
 A9     0.065403
 A6     0.028835
 A4     0.019589
 A12    0.014643
 A11    0.012057
 A1     0.010135
 dtype: float64,
 A8     0.287483
 A10    0.105153
 A7     0.094973
 A5     0.080552
 A3     0.079170
 A2     0.074116
 A14    0.072126
 A13    0.068422
 A9     0.061438
 A6     0.025821
 A4     0.016052
 A11    0.012483
 A12    0.011119
 A1     0.011092
 dtype: float64,
 A8     0.266993
 A10    0.110947
 A7     0.088701
 A3     0.084958
 A5     0.082320
 A14    0.074195
 A2     0.070246
 A13    0.069960
 A9     0.068717
 A6     0.030337
 A4     0.015424
 A11    0.013277
 A12    0.012280
 A1     0.011646
 dtype: float64,
 A8     0.284715
 A10    0.102166
 A7     0.092950
 A3     0.086319
 A14    0.081065
 A5     0.071168
 A2     0.070160
 A13    0.069753
 A9     0.056079
 A6     0.029156
 A4     0.019553
 A11    0.012599
 A12    0.012330
 A1     0.0119

From the figure above, we can see the less important features such as: A4, A12, A11, A1. We can try eliminating it and see it will affect the accuracy. Moving forward, we will use the best parameters from the base model.

In [499]:
aus_new = australian[australian.columns.difference(['A1', 'A4', 'A11', 'A12'])]

In [500]:
for i in seedno:
    clf_features = RandomForestClassifier(random_state = i, max_depth = 10,
  min_samples_leaf = 1,
  min_samples_split = 2,
  n_estimators = 500)
    EliminationFeatures(aus_new.iloc[:, 0:(len(aus_new.columns)-1)], aus_new.iloc[:, -1], clf_features)

Accuracy = 86.47%.
Accuracy = 85.99%.
Accuracy = 85.99%.
Accuracy = 86.47%.
Accuracy = 85.99%.
Accuracy = 86.47%.
Accuracy = 86.47%.
Accuracy = 85.99%.
Accuracy = 86.47%.
Accuracy = 86.47%.


In [501]:
elimination_features_accuracy

[86.47342995169082,
 85.9903381642512,
 85.9903381642512,
 86.47342995169082,
 85.9903381642512,
 86.47342995169082,
 86.47342995169082,
 85.9903381642512,
 86.47342995169082,
 86.47342995169082]

In [502]:
print("Average of the eliminated features accuracy for Australian dataset")
sum(elimination_features_accuracy)/10

Average of the eliminated features accuracy for Australian dataset


86.28019323671496

# German

In [577]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [578]:
columns = []
for i in range(0, len(german.columns)):
    if german[german.columns[i]].dtypes == 'O':
        columns.append(german.columns[i])

In [579]:
labelEncoded(columns, german)

In [580]:
X_train, X_test, y_train, y_test = train_test_split(german.iloc[:, 0:(len(german.columns)-1)], german.iloc[:, -1], test_size=0.3, random_state = 42)

In [581]:
for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, german.columns[0:len(german.columns)-1], i)

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 76.33%.
F1 Score = 0.74.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 77.33%.
F1 Score = 0.76.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [582]:
base_accuracy

[(76.33333333333333, 0.7394944297656021),
 (77.33333333333334, 0.7572148148148148),
 (76.0, 0.7383333333333333),
 (77.66666666666667, 0.7572529998736895),
 (78.0, 0.7601388888888888),
 (75.33333333333333, 0.7257556935817805),
 (77.66666666666667, 0.7572529998736895),
 (75.0, 0.7298959318826868),
 (76.33333333333333, 0.7394944297656021),
 (76.0, 0.7383333333333333)]

In [583]:
print("Average of the base accuracy for German dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10

Average of the base accuracy for German dataset


76.56666666666668

In [584]:
print("Average of the base f1 score for German dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the base f1 score for German dataset


0.7443166855113421

In [585]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.00%.
F1 Score = 0.74.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.67%.
F1 Score = 0.75.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.33%.
F1 Score = 0.74.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 77.33%.
F1 Score = 0.75.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.67%.
F1 Score = 0.74.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 75.67%.
F1 Score = 0.73.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.00%.
F1 Score = 0.73.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.00%.
F1 Score = 0.73.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 76.33%.
F1 Score = 0.74.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 75.67%.
F1 Score = 0.73.


In [586]:
grid_cv_best_param

[{'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 20,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 4,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 20,
  'min_samples_leaf': 4,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 500},
 {'max_depth': 20,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 500}]

In [587]:
grid_cv_accuracy

[(76.0, 0.7399263774063122),
 (76.66666666666667, 0.7471506447005812),
 (76.33333333333333, 0.7377642181265833),
 (77.33333333333334, 0.746235632183908),
 (76.66666666666667, 0.7423188405797101),
 (75.66666666666666, 0.7321562446885768),
 (76.0, 0.7349565217391304),
 (76.0, 0.7349565217391304),
 (76.33333333333333, 0.7377642181265833),
 (75.66666666666666, 0.7303772946935294)]

In [588]:
print("Average of the grid cv accuracy for German dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the grid cv accuracy for German dataset


76.26666666666668

In [589]:
print("Average of the grid cv f1 score for German dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the grid cv f1 score for German dataset


0.7383606513984045

## Elimination of less important features

In [590]:
feature_importance

[Credit amount                                               0.132941
 Age in years                                                0.108673
 Status of existing checking account                         0.099959
 Duration in month                                           0.094919
 Purpose                                                     0.067655
 Credit history                                              0.065638
 Present employment since                                    0.055214
 Property                                                    0.046660
 Savings account/bonds                                       0.045901
 Present residence since                                     0.044557
 Installment rate in percentage of disposable income         0.043565
 Personal status and sex                                     0.034707
 Job                                                         0.031053
 Housing                                                     0.028676
 Number of existing 

From the figure above, we can see the less important features such as: foreign worker. We can try eliminating it and see it will affect the accuracy. Moving forward, we will use the best parameters from the GridCV.

In [591]:
ger_new = german[german.columns.difference(['foreign worker'])]

In [592]:
for i in seedno:
    clf_features = RandomForestClassifier(bootstrap= True, max_depth= 10, max_features= 'sqrt', min_samples_leaf= 4, min_samples_split= 2, n_estimators= 700, random_state = i)
    EliminationFeatures(ger_new.iloc[:, 0:(len(ger_new.columns)-1)], ger_new.iloc[:, -1], clf_features)



Accuracy = 75.67%.
F1 Score = 0.72.
Accuracy = 76.67%.
F1 Score = 0.74.
Accuracy = 76.33%.
F1 Score = 0.73.
Accuracy = 76.67%.
F1 Score = 0.74.
Accuracy = 76.67%.
F1 Score = 0.74.
Accuracy = 76.67%.
F1 Score = 0.74.
Accuracy = 76.00%.
F1 Score = 0.73.
Accuracy = 76.33%.
F1 Score = 0.73.
Accuracy = 76.33%.
F1 Score = 0.74.
Accuracy = 76.67%.
F1 Score = 0.73.


In [593]:
elimination_features_accuracy

[(75.66666666666666, 0.7246079867763627),
 (76.66666666666667, 0.7387719743069642),
 (76.33333333333333, 0.7321529734400241),
 (76.66666666666667, 0.7368927892725217),
 (76.66666666666667, 0.7387719743069642),
 (76.66666666666667, 0.7368927892725217),
 (76.0, 0.7293754403945936),
 (76.33333333333333, 0.7340963759458383),
 (76.33333333333333, 0.7359658001082541),
 (76.66666666666667, 0.734939134939135)]

In [594]:
print("Average of the eliminated features accuracy for German dataset")
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation= summation + elimination_features_accuracy[i][0]
summation/10



Average of the eliminated features accuracy for German dataset


76.4

In [595]:
print("Average of the eliminated features f1 score for German dataset")
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation= summation + elimination_features_accuracy[i][1]
summation/10


Average of the eliminated features f1 score for German dataset


0.734246723876318

We can see that the average accuracy of the ones with eliminated feature performs worse than the GridCV one.

# Iris

In [596]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [597]:
iris

Unnamed: 0,sepal length,sepal width,petal length,petal width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [598]:
columns = []
for i in range(0, len(iris.columns)):
    if iris[iris.columns[i]].dtypes == 'O':
        columns.append(iris.columns[i])

In [599]:
labelEncoded(columns, iris)

In [600]:
X_train, X_test, y_train, y_test = train_test_split(iris.iloc[:, 0:(len(iris.columns)-1)], iris.iloc[:, -1], test_size=0.3, random_state = 42)

In [601]:
for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, iris.columns[0:len(iris.columns)-1], i)

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 100.00%.
F1 Score = 1.00.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 100.00%.
F1 Score = 1.00.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': N

In [602]:
base_accuracy

[(100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0),
 (100.0, 1.0)]

In [603]:
print("Average of the base accuracy for iris dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10

Average of the base accuracy for iris dataset


100.0

In [604]:
print("Average of the f1 score for iris dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the f1 score for iris dataset


1.0

No need to do GridCV or Elimination of features since its 100%.

# Tic-Tac-Toe

In [665]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [666]:
columns = []
for i in range(0, len(tictactoe.columns)):
    if tictactoe[tictactoe.columns[i]].dtypes == 'O':
        columns.append(tictactoe.columns[i])

In [667]:
columns

['top-left-square',
 'top-middle-square',
 'top-right-square',
 'middle-left-square',
 'middle-middle-square',
 'middle-right-square',
 'bottom-left-square',
 'bottom-middle-square',
 'bottom-right-square',
 'class']

In [668]:
labelEncoded(columns, tictactoe)

['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['b' 'o' 'x']
['negative' 'positive']


In [669]:
X_train, X_test, y_train, y_test = train_test_split(tictactoe.iloc[:, 0:(len(tictactoe.columns)-1)], tictactoe.iloc[:, -1], test_size=0.3, random_state = 42)

for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, tictactoe.columns[0:len(tictactoe.columns)-1], i)


Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 93.06%.
F1 Score = 0.93.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 94.79%.
F1 Score = 0.95.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [670]:
base_accuracy

[(93.05555555555556, 0.9281810425319552),
 (94.79166666666667, 0.9466586509915688),
 (93.05555555555556, 0.9281810425319552),
 (93.75, 0.935621977895464),
 (93.40277777777777, 0.9319101508916323),
 (93.40277777777777, 0.9319101508916323),
 (92.70833333333333, 0.9247427983539094),
 (94.79166666666667, 0.9466586509915688),
 (94.44444444444444, 0.9432070707070707),
 (94.09722222222223, 0.9393170162445755)]

In [671]:
print("Average of the base accuracy for tic tac toe dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10

Average of the base accuracy for tic tac toe dataset


93.75

In [672]:
print("Average of the base f1 for tic tac toe dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the base f1 for tic tac toe dataset


0.9356388552031332

In [673]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 92.71%.
F1 Score = 0.92.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 92.71%.
F1 Score = 0.92.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 94.44%.
F1 Score = 0.94.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.75%.
F1 Score = 0.94.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.06%.
F1 Score = 0.93.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.40%.
F1 Score = 0.93.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.06%.
F1 Score = 0.93.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.40%.
F1 Score = 0.93.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.75%.
F1 Score = 0.94.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 93.40%.
F1 Score = 0.93.


In [674]:
grid_cv_accuracy

[(92.70833333333333, 0.924434150395689),
 (92.70833333333333, 0.924434150395689),
 (94.44444444444444, 0.9429957492232326),
 (93.75, 0.935621977895464),
 (93.05555555555556, 0.9281810425319552),
 (93.40277777777777, 0.9319101508916323),
 (93.05555555555556, 0.9281810425319552),
 (93.40277777777777, 0.9319101508916323),
 (93.75, 0.935621977895464),
 (93.40277777777777, 0.9319101508916323)]

In [675]:
print("Average of the grid cv accuracy for tic tac toe dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the grid cv accuracy for tic tac toe dataset


93.36805555555554

In [676]:
print("Average of the grid cv f1 for tic tac toe dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the grid cv f1 for tic tac toe dataset


0.9315200543544344

In [677]:
grid_cv_best_param

[{'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500}]

## Elimination of features

In [678]:
feature_importance

[middle-middle-square    0.152195
 top-left-square         0.119097
 top-right-square        0.116939
 bottom-left-square      0.114601
 bottom-right-square     0.112986
 middle-right-square     0.100915
 bottom-middle-square    0.095594
 middle-left-square      0.094584
 top-middle-square       0.093088
 dtype: float64,
 middle-middle-square    0.169807
 top-right-square        0.123767
 top-left-square         0.111886
 bottom-left-square      0.111719
 bottom-right-square     0.104311
 middle-right-square     0.102336
 bottom-middle-square    0.094379
 top-middle-square       0.091148
 middle-left-square      0.090646
 dtype: float64,
 middle-middle-square    0.160267
 top-right-square        0.121832
 top-left-square         0.120726
 bottom-right-square     0.107414
 bottom-left-square      0.106803
 middle-right-square     0.100152
 bottom-middle-square    0.098945
 top-middle-square       0.092850
 middle-left-square      0.091012
 dtype: float64,
 middle-middle-square    0.1555

I would say that the elimination of features is unnecessary here as the difference in feature importance is not very huge.

# Zoo

In [679]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []
zoo

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,enomous,fins,legs,tail,domestic,catsize,class
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1,0,0,1,0,0,0,1,1,1,0,0,2,1,0,1,1
97,1,0,1,0,1,0,0,0,0,1,1,0,6,0,0,0,6
98,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
99,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,7


In [680]:
X_train, X_test, y_train, y_test = train_test_split(zoo.iloc[:, 0:(len(zoo.columns)-1)], zoo.iloc[:, -1], test_size=0.3, random_state = 42)

for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, zoo.columns[0:len(zoo.columns)-1], i)


Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 93.55%.
F1 Score = 0.92.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 93.55%.
F1 Score = 0.92.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [681]:
base_accuracy

[(93.54838709677419, 0.9181451612903225),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9181155992810519),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015)]

In [682]:
print("Average of the base accuracy for zoo dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10

Average of the base accuracy for zoo dataset


93.54838709677418

In [683]:
print("Average of the base f1 for zoo dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the base f1 for zoo dataset


0.9200937706934067

In [685]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits




KeyboardInterrupt: 

In [686]:
grid_cv_accuracy

[(93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015)]

In [687]:
print("Average of the grid cv accuracy for zoo dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the grid cv accuracy for zoo dataset


93.54838709677418

In [688]:
print("Average of the grid cv f1 for zoo dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the grid cv f1 for zoo dataset


0.9206319175101694

In [689]:
grid_cv_best_param

[{'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500}]

## Elimination of features

In [690]:
feature_importance

[feathers    0.152414
 eggs        0.134620
 hair        0.105448
 milk        0.096956
 legs        0.092963
 fins        0.077487
 breathes    0.071687
 backbone    0.058846
 toothed     0.057158
 tail        0.051609
 aquatic     0.037213
 airborne    0.032463
 predator    0.015006
 catsize     0.010668
 enomous     0.005329
 domestic    0.000134
 dtype: float64,
 feathers    0.144764
 milk        0.134720
 hair        0.104221
 legs        0.099208
 eggs        0.095447
 fins        0.078299
 breathes    0.074841
 toothed     0.067273
 backbone    0.055116
 tail        0.052252
 aquatic     0.040938
 airborne    0.027041
 predator    0.010475
 catsize     0.010280
 enomous     0.003922
 domestic    0.001205
 dtype: float64,
 milk        0.167271
 feathers    0.142783
 legs        0.103623
 eggs        0.084165
 breathes    0.079536
 hair        0.077049
 fins        0.075901
 backbone    0.069490
 toothed     0.068497
 tail        0.042489
 aquatic     0.032850
 airborne    0.02575

From the figure above, we can see the less important features such as: cat_size, domestic, enomous. We can try eliminating it and see it will affect the accuracy. Moving forward, we will use the best parameters from the GridCV.

In [691]:
zoo_new = zoo[zoo.columns.difference(['cat_size', 'enomous', 'domestic'])]

In [692]:
for i in seedno:
    clf_features = RandomForestClassifier(bootstrap= True, max_depth= 10, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 500, random_state = i)
    EliminationFeatures(zoo_new[zoo_new.columns.difference(['class'])], zoo_new.loc[:, "class"], clf_features)



Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.
Accuracy = 93.55%.
F1 Score = 0.92.


In [693]:
elimination_features_accuracy

[(93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9198751300728408),
 (93.54838709677419, 0.9208211143695015),
 (93.54838709677419, 0.9208211143695015)]

In [694]:
print("Average of the eliminated features accuracy for zoo dataset")
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the eliminated features accuracy for zoo dataset


93.54838709677418

In [695]:
print("Average of the eliminated features f1 for zoo dataset")
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the eliminated features f1 for zoo dataset


0.9206319175101694

The average accuracy is lower than the Grid CV model.

# Monks

In [605]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [606]:
X_train, X_test, y_train, y_test = train_test_split(monks.iloc[:, 0:(len(monks.columns)-1)], monks.iloc[:, -1], test_size=0.3, random_state = 42)

for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, monks.columns[0:len(monks.columns)-1], i)
    
    

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 98.46%.
F1 Score = 0.98.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 97.69%.
F1 Score = 0.98.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [607]:
base_accuracy

[(98.46153846153847, 0.984626331922347),
 (97.6923076923077, 0.9769435897435897),
 (97.6923076923077, 0.9769435897435897),
 (98.46153846153847, 0.984626331922347),
 (98.46153846153847, 0.984626331922347),
 (97.6923076923077, 0.9769435897435897),
 (97.6923076923077, 0.9769435897435897),
 (98.46153846153847, 0.984626331922347),
 (97.6923076923077, 0.9769435897435897),
 (97.6923076923077, 0.9769435897435897)]

In [608]:
print("Average of the base accuracy for monks dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][0]
summation/10

Average of the base accuracy for monks dataset


98.00000000000001

In [610]:
print("Average of the f1 score for monks dataset")
summation = 0
for i in range(0, len(base_accuracy)):
    summation= summation + base_accuracy[i][1]
summation/10

Average of the f1 score for monks dataset


0.9800166866150926

In [609]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 99.23%.
F1 Score = 0.99.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 98.46%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 98.46%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 99.23%.
F1 Score = 0.99.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 97.69%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 98.46%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 97.69%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 99.23%.
F1 Score = 0.99.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 98.46%.
F1 Score = 0.98.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 97.69%.
F1 Score = 0.98.


In [615]:
grid_cv_accuracy

[(99.23076923076923, 0.9923108877354917),
 (98.46153846153847, 0.984626331922347),
 (98.46153846153847, 0.984626331922347),
 (99.23076923076923, 0.9923108877354917),
 (97.6923076923077, 0.9769435897435897),
 (98.46153846153847, 0.984626331922347),
 (97.6923076923077, 0.9769435897435897),
 (99.23076923076923, 0.9923108877354917),
 (98.46153846153847, 0.984626331922347),
 (97.6923076923077, 0.9769435897435897)]

In [611]:
print("Average of the gridcv accuracy for monks dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][0]
summation/10

Average of the gridcv accuracy for monks dataset


98.46153846153848

In [612]:
print("Average of the gridcv f1 for monks dataset")
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation= summation + grid_cv_accuracy[i][1]
summation/10

Average of the gridcv f1 for monks dataset


0.9846268760126632

In [437]:
grid_cv_best_param

[{'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 20,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500}]

## Elimination of features

In [613]:
feature_importance

[a5    0.395959
 a1    0.302538
 a2    0.225102
 a4    0.033950
 a3    0.021898
 a6    0.020553
 dtype: float64,
 a5    0.386198
 a1    0.281249
 a2    0.248479
 a4    0.041405
 a6    0.021940
 a3    0.020728
 dtype: float64,
 a5    0.391711
 a1    0.285761
 a2    0.240424
 a4    0.035836
 a6    0.026132
 a3    0.020136
 dtype: float64,
 a5    0.393514
 a1    0.293169
 a2    0.241968
 a4    0.032946
 a6    0.019236
 a3    0.019167
 dtype: float64,
 a5    0.394821
 a1    0.287249
 a2    0.238908
 a4    0.035794
 a6    0.022658
 a3    0.020569
 dtype: float64,
 a5    0.389764
 a1    0.300328
 a2    0.230966
 a4    0.035740
 a3    0.022566
 a6    0.020634
 dtype: float64,
 a5    0.396735
 a1    0.279645
 a2    0.243736
 a4    0.036432
 a6    0.023878
 a3    0.019573
 dtype: float64,
 a5    0.386849
 a1    0.286689
 a2    0.246126
 a4    0.038752
 a3    0.021124
 a6    0.020461
 dtype: float64,
 a5    0.392917
 a1    0.286085
 a2    0.234357
 a4    0.039861
 a6    0.024442
 a3    0.022338


We can eliminate less impt features like a3, a6 and a4.

In [614]:
monks_new = monks[monks.columns.difference(['a3', 'a6', 'a4'])]

In [616]:
for i in seedno:
    clf_features = RandomForestClassifier(bootstrap= True, max_depth= 20, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 500, random_state = i)
    EliminationFeatures(monks_new[monks_new.columns.difference(['class'])], monks_new.loc[:, "class"], clf_features)

Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.
Accuracy = 100.00%.
F1 Score = 1.00.


# Messidor

In [617]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

In [618]:
messidor

Unnamed: 0,quality assessment,pre-screening,MA detection 1,MA detection 2,MA detection 3,MA detection 4,MA detection 5,MA detection 6,exudates info 1,exudates info 2,exudates info 3,exudates info 4,exudates info 5,exudates info 6,exudates info 7,exudates info 8,distance,diameter,result of the AM/FM-based classification,class
0,1,1,22,22,22,19,18,14,49.895756,17.775994,5.270920,0.771761,0.018632,0.006864,0.003923,0.003923,0.486903,0.100025,1,0
1,1,1,24,24,22,18,16,13,57.709936,23.799994,3.325423,0.234185,0.003903,0.003903,0.003903,0.003903,0.520908,0.144414,0,0
2,1,1,62,60,59,54,47,33,55.831441,27.993933,12.687485,4.852282,1.393889,0.373252,0.041817,0.007744,0.530904,0.128548,0,1
3,1,1,55,53,53,50,43,31,40.467228,18.445954,9.118901,3.079428,0.840261,0.272434,0.007653,0.001531,0.483284,0.114790,0,0
4,1,1,44,44,44,41,39,27,18.026254,8.570709,0.410381,0.000000,0.000000,0.000000,0.000000,0.000000,0.475935,0.123572,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146,1,1,34,34,34,33,31,24,6.071765,0.937472,0.031145,0.003115,0.000000,0.000000,0.000000,0.000000,0.537470,0.116795,0,0
1147,1,1,49,49,49,49,45,37,63.197145,27.377668,8.067688,0.979548,0.001552,0.000000,0.000000,0.000000,0.516733,0.124190,0,0
1148,1,0,49,48,48,45,43,33,30.461898,13.966980,1.763305,0.137858,0.011221,0.000000,0.000000,0.000000,0.560632,0.129843,0,0
1149,1,1,39,36,29,23,13,7,40.525739,12.604947,4.740919,1.077570,0.563518,0.326860,0.239568,0.174584,0.485972,0.106690,1,1


In [619]:
X_train, X_test, y_train, y_test = train_test_split(messidor.iloc[:, 0:(len(messidor.columns)-1)], messidor.iloc[:, -1], test_size=0.3, random_state = 42)

for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, messidor.columns[0:len(messidor.columns)-1], i)
    

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 68.50%.
F1 Score = 0.68.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 67.92%.
F1 Score = 0.68.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [620]:
base_accuracy

[(68.4971098265896, 0.684873733112288),
 (67.91907514450867, 0.6794560668870152),
 (69.07514450867052, 0.6911519223657953),
 (69.36416184971098, 0.6938156392297198),
 (69.07514450867052, 0.6912760463272094),
 (69.94219653179191, 0.6995927026404798),
 (68.4971098265896, 0.6850632004382277),
 (68.78612716763006, 0.6886130860957067),
 (71.67630057803468, 0.7171889972489258),
 (68.78612716763006, 0.6881950541092616)]

In [627]:
summation = 0
for i in range(0, len(base_accuracy)):
    summation = summation + base_accuracy[i][0]
    
print("Average accuracy for base for messidor")
summation/10

Average accuracy for base for messidor


69.16184971098266

In [630]:
summation = 0
for i in range(0, len(base_accuracy)):
    summation = summation + base_accuracy[i][1]
    
print("Average f1 for base for messidor")
summation/10

Average f1 for base for messidor


0.6919226448454628

In [628]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 70.81%.
F1 Score = 0.71.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 70.23%.
F1 Score = 0.70.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 69.36%.
F1 Score = 0.69.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 70.81%.
F1 Score = 0.71.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 71.10%.
F1 Score = 0.71.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 68.79%.
F1 Score = 0.69.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 69.65%.
F1 Score = 0.70.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 69.94%.
F1 Score = 0.70.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 70.81%.
F1 Score = 0.71.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 69.94%.
F1 Score = 0.70.


In [629]:
grid_cv_best_param

[{'max_depth': 30,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 4,
  'min_samples_split': 2,
  'n_estimators': 700},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 500},
 {'max_depth': 30,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 30,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 600},
 {'max_depth': 20,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 600}]

In [634]:
grid_cv_accuracy

[(70.8092485549133, 0.708763705637548),
 (70.23121387283237, 0.7028171287075003),
 (69.36416184971098, 0.6942151507193598),
 (70.8092485549133, 0.7073582466875445),
 (71.09826589595376, 0.7105963366991661),
 (68.78612716763006, 0.6874440436350993),
 (69.65317919075144, 0.6957684742791304),
 (69.94219653179191, 0.7001459347588287),
 (70.8092485549133, 0.7086854017192468),
 (69.94219653179191, 0.6998740378968192)]

In [633]:
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation = summation + grid_cv_accuracy[i][0]
    
print("Average accuracy for grid cv for messidor")
summation/10

Average accuracy for grid cv for messidor


70.14450867052022

In [631]:
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation = summation + grid_cv_accuracy[i][1]
    
print("Average f1 for grid cv for messidor")
summation/10

Average f1 for grid cv for messidor


0.7015668460740242

## Elimination of features

In [635]:
feature_importance

[exudates info 1                             0.099272
 MA detection 1                              0.091182
 exudates info 2                             0.071591
 distance                                    0.067258
 exudates info 3                             0.066822
 diameter                                    0.065646
 MA detection 2                              0.064859
 exudates info 4                             0.061658
 MA detection 5                              0.058013
 exudates info 7                             0.056804
 MA detection 6                              0.054035
 MA detection 3                              0.053854
 MA detection 4                              0.052212
 exudates info 5                             0.044159
 exudates info 6                             0.041501
 exudates info 8                             0.033415
 pre-screening                               0.008754
 result of the AM/FM-based classification    0.008230
 quality assessment         

From the figure above, we can see the less important features such as: quality_assessment, pre-screening, result of the AM/FM-based classification. We can try eliminating it and see it will affect the accuracy. Moving forward, we will use the best parameters from the GridCV.

In [636]:
messidor_new = messidor[messidor.columns.difference(['quality_assessment', 'pre-screening', 'result of the AM/FM-based classification'])]

In [637]:
for i in seedno:
    clf_features = RandomForestClassifier(bootstrap= True, max_depth= 10, max_features= 'sqrt', min_samples_leaf= 4, min_samples_split= 2, n_estimators= 700, random_state = i)
    EliminationFeatures(messidor_new[messidor_new.columns.difference(['class'])], messidor_new.loc[:, "class"], clf_features)



Accuracy = 70.23%.
F1 Score = 0.70.
Accuracy = 70.52%.
F1 Score = 0.71.
Accuracy = 71.68%.
F1 Score = 0.72.
Accuracy = 71.39%.
F1 Score = 0.71.
Accuracy = 70.52%.
F1 Score = 0.71.
Accuracy = 71.39%.
F1 Score = 0.71.
Accuracy = 72.54%.
F1 Score = 0.73.
Accuracy = 71.39%.
F1 Score = 0.71.
Accuracy = 70.23%.
F1 Score = 0.70.
Accuracy = 70.52%.
F1 Score = 0.71.


In [638]:
elimination_features_accuracy

[(70.23121387283237, 0.7020211826458158),
 (70.52023121387283, 0.7050151577659215),
 (71.67630057803468, 0.7169238928727597),
 (71.38728323699422, 0.7137843997992342),
 (70.52023121387283, 0.7050151577659215),
 (71.38728323699422, 0.7129042069435189),
 (72.54335260115607, 0.7251651684597329),
 (71.38728323699422, 0.7137843997992342),
 (70.23121387283237, 0.7018022757347762),
 (70.52023121387283, 0.7050151577659215)]

In [640]:
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation = summation + elimination_features_accuracy[i][0]
    
print("Average accuracy for eliminated features for messidor")
summation/10

Average accuracy for eliminated features for messidor


71.04046242774567

In [639]:
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation = summation + elimination_features_accuracy[i][1]
    
print("Average f1 for eliminated features for messidor")
summation/10

Average f1 for eliminated features for messidor


0.7101430999552837

It does not perform as well as the GridCV model's performance.

# Seeds

In [641]:
base_accuracy = []
grid_cv_accuracy = []
feature_importance = []
grid_cv_best_param = []
elimination_features_accuracy = []

seeds

Unnamed: 0,area,perimeter,compactness,length of kernel,width of kernel,asymmetry coefficient,length of kernel groove,class
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


In [642]:
X_train, X_test, y_train, y_test = train_test_split(seeds.iloc[:, 0:(len(seeds.columns)-1)], seeds.iloc[:, -1], test_size=0.3, random_state = 42)

for i in seedno:
    RandomForestBaseModel(X_train, X_test, y_train, y_test, seeds.columns[0:len(seeds.columns)-1], i)
    

Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 1, 'verbose': 0, 'warm_start': False}
Accuracy = 88.89%.
F1 Score = 0.89.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 12, 'verbose': 0, 'warm_start': False}
Accuracy = 88.89%.
F1 Score = 0.89.
Parameters currently in use:

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': Non

In [643]:
base_accuracy

[(88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (90.47619047619048, 0.9059130553336858),
 (87.3015873015873, 0.8757266542980829),
 (88.88888888888889, 0.8903037108206868)]

In [644]:
summation = 0
for i in range(0, len(base_accuracy)):
    summation = summation + base_accuracy[i][0]
    
print("Average accuracy for base for seeds")
summation/10

Average accuracy for base for seeds


88.8888888888889

In [645]:
summation = 0
for i in range(0, len(base_accuracy)):
    summation = summation + base_accuracy[i][1]
    
print("Average f1 for base for seeds")
summation/10

Average f1 for base for seeds


0.8904069396197265

In [646]:
for i in seedno:
    RandomForestGridCV(X_train, X_test, y_train, y_test, param_grid, i)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 87.30%.
F1 Score = 0.87.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Accuracy = 88.89%.
F1 Score = 0.89.


In [647]:
grid_cv_accuracy

[(88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (87.3015873015873, 0.8745292647731672),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868)]

In [648]:
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation = summation + base_accuracy[i][0]
    
print("Average accuracy for gridcv for seeds")
summation/10

Average accuracy for gridcv for seeds


88.8888888888889

In [650]:
summation = 0
for i in range(0, len(grid_cv_accuracy)):
    summation = summation + base_accuracy[i][1]
    
print("Average f1 for gridcv for seeds")
summation/10

Average f1 for gridcv for seeds


0.8904069396197265

In [653]:
grid_cv_accuracy

[(88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868),
 (87.3015873015873, 0.8745292647731672),
 (88.88888888888889, 0.8903037108206868),
 (88.88888888888889, 0.8903037108206868)]

In [649]:
grid_cv_best_param

[{'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 600},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500},
 {'max_depth': 10,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 500}]

## Elimination of features

In [651]:
feature_importance

[length of kernel groove    0.215229
 perimeter                  0.214531
 area                       0.173712
 width of kernel            0.172463
 length of kernel           0.109626
 asymmetry coefficient      0.064057
 compactness                0.050381
 dtype: float64,
 area                       0.218189
 length of kernel groove    0.195882
 width of kernel            0.157909
 length of kernel           0.156485
 perimeter                  0.145090
 compactness                0.064767
 asymmetry coefficient      0.061678
 dtype: float64,
 area                       0.199979
 perimeter                  0.198147
 length of kernel groove    0.187925
 length of kernel           0.138028
 width of kernel            0.121118
 asymmetry coefficient      0.081109
 compactness                0.073695
 dtype: float64,
 perimeter                  0.220653
 area                       0.181722
 length of kernel groove    0.178744
 width of kernel            0.144029
 length of kernel       

From the figure above, we can see the less important features such as: compactness and asymmetry coefficient. We can try eliminating it and see it will affect the accuracy. Moving forward, we will use the best parameters from the GridCV.

In [652]:
seeds_new = seeds[seeds.columns.difference(['compactness', 'asymmetry coefficient'])]

In [654]:
for i in seedno:
    clf_features = RandomForestClassifier(bootstrap= True, max_depth= 10, max_features= 'sqrt', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 500, random_state = i)
    EliminationFeatures(seeds_new[seeds_new.columns.difference(['class'])], seeds_new.loc[:, "class"], clf_features)



Accuracy = 87.30%.
F1 Score = 0.87.
Accuracy = 88.89%.
F1 Score = 0.89.
Accuracy = 85.71%.
F1 Score = 0.86.
Accuracy = 87.30%.
F1 Score = 0.87.
Accuracy = 87.30%.
F1 Score = 0.87.
Accuracy = 88.89%.
F1 Score = 0.89.
Accuracy = 87.30%.
F1 Score = 0.87.
Accuracy = 85.71%.
F1 Score = 0.86.
Accuracy = 85.71%.
F1 Score = 0.86.
Accuracy = 88.89%.
F1 Score = 0.89.


In [655]:
elimination_features_accuracy

[(87.3015873015873, 0.8745537415823454),
 (88.88888888888889, 0.8900503290747194),
 (85.71428571428572, 0.8586914440572977),
 (87.3015873015873, 0.8745537415823454),
 (87.3015873015873, 0.8740568675528025),
 (88.88888888888889, 0.8900503290747194),
 (87.3015873015873, 0.8745537415823454),
 (85.71428571428572, 0.8589092408880108),
 (85.71428571428572, 0.8589092408880108),
 (88.88888888888889, 0.8900503290747194)]

In [658]:
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation = summation + elimination_features_accuracy[i][0]
    
print("Average eliminated features accuracy for seeds")
summation/10

Average eliminated features accuracy for seeds


87.30158730158732

In [659]:
summation = 0
for i in range(0, len(elimination_features_accuracy)):
    summation = summation + elimination_features_accuracy[i][1]
    
print("Average eliminated features f1 for seeds")
summation/10

Average eliminated features f1 for seeds


0.8744379005357314