In [1]:
# Restore `normalized_df` from IPython's %store database; it must have been
# persisted beforehand with `%store normalized_df` (presumably by a separate
# preprocessing notebook -- confirm where it is produced).
%store -r normalized_df

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Raise pandas' display limits (row count, column count, line width) used
# when rendering frames in the notebook.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn import utils

# Features are every column except the target; 'kreftform' is the target.
X = normalized_df.drop(['kreftform'], axis=1)
y = normalized_df[['kreftform']]

# NOTE(review): the split is unseeded, so every kernel run produces a
# different partition; pass random_state=<int> here if the reported numbers
# need to be reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Fit the encoder exactly once, on the full label column, and only
# *transform* the two splits. Re-fitting on the test labels (fit_transform)
# would build a second, independent mapping: if a class were missing from one
# split, the integer codes of train and test would no longer refer to the
# same classes and every metric computed from them would be silently wrong.
lab_enc = preprocessing.LabelEncoder()
lab_enc.fit(y.values.ravel())
encoded_y_train = lab_enc.transform(y_train.values.ravel())
encoded_y_test = lab_enc.transform(y_test.values.ravel())

def run_once(n_estimators=100, random_state=None):
    """Train one random forest and print its train/test evaluation.

    Fits on the notebook-level ``X_train`` / ``encoded_y_train`` and
    evaluates on both the training data and ``X_test`` / ``encoded_y_test``.
    For each of the two sets the accuracy and the confusion matrix are
    printed.

    Parameters
    ----------
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestClassifier``.
    random_state : int or None, default None
        Seed passed to ``RandomForestClassifier``. ``None`` keeps the
        original, non-deterministic behaviour.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    clf = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state)
    clf.fit(X_train, encoded_y_train)

    y_pred_train = clf.predict(X_train)
    y_pred = clf.predict(X_test)

    print('Train accuracy: ', accuracy_score(encoded_y_train, y_pred_train))
    print(confusion_matrix(encoded_y_train, y_pred_train))
    print('----------------------------------')
    print('Test accuracy: ', accuracy_score(encoded_y_test, y_pred))
    print(confusion_matrix(encoded_y_test, y_pred))
    
def run_more_than_once(run_number, n_estimators=100, random_state=None):
    """Train ``run_number`` random forests and summarise their accuracy.

    Every run fits a fresh ``RandomForestClassifier`` on the notebook-level
    ``X_train`` / ``encoded_y_train`` and scores it on the training data and
    on ``X_test`` / ``encoded_y_test``. Afterwards the best and worst test
    accuracy (each with its confusion matrix) and the mean train and test
    accuracies are printed.

    Parameters
    ----------
    run_number : int
        Number of forests to train; must be at least 1.
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestClassifier``.
    random_state : int or None, default None
        Base seed. Run ``i`` uses ``random_state + i`` so the runs still
        differ from each other while the whole experiment is repeatable.
        ``None`` keeps the original, non-deterministic behaviour.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    ValueError
        If ``run_number`` is smaller than 1 (there would be nothing to
        average or compare).
    """
    if run_number < 1:
        raise ValueError(
            'run_number must be at least 1, got {}'.format(run_number))

    # Infinite sentinels guarantee that the first run always initialises both
    # extremes, even when its accuracy is exactly 0.0 or 1.0.
    max_accuracy = float('-inf')
    max_matrix = None
    min_accuracy = float('inf')
    min_matrix = None

    tot_acc = 0
    tot_acc_train = 0

    for i in range(run_number):
        seed = None if random_state is None else random_state + i
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     random_state=seed)

        clf.fit(X_train, encoded_y_train)
        y_pred = clf.predict(X_test)

        accuracy_train = accuracy_score(encoded_y_train, clf.predict(X_train))
        accuracy = accuracy_score(encoded_y_test, y_pred)

        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_matrix = confusion_matrix(encoded_y_test, y_pred)

        if accuracy < min_accuracy:
            min_accuracy = accuracy
            min_matrix = confusion_matrix(encoded_y_test, y_pred)

        tot_acc += accuracy
        tot_acc_train += accuracy_train

    # The means only need to be computed once, after all runs are done.
    mean = tot_acc / run_number
    mean_train = tot_acc_train / run_number

    print('Maximum')
    print(max_accuracy, '\n', max_matrix)
    print('----------------------------------')
    print('Minimum')
    print(min_accuracy, '\n', min_matrix)
    print('----------------------------------')
    print('Mean train accuracy:', mean_train)
    print('Mean test accuracy:', mean)

In [4]:
# Train and evaluate a single forest; prints train/test accuracy and
# confusion matrices.
run_once()

Train accuracy:  1.0
[[137   0   0]
 [  0 282   0]
 [  0   0  36]]
----------------------------------
Test accuracy:  0.8444444444444444
[[ 59  17   0]
 [  7 120   0]
 [  5   6  11]]


In [5]:
# Repeat the training 100 times and print the best, worst and mean accuracy.
run_more_than_once(100)

Maximum
0.8755555555555555 
 [[ 62  14   0]
 [  4 123   0]
 [  5   5  12]]
----------------------------------
Minimum
0.8222222222222222 
 [[ 56  20   0]
 [  7 119   1]
 [  5   7  10]]
----------------------------------
Mean train accuracy: 1.0
Mean test accuracy: 0.8459555555555562
