In [38]:
# Reload `normalized_df` into this kernel from IPython's %store persistence.
# NOTE(review): presumably saved with `%store normalized_df` in another
# notebook/session - confirm the stored copy is current before relying on it.
%store -r normalized_df

In [68]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Widen pandas' display limits so large frames are printed without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [69]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn import utils

# Feature matrix: every column of normalized_df except the target column.
# Hand-picked subsets tried earlier, kept for reference:
#   X = normalized_df[['Sympt_blødning', 'Sympt_smerter', 'Sympt_ascites', 'Sympt_fatigue']]
#   X = normalized_df[['histologi','Substadium','Primærbehandling','FIGO_stadium','Født','Age_at_diagnosis','Age','Behandling','age_rec','Histo_nummer']]
#   X = normalized_df[['Sympt_blødning', 'Sympt_smerter', 'Sympt_ascites', 'Sympt_fatigue', 'Lengde_sympt_dager', 'Lengde_sympt_uker', 'Lengde_sympt_mnd']]
X = normalized_df.drop(columns=['kreftform'])

# Target: deliberately a one-column DataFrame (list selector), not a Series;
# the training functions flatten it with `.values.ravel()`.
y = normalized_df.loc[:, ['kreftform']]

def run_once(test_size=0.33, random_state=None):
    """Train one XGBoost classifier on a random split and report test results.

    Reads the notebook-level ``X`` (features) and ``y`` (one-column target
    frame), holds out ``test_size`` of the rows, fits the classifier on the
    remaining rows, then prints the test confusion matrix followed by the
    test accuracy.

    Parameters
    ----------
    test_size : float, default 0.33
        Fraction of rows held out for evaluation; forwarded to
        ``train_test_split``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the original
        behaviour of an unseeded split.

    Returns
    -------
    list of tuple
        ``(column label, feature importance)`` pairs in ``X.columns`` order.
        The list is NOT sorted by importance.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)

    # Learn the label -> integer mapping from the training labels only and
    # reuse it for the test labels. Re-fitting the encoder on the test split
    # would silently produce different codes whenever the two splits do not
    # contain exactly the same set of classes.
    lab_enc = preprocessing.LabelEncoder()
    encoded_y_train = lab_enc.fit_transform(y_train.values.ravel())
    encoded_y_test = lab_enc.transform(y_test.values.ravel())

    # NOTE(review): the confusion matrices printed in this notebook are 3x3,
    # i.e. the target looks multi-class. 'binary:logistic' is presumably
    # replaced by a multi-class objective inside the sklearn wrapper - confirm
    # for the installed xgboost version.
    model = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=0.3,
                              learning_rate=0.1, max_depth=5, alpha=10,
                              n_estimators=100)
    model.fit(X_train, encoded_y_train)

    # predict() already returns encoded class labels, so no rounding is needed.
    y_pred = model.predict(X_test)

    print(confusion_matrix(encoded_y_test, y_pred))
    print(accuracy_score(encoded_y_test, y_pred))

    return list(zip(X.columns, model.feature_importances_))

def run_more_than_once(run_number, test_size=0.33, random_state=None):
    """Repeat the train/evaluate cycle and summarise the accuracy spread.

    For each of ``run_number`` iterations the notebook-level ``X`` / ``y`` are
    split at random, an XGBoost classifier is fitted on the training part and
    scored on the held-out part. The classifier is built with ``seed=27``
    every time; the train/test split is what changes between iterations.
    Afterwards the best and worst accuracy (each with its confusion matrix)
    and the mean accuracy are printed.

    Parameters
    ----------
    run_number : int
        Number of independent splits to evaluate; must be at least 1.
    test_size : float, default 0.33
        Fraction of rows held out in every split.
    random_state : int, RandomState instance or None, default None
        Seed for the sequence of splits. ``None`` keeps the original unseeded
        behaviour.

    Raises
    ------
    ValueError
        If ``run_number`` is smaller than 1 (there would be nothing to
        average; the original code failed on an unbound ``mean`` instead).
    """
    if run_number < 1:
        raise ValueError("run_number must be at least 1")

    # An int seed is turned into ONE shared RandomState: handing the same int
    # to every train_test_split call would reproduce the identical split
    # run_number times. None / RandomState instances are passed through.
    split_rng = random_state
    if isinstance(random_state, (int, np.integer)):
        split_rng = np.random.RandomState(random_state)

    # +/-inf sentinels guarantee the first run always populates both extremes,
    # even for an accuracy of exactly 0.0 or 1.0.
    max_accuracy, max_matrix = float('-inf'), None
    min_accuracy, min_matrix = float('inf'), None
    tot_acc = 0.0

    for _ in range(run_number):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_rng)

        # Fit the encoder on the training labels only and reuse the mapping
        # for the test labels, so both sides share the same integer codes.
        lab_enc = preprocessing.LabelEncoder()
        encoded_y_train = lab_enc.fit_transform(y_train.values.ravel())
        encoded_y_test = lab_enc.transform(y_test.values.ravel())

        # NOTE(review): the printed confusion matrices are 3x3 (multi-class);
        # 'binary:logistic' is presumably overridden by the sklearn wrapper -
        # confirm for the installed xgboost version.
        model = xgb.XGBClassifier(
            learning_rate=0.1,
            n_estimators=100,
            max_depth=4,
            min_child_weight=6,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.005,
            objective='binary:logistic',
            nthread=4,
            scale_pos_weight=1,
            seed=27)
        model.fit(X_train, encoded_y_train)

        # Predictions are already in the encoder's integer space. They must
        # not be passed through fit_transform again: that would renumber them
        # whenever the model never predicts one of the classes.
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(encoded_y_test, y_pred)

        if accuracy > max_accuracy:
            max_accuracy = accuracy
            max_matrix = confusion_matrix(encoded_y_test, y_pred)

        if accuracy < min_accuracy:
            min_accuracy = accuracy
            min_matrix = confusion_matrix(encoded_y_test, y_pred)

        tot_acc += accuracy

    mean = tot_acc / run_number

    print('Maximum')
    print(max_accuracy, '\n', max_matrix)
    print('----------------------------------')
    print('Minimum')
    print(min_accuracy, '\n', min_matrix)
    print('----------------------------------')
    print('mean accuracy:', mean)

In [95]:
# Evaluate 100 random train/test splits and print the best, worst and mean accuracy.
run_more_than_once(100)

Maximum
0.6577777777777778 
 [[ 26  40   0]
 [ 25 115   4]
 [  1   7   7]]
----------------------------------
Minimum
0.5333333333333333 
 [[21 49  1]
 [39 91  8]
 [ 2  6  8]]
----------------------------------
mean accuracy: 0.5785777777777777


In [94]:
# Single train/evaluate run; keep the returned (column, importance) pairs
# for the feature-selection cells that follow.
features = run_once()

[[ 22  44   0]
 [ 36 104   3]
 [  2  11   3]]
0.5733333333333334


In [71]:
# Two-column frame built from the (column, importance) pairs:
# column 0 = feature name, column 1 = importance, rows in X.columns order.
features_importance = pd.DataFrame(features)

In [72]:
# Column 0 only: the feature names (the importances stay behind in column 1).
features_names = features_importance[0]

In [96]:
# Keep the first four names.
# NOTE(review): the rows are still in X.columns order - nothing has sorted by
# importance - so this selects the first four columns, not the four most
# important features. Confirm that this is intended.
features_names = features_names[:4]

In [97]:
# Display the selected feature names.
features_names

0            Født
1    Oppdaget_når
2             Age
3             Død
Name: 0, dtype: object

In [74]:
# Plain Python list of the selected names, usable as a column selector.
# NOTE: `feature_names` (this list) and `features_names` (the Series) differ
# by a single letter - easy to mix up.
feature_names = list(features_names)

In [75]:
# Restrict the feature matrix to the selected columns.
# NOTE(review): this rebinds the notebook-level `X` that run_once() and
# run_more_than_once() read, so every later call trains on these columns only
# and results depend on cell execution order. The execution counts suggest the
# evaluation cells were last run after this one - verify.
X = normalized_df[feature_names]