In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import arff
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, f1_score

# 1. Importation des données

In [3]:
# loadarff returns a (records, metadata) pair; only the records feed the DataFrame.
data = arff.loadarff('Dataset/PhishingData.arff')
arff_records, _arff_meta = data
df_base = pd.DataFrame(arff_records)

# 2. Encodage des variables

In [5]:
# Integer-encode the target column; the fitted encoder is kept so the original
# labels can be recovered later through label_encoder.classes_.
label_encoder = LabelEncoder()
target_var = label_encoder.fit_transform(df_base['Result'])

# Every remaining column is used as a feature.
df = df_base.drop(columns=['Result'])

# One-hot encode all feature columns and convert the sparse result to a dense array.
feature_encoder = OneHotEncoder()
df_encoded = feature_encoder.fit_transform(df).toarray()

# 3. Entraînement et évaluation du modèle

### Split des données pour le protocole de validation KFold
Pour éviter la sous-représentation de certaines classes, on utilise la classe StratifiedKFold, qui garantit dans chaque split le même pourcentage de représentation des classes que dans le jeu de données initial.

In [148]:
# 10-fold stratified splitter. With shuffle=True the fold assignment is random,
# so the seed is fixed to make the per-fold scores reproducible after a kernel
# restart.
stratifiedKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [151]:
# Cross-validate a decision tree: fit one tree per fold and record its accuracy
# and F1 score on both the training part and the held-out part of the fold.
#
# Per-fold metrics are collected as plain dicts and converted to a DataFrame
# once after the loop. DataFrame.append was deprecated in pandas 1.4 and removed
# in 2.0, and growing a frame row by row copies it on every iteration.
score_columns = ['train_accuracy', 'train_f1_score', 'test_accuracy', 'test_f1_score']
fold_scores = []

# Fitted trees in fold order, so classifiers[i] is the model behind scores.loc[i].
# A plain list replaces the object array that was grown with np.append.
classifiers = []

for train_index, test_index in stratifiedKFold.split(df_encoded, target_var):
    X_train, X_test = df_encoded[train_index], df_encoded[test_index]
    y_train, y_test = target_var[train_index], target_var[test_index]

    # Seeded so that the fitted tree is the same on every run for a given fold.
    decision_tree = DecisionTreeClassifier(criterion='gini', random_state=42)
    decision_tree.fit(X_train, y_train)
    classifiers.append(decision_tree)

    train_predictions = decision_tree.predict(X_train)
    test_predictions = decision_tree.predict(X_test)

    # NOTE(review): with average='micro' on a single-label target the F1 score
    # equals accuracy, so the two F1 columns duplicate the accuracy columns.
    # Consider average='macro' if a distinct metric is wanted.
    fold_scores.append({
        'train_accuracy': accuracy_score(y_train, train_predictions),
        'train_f1_score': f1_score(y_train, train_predictions, average='micro'),
        'test_accuracy': accuracy_score(y_test, test_predictions),
        'test_f1_score': f1_score(y_test, test_predictions, average='micro'),
    })

scores = pd.DataFrame(fold_scores, columns=score_columns)


In [154]:
# Summary statistics of the per-fold metrics (count, mean, std, quartiles, extremes).
fold_summary = scores.describe()
fold_summary

Unnamed: 0,train_accuracy,train_f1_score,test_accuracy,test_f1_score
count,10.0,10.0,10.0,10.0
mean,0.962224,0.962224,0.88396,0.88396
std,0.002789,0.002789,0.032179,0.032179
min,0.958128,0.958128,0.822222,0.822222
25%,0.959762,0.959762,0.869485,0.869485
50%,0.962218,0.962218,0.885185,0.885185
75%,0.964484,0.964484,0.907407,0.907407
max,0.966338,0.966338,0.933333,0.933333


In [153]:
# Raw per-fold metrics: one row per cross-validation fold, in split order.
scores

Unnamed: 0,train_accuracy,train_f1_score,test_accuracy,test_f1_score
0,0.962202,0.962202,0.867647,0.867647
1,0.962202,0.962202,0.911765,0.911765
2,0.963846,0.963846,0.875,0.875
3,0.964696,0.964696,0.881481,0.881481
4,0.966338,0.966338,0.822222,0.822222
5,0.958949,0.958949,0.933333,0.933333
6,0.958128,0.958128,0.911111,0.911111
7,0.964696,0.964696,0.851852,0.851852
8,0.962233,0.962233,0.888889,0.888889
9,0.958949,0.958949,0.896296,0.896296


In [155]:
# Keep the tree from the fold with the highest held-out accuracy. The fold is
# looked up from `scores` rather than hard-coded, so the choice stays correct
# when the splits change between runs.
# NOTE(review): choosing a model by its test-fold score makes that score an
# optimistic estimate of the selected tree's performance.
best_fold = int(scores['test_accuracy'].idxmax())
final_classifier = classifiers[best_fold]

In [None]:
# Render the selected tree on a very large canvas so that node labels stay legible.
fig, ax = plt.subplots(figsize=(150, 150))

# One label per one-hot column, taken from the encoded matrix instead of a
# hard-coded count. The labels are positional indices because the fitted
# OneHotEncoder is not kept.
tree_feature_names = [str(i) for i in range(df_encoded.shape[1])]

# plot_tree expects class names in ascending order of the encoded classes, which
# is the order of label_encoder.classes_. ARFF nominal values may be loaded as
# bytes, so they are decoded for display.
tree_class_names = [
    c.decode() if isinstance(c, bytes) else str(c)
    for c in label_encoder.classes_
]

plot_tree(
    final_classifier,
    feature_names=tree_feature_names,
    class_names=tree_class_names,
    filled=True,
    ax=ax,
);

In [157]:
# Write the rendered tree figure to an image file in the working directory.
tree_image_path = "decision_tree.jpg"
fig.savefig(tree_image_path)