In [31]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt
from lib.dataset import Dataset
from lib.knn import Knn
from lib.decisionTree import decisionTree

# Part 2

# Analyse supervisé

preparation des données

In [32]:
Data = pd.read_csv("Data/Dataset1.csv")
ds = Dataset(Data)
ds.preprocessData(null="drop",outliers=None,normalisation=None)
Data = ds.data
train=Data.sample(frac=0.8)
test=Data.drop(train.index)

In [33]:
fert2 = train[train["Fertility"] == 2]
fert2.shape

(34, 14)

In [34]:
fert2 = test[test["Fertility"] == 2]
fert2.shape

(5, 14)

Définition et Initialisation de la classe 

In [35]:
class Classification:
    #marge is the percentage of the train data between 0 and 1
    def __init__(self, data,marge):
        self.train=data.sample(frac=marge,random_state=42)
        self.test=data.drop(self.train.index)
        self.trees = None
        self.knn = None
        self.randomTrees = None

    def testKnn(self,data=None,distance="euclidienne",k=2):
        
        results = []
        
        self.knn = Knn(np.array(self.train)) 
        if data is None:
            data = self.test

        for i in range(len(data)):
            elem = list(data.iloc[i])[:-1]
            classe = self.knn.getClass(elem,algo=distance,k=k)
            results.append(classe)

        return results
    
    def trainDecisionTree(self,maxDepth=10,minSamplesSplit=2):
        
        self.tree = decisionTree(maxDepth=maxDepth,minSamplesSplit=minSamplesSplit)
        trainX,trainY = np.array(self.train)[:,:-1],np.array(self.train)[:,-1].reshape(-1,1)
        self.tree.fit(trainX,trainY)
        
        return self.tree
    def testDecisionTree(self,data=None):
        if data is None:
            data = self.test
        testX,testY = np.array(data)[:,:-1],np.array(data)[:,-1].reshape(-1,1)
        pred = self.tree.predict(testX)
        pred = np.array(pred).reshape(-1,1)

        return pred
    
    def trainRandomForest(self,data=None,n_tree=100,maxDepth=10,minSamplesSplit=2):
        self.randomTrees = []

        nb_cols = int(sqrt(len(self.train.columns)-1))

        for i in range(n_tree):
            random_subset = train.sample(n=len(train), replace=True)
            cols = []
            while len(cols) < nb_cols:
                col = np.random.randint(0,len(train.columns)-1)
                if col not in cols:
                    cols.append(col)
            cols.append(len(train.columns)-1)
            traincols = train.iloc[:,cols]
            traincols = traincols.loc[random_subset.index]
            trainX,trainY = np.array(traincols)[:,:-1],np.array(traincols)[:,-1].reshape(-1,1)
            tree = decisionTree(maxDepth=maxDepth,minSamplesSplit=minSamplesSplit)
            tree.fit(trainX,trainY)
            self.randomTrees.append((tree,cols))
        
        return self.randomTrees

    def testRandomForest(self,data=None):
        if data is None:
            data = self.test

        if self.randomTrees is None:
            print("You need to train the random forest first")
            return None
        
        results = []
        for i in range(len(data)):
            elem = list(data.iloc[i])[:-1]
            elementClasses = []
            for tree,cols in self.randomTrees:
                row = [elem[i] for i in cols[:-1]]
                prediction = tree.predict([row])[0]
                elementClasses.append(prediction)
            print(elementClasses)
            print(set(elementClasses))
            classe = max(set(elementClasses), key=elementClasses.count)
            results.append(classe)

        return results

    def confMatrix(self,results):
        self.matrix = np.zeros((len(self.test["Fertility"].unique()),len(self.test["Fertility"].unique())))
        for i in range(len(self.test)):
            self.matrix[int(self.test.iloc[i]["Fertility"])][int(results[i])]+=1
        return self.matrix

    def getMetrics(self,confMatrix):
        metrics = {}
        for i in range(len(confMatrix)):
            TP = confMatrix[i][i]
            FP = sum(confMatrix[:,i])-TP
            FN = sum(confMatrix[i,:])-TP
            TN = sum(sum(confMatrix))-TP-FP-FN
            metrics[i] = {}
            metrics[i]["precision"] = TP/(TP+FP)
            metrics[i]["recall"] = TP/(TP+FN)
            metrics[i]["accuracy"] = (TP+TN)/(TP+FP+FN+TN)
            metrics[i]["specificity"] = TN/(TN+FP)
            metrics[i]["f1"] = 2*(metrics[i]["precision"]*metrics[i]["recall"])/(metrics[i]["precision"]+metrics[i]["recall"])
        return metrics

classificateur = Classification(Data,0.8)

Knn execution

In [36]:
knnResult = classificateur.testKnn()

Tree execution

In [37]:
tree = classificateur.trainDecisionTree(maxDepth=3)

In [38]:
treeResult = classificateur.testDecisionTree()

Random tree execution

In [39]:
trees = classificateur.trainRandomForest(n_tree=10,maxDepth=10,minSamplesSplit=2)

In [40]:
randomResult = classificateur.testRandomForest()

[1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0]
{0.0, 1.0}
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0]
{0.0, 1.0}
[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0]
{0.0, 1.0}
[0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0, 2.0, 1.0]
{0.0, 1.0, 2.0}
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
{0.0, 1.0}
[1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0]
{0.0, 1.0}
[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
{0.0, 1.0}
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
{0.0}
[0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
{0.0, 1.0}
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
{0.0}
[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
{0.0, 1.0}
[1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
{0.0, 1.0}
[2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
{0.0, 1.0, 2.0}
[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0]
{0.0, 1.0}
[0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
{0.0, 1.0}
[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
{0.0, 1.0}
[0.0, 0.

Matrice de confusion

In [41]:
knnConfM = classificateur.confMatrix(knnResult)
knnConfM

array([[63.,  7.,  1.],
       [10., 79.,  5.],
       [ 0.,  7.,  4.]])

In [42]:
treeConfM = classificateur.confMatrix(treeResult)
treeConfM

  self.matrix[int(self.test.iloc[i]["Fertility"])][int(results[i])]+=1


array([[68.,  3.,  0.],
       [10., 77.,  7.],
       [ 0.,  6.,  5.]])

In [43]:
randomConfM = classificateur.confMatrix(randomResult)
randomConfM

array([[68.,  3.,  0.],
       [10., 84.,  0.],
       [ 0.,  5.,  6.]])

In [44]:
knnScore = classificateur.getMetrics(knnConfM)
treeScore = classificateur.getMetrics(treeConfM)
randomScore = classificateur.getMetrics(randomConfM)

In [45]:
knnScore

{0: {'precision': 0.863013698630137,
  'recall': 0.8873239436619719,
  'accuracy': 0.8977272727272727,
  'specificity': 0.9047619047619048,
  'f1': 0.875},
 1: {'precision': 0.8494623655913979,
  'recall': 0.8404255319148937,
  'accuracy': 0.8352272727272727,
  'specificity': 0.8292682926829268,
  'f1': 0.8449197860962566},
 2: {'precision': 0.4,
  'recall': 0.36363636363636365,
  'accuracy': 0.9261363636363636,
  'specificity': 0.9636363636363636,
  'f1': 0.380952380952381}}

In [46]:
treeScore

{0: {'precision': 0.8717948717948718,
  'recall': 0.9577464788732394,
  'accuracy': 0.9261363636363636,
  'specificity': 0.9047619047619048,
  'f1': 0.9127516778523489},
 1: {'precision': 0.8953488372093024,
  'recall': 0.8191489361702128,
  'accuracy': 0.8522727272727273,
  'specificity': 0.8902439024390244,
  'f1': 0.8555555555555555},
 2: {'precision': 0.4166666666666667,
  'recall': 0.45454545454545453,
  'accuracy': 0.9261363636363636,
  'specificity': 0.9575757575757575,
  'f1': 0.43478260869565216}}

In [47]:
randomScore

{0: {'precision': 0.8717948717948718,
  'recall': 0.9577464788732394,
  'accuracy': 0.9261363636363636,
  'specificity': 0.9047619047619048,
  'f1': 0.9127516778523489},
 1: {'precision': 0.9130434782608695,
  'recall': 0.8936170212765957,
  'accuracy': 0.8977272727272727,
  'specificity': 0.9024390243902439,
  'f1': 0.9032258064516129},
 2: {'precision': 1.0,
  'recall': 0.5454545454545454,
  'accuracy': 0.9715909090909091,
  'specificity': 1.0,
  'f1': 0.7058823529411764}}

In [48]:
import pickle

#pickle.dump(classificateur, open("lib/classificateur2.p", "wb"))