In [18]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from math import sqrt
from lib.dataset import Dataset
from lib.knn import Knn
from lib.decisionTree import decisionTree

# Part 2

# Analyse supervisé

preparation des données

In [19]:
Data = pd.read_csv("Data/Dataset1.csv")
ds = Dataset(Data)
ds.preprocessData(null="drop",outliers=None,normalisation=None)
Data = ds.data
train=Data.sample(frac=0.8)
test=Data.drop(train.index)

In [20]:
fert2 = train[train["Fertility"] == 2]
fert2.shape

(29, 14)

In [21]:
fert2 = test[test["Fertility"] == 2]
fert2.shape

(10, 14)

Définition et Initialisation de la classe 

In [22]:
class Classification:
    #marge is the percentage of the train data between 0 and 1
    def __init__(self, data,marge):
        self.train=data.sample(frac=marge,random_state=42)
        self.test=data.drop(self.train.index)
        self.trees = None
        self.knn = None
        self.randomTrees = None

    def testKnn(self,data=None,distance="euclidienne",k=2):
        
        results = []
        
        self.knn = Knn(np.array(self.train)) 
        if data is None:
            data = self.test

        for i in range(len(data)):
            elem = list(data.iloc[i])[:-1]
            classe = self.knn.getClass(elem,algo=distance,k=k)
            results.append(classe)

        return results
    
    def trainDecisionTree(self,maxDepth=10,minSamplesSplit=2):
        
        self.tree = decisionTree(maxDepth=maxDepth,minSamplesSplit=minSamplesSplit)
        trainX,trainY = np.array(self.train)[:,:-1],np.array(self.train)[:,-1].reshape(-1,1)
        self.tree.fit(trainX,trainY)
        
        return self.tree
    def testDecisionTree(self,data=None):
        if data is None:
            data = self.test
        testX,testY = np.array(data)[:,:-1],np.array(data)[:,-1].reshape(-1,1)
        pred = self.tree.predict(testX)
        pred = np.array(pred).reshape(-1,1)

        return pred
    
    def trainRandomForest(self,data=None,n_tree=100,maxDepth=10,minSamplesSplit=2):
        self.randomTrees = []

        nb_cols = int(sqrt(len(self.train.columns)-1))

        for i in range(n_tree):
            random_subset = train.sample(n=len(train), replace=True)
            cols = []
            while len(cols) < nb_cols:
                col = np.random.randint(0,len(train.columns)-1)
                if col not in cols:
                    cols.append(col)
            cols.append(len(train.columns)-1)
            traincols = train.iloc[:,cols]
            traincols = traincols.loc[random_subset.index]
            trainX,trainY = np.array(traincols)[:,:-1],np.array(traincols)[:,-1].reshape(-1,1)
            tree = decisionTree(maxDepth=maxDepth,minSamplesSplit=minSamplesSplit)
            tree.fit(trainX,trainY)
            self.randomTrees.append((tree,cols))
        
        return self.randomTrees

    def testRandomForest(self,data=None):
        if data is None:
            data = self.test

        if self.randomTrees is None:
            print("You need to train the random forest first")
            return None
        
        results = []
        for i in range(len(data)):
            elem = list(data.iloc[i])[:-1]
            elementClasses = []
            for tree,cols in self.randomTrees:
                row = [elem[i] for i in cols[:-1]]
                prediction = tree.predict([row])[0]
                elementClasses.append(prediction)

            classe = max(set(elementClasses), key=elementClasses.count)
            results.append(classe)

        return results


    def confMatrix(self,results):
        self.matrix = np.zeros((len(self.test["Fertility"].unique()),len(self.test["Fertility"].unique())))
        for i in range(len(self.test)):
            self.matrix[int(self.test.iloc[i]["Fertility"])][int(results[i])]+=1
        return self.matrix

    def getMetrics(self,confMatrix):
        metrics = {}
        for i in range(len(confMatrix)):
            TP = confMatrix[i][i]
            FP = sum(confMatrix[:,i])-TP
            FN = sum(confMatrix[i,:])-TP
            TN = sum(sum(confMatrix))-TP-FP-FN
            metrics[i] = {}
            metrics[i]["precision"] = TP/(TP+FP)
            metrics[i]["recall"] = TP/(TP+FN)
            metrics[i]["accuracy"] = (TP+TN)/(TP+FP+FN+TN)
            metrics[i]["f1"] = 2*(metrics[i]["precision"]*metrics[i]["recall"])/(metrics[i]["precision"]+metrics[i]["recall"])
        return metrics

classificateur = Classification(Data,0.8)

Knn execution

In [23]:
knnResult = classificateur.testKnn()

Tree execution

In [24]:
tree = classificateur.trainDecisionTree()

In [25]:
treeResult = classificateur.testDecisionTree()

Random tree execution

In [26]:
trees = classificateur.trainRandomForest(n_tree=100,maxDepth=10,minSamplesSplit=2)

In [27]:
randomResult = classificateur.testRandomForest()

Matrice de confusion

In [28]:
knnConfM = classificateur.confMatrix(knnResult)

In [29]:
treeConfM = classificateur.confMatrix(treeResult)

  self.matrix[int(self.test.iloc[i]["Fertility"])][int(results[i])]+=1


In [30]:
randomConfM = classificateur.confMatrix(randomResult)

In [31]:
knnScore = classificateur.getMetrics(knnConfM)
treeScore = classificateur.getMetrics(treeConfM)
randomScore = classificateur.getMetrics(randomConfM)

In [32]:
knnScore

{0: {'precision': 0.863013698630137,
  'recall': 0.8873239436619719,
  'accuracy': 0.8977272727272727,
  'f1': 0.875},
 1: {'precision': 0.8494623655913979,
  'recall': 0.8404255319148937,
  'accuracy': 0.8352272727272727,
  'f1': 0.8449197860962566},
 2: {'precision': 0.4,
  'recall': 0.36363636363636365,
  'accuracy': 0.9261363636363636,
  'f1': 0.380952380952381}}

In [33]:
treeScore

{0: {'precision': 0.881578947368421,
  'recall': 0.9436619718309859,
  'accuracy': 0.9261363636363636,
  'f1': 0.9115646258503401},
 1: {'precision': 0.9,
  'recall': 0.8617021276595744,
  'accuracy': 0.875,
  'f1': 0.8804347826086957},
 2: {'precision': 0.6,
  'recall': 0.5454545454545454,
  'accuracy': 0.9488636363636364,
  'f1': 0.5714285714285713}}

In [34]:
randomScore

{0: {'precision': 0.9726027397260274,
  'recall': 1.0,
  'accuracy': 0.9886363636363636,
  'f1': 0.9861111111111112},
 1: {'precision': 0.9583333333333334,
  'recall': 0.9787234042553191,
  'accuracy': 0.9659090909090909,
  'f1': 0.968421052631579},
 2: {'precision': 1.0,
  'recall': 0.6363636363636364,
  'accuracy': 0.9772727272727273,
  'f1': 0.7777777777777778}}

In [36]:
import pickle

pickle.dump(classificateur, open("lib/classificateur.p", "wb"))