In [39]:
import urllib.request
# Location of the car data file on the UCI archive server.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'

# Download the file into the current working directory as 'car.data'.
# Being the last expression of the cell, the (filename, headers) tuple
# returned by urlretrieve is echoed as the cell output.
urllib.request.urlretrieve(url, 'car.data')

('car.data', <http.client.HTTPMessage at 0x7f0d2c8cd760>)

In [40]:
import numpy as np
# Read the comma-separated file keeping every field as a string.
data = np.loadtxt('car.data', dtype='str', delimiter=',')
# All columns but the last are the features; the last column is the target.
X, y = data[:,:-1], data[:,-1]
# Display the shapes of the feature matrix and of the target vector.
X.shape, y.shape

((1728, 6), (1728,))

In [41]:
# Count how many distinct feature combinations are possible: the product of
# the number of distinct values found in each feature column.
combinacoes = 1
for i in range(X.shape[1]):
    # Distinct values observed in column i.
    valores = set(X[:,i])
    combinacoes *= len(valores)
    print(valores)
print(combinacoes)

{'high', 'low', 'vhigh', 'med'}
{'high', 'low', 'vhigh', 'med'}
{'3', '2', '5more', '4'}
{'2', 'more', '4'}
{'small', 'big', 'med'}
{'high', 'low', 'med'}
1728


In [42]:
from sklearn.metrics import accuracy_score
import numpy as np
from collections import Counter

def maisFrequente(y):
    """Return the value that occurs most often in the NumPy array ``y``.

    The array is traversed through ``y.flat``, so any shape is accepted.
    When several values share the highest count, the one met first in that
    traversal is returned. An empty array raises ``IndexError``.
    """
    contagem = Counter(y.flat)
    rotulo, _ocorrencias = contagem.most_common(1)[0]
    return rotulo

class ZeroR():
    """Baseline classifier that always predicts the most frequent training label."""

    def fit(self, X, y):
        """Memorise the most frequent label of ``y``.

        ``X`` is accepted for API symmetry with other classifiers but is not
        used. Returns ``self`` so calls can be chained.
        """
        self.resposta = maisFrequente(y)
        return self

    def predict(self, X):
        """Return one copy of the memorised label per row of ``X``.

        The output dtype is taken from the label itself, so labels of any
        length (or non-string labels) are returned unmodified instead of
        being forced into a fixed 5-character string dtype.
        """
        return np.full(X.shape[0], self.resposta)

# Fit the ZeroR baseline on the full data set and predict on the same rows.
modelo = ZeroR()
modelo.fit(X,y)
ypred = modelo.predict(X)

# Accuracy on the data used for fitting, plus the per-row hit/miss mask.
accuracy_score(y, ypred), (ypred == y)


(0.7002314814814815, array([ True,  True,  True, ...,  True, False, False]))

In [43]:
def impureza(y): #Gini
    """Return the Gini impurity of the labels in ``y``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries equal to class ``k``.

    Parameters
    ----------
    y : array-like
        Class labels. Lists and tuples are accepted as well as NumPy arrays.

    Returns
    -------
    float
        ``0.0`` when all labels are equal, larger values for more mixed
        labels. An empty input yields ``0.0`` (no samples, nothing to mix).
    """
    y = np.asarray(y)
    if y.size == 0:
        return 0.0
    # One vectorised pass gives the count of every distinct label, instead of
    # one Python-level scan of the whole array per label.
    _, contagens = np.unique(y, return_counts=True)
    probabilidades = contagens / y.size
    return 1 - np.sum(probabilidades**2)

# Gini impurity of the whole target vector.
# Note: y[:] on a NumPy array is a view of y, not an independent copy.
ytmp = y[:]
valor = impureza(ytmp)
valor

0.457283763074417

In [44]:
def impurezaValor(x, y, valor):
    """Weighted impurity of splitting ``y`` by the test ``x == valor``.

    The rows are divided into those where ``x`` equals ``valor`` and those
    where it does not. The impurity of each side (via ``impureza``) is
    weighted by the fraction of rows that fall on that side, and the two
    weighted terms are added.
    """
    mascara = x == valor
    total = len(y)
    # Equal side first, different side second.
    parcelas = [
        np.count_nonzero(lado) / total * impureza(y[lado])
        for lado in (mascara, ~mascara)
    ]
    return parcelas[0] + parcelas[1]

# Weighted impurity of splitting on "first feature == 'vhigh'".
impurezaValor(X[:,0], y, 'vhigh')


0.44934645776177407

In [45]:
def impurezaMinima(X, y):
    """Find the (feature, value) equality split with the lowest weighted impurity.

    Every distinct value of every column of ``X`` is tried as a split
    ``X[:, i] == valor`` and scored with ``impurezaValor``.

    Parameters
    ----------
    X : 2-D NumPy array
        Feature matrix, one column per feature.
    y : 1-D NumPy array
        Labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(impurity, feature_index, value)`` of the best split. On ties the
        first candidate wins, scanning features left to right and values in
        sorted order. ``value`` keeps the type it has in ``X``.

    Raises
    ------
    ValueError
        If there is no candidate split (``X`` has no columns or no rows).
    """
    # Track the best candidate as a plain tuple. Packing index and value into
    # one NumPy array would coerce both to strings.
    melhor = None
    for i in range(X.shape[1]):
        coluna = X[:, i]
        for valor in sorted(set(coluna)):
            impurezaAtual = impurezaValor(coluna, y, valor)
            # Strict "<" keeps the first candidate when impurities tie.
            if melhor is None or impurezaAtual < melhor[0]:
                melhor = (impurezaAtual, i, valor)

    if melhor is None:
        raise ValueError("impurezaMinima: no candidate split to evaluate")
    return melhor

# Best single split over the whole data set: (impurity, feature index, value).
impurezaMinima(X,y)


(0.38615712609310704, 3, '2')

In [46]:
from sklearn.metrics import accuracy_score
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin

def maisFrequente(y):
    """Most common element of the NumPy array ``y`` (any shape, read via ``y.flat``).

    Ties are resolved in favour of the element seen first; an empty array
    raises ``IndexError``.
    """
    frequencias = Counter(y.flat)
    mais_comum = frequencias.most_common(1)
    return mais_comum[0][0]

class Arvore(ClassifierMixin, BaseEstimator):
    """Binary decision tree that splits on ``feature == value`` tests.

    Each node picks the split returned by ``impurezaMinima``. Rows that
    satisfy the test go to the ``iguais`` subtree, the rest go to the
    ``diferentes`` subtree. A node becomes a leaf, storing the most frequent
    label in ``resposta``, when the chosen split would leave one side empty.

    The mixin is listed before ``BaseEstimator``, as scikit-learn recommends
    for a correct method resolution order.
    """

    def fit(self, X, y):
        """Grow the tree from feature matrix ``X`` and labels ``y``.

        Returns ``self``, following the scikit-learn estimator convention.
        """
        # Forget whatever a previous fit left behind. Otherwise a stale
        # ``resposta`` would make predict() treat a split node as a leaf.
        for atributo in ('resposta', 'iguais', 'diferentes'):
            vars(self).pop(atributo, None)

        self.impureza, self.caracteristica, self.valor = impurezaMinima(X,y)
        iguais = X[:,self.caracteristica] == self.valor
        # Split only when both sides receive at least one row.
        if np.any(iguais) and not np.all(iguais):
            self.iguais = Arvore()
            self.iguais.fit(X[iguais,:],y[iguais])
            self.diferentes = Arvore()
            self.diferentes.fit(X[~iguais,:],y[~iguais])
        else:
            self.resposta = maisFrequente(y)
        return self

    def predict(self, X):
        """Return the predicted label for every row of ``X``.

        The output dtype is derived from the labels stored in the leaves, so
        labels are never truncated to a fixed string width.
        """
        if hasattr(self, 'resposta'):
            # Leaf: every row receives the stored label.
            return np.full(X.shape[0], self.resposta)

        iguais = X[:,self.caracteristica] == self.valor
        predIguais = self.iguais.predict(X[iguais,:])
        predDiferentes = self.diferentes.predict(X[~iguais,:])
        # Use a dtype wide enough for the predictions of both subtrees.
        y = np.empty(X.shape[0],
                     dtype=np.result_type(predIguais.dtype, predDiferentes.dtype))
        y[iguais] = predIguais
        y[~iguais] = predDiferentes
        return y

# Fit the tree on the full data set and predict on the same rows.
modelo = Arvore()
modelo.fit(X,y)
ypred = modelo.predict(X)

# Accuracy on the data used for fitting (not a held-out estimate), plus the
# per-row hit/miss mask.
accuracy_score(y, ypred), (ypred == y)

(1.0, array([ True,  True,  True, ...,  True,  True,  True]))

In [49]:
from sklearn.model_selection import cross_validate

# Cross-validate a fresh tree. No cv or scoring argument is passed, so
# scikit-learn's defaults apply.
scores = cross_validate(Arvore(), X, y)
# Per-fold test scores and their mean.
scores['test_score'], np.mean(scores['test_score'])

(array([0.62716763, 0.73121387, 0.75144509, 0.75362319, 0.8057971 ]),
 0.7338493758900897)