# Árboles de regresión: caso práctico

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [7]:
mainpath = "/Users/irene/Documents/GitHub/python-ml-course/datasets"  #Ruta ficheros
filename = "boston/Boston.csv" #Fichero a abrir
fullpath = mainpath + "/" + filename #Ruta completa

df = pd.read_csv(fullpath)
print(df.shape) 
df.head(3)

(506, 14)


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7


Dataset de casas en Boston
- 506 casas
- crim = índice de crimen per cápita (barrio)
- zn (proporción de residentes / pies^2)
- indus = proporción de industria
- chas = si la casa está cerca del río
- nox = contaminación
- rm = promedio de hab/casa
- medv = precio medio -> **var. a predecir = y**
- ...

In [8]:
#Separamos las columnas en parámetros y objetivo 

colnames = df.columns.values.tolist()
predictors = colnames[:13]
target = colnames[13]

In [13]:
#Creamos las variables del modelo

X = df[predictors]
Y = df[target]

In [15]:
from sklearn.tree import DecisionTreeRegressor

In [17]:
# Creamos el modelo

regtree = DecisionTreeRegressor(min_samples_split = 30, min_samples_leaf = 10, random_state = 0)
regtree.fit(X, Y)

DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=30, random_state=0)

In [19]:
pred = regtree.predict(df[predictors])
df["Predictions"] = pred

In [22]:
#Comparamos el valor actual con el predicho

df[["Predictions", "medv"]]

Unnamed: 0,Predictions,medv
0,22.840000,24.0
1,22.840000,21.6
2,35.247826,34.7
3,35.247826,33.4
4,35.247826,36.2
...,...,...
501,22.840000,22.4
502,20.624138,20.6
503,28.978261,23.9
504,31.170000,22.0


Hay algunas observaciones que tienen el mismo valor predicho ya que todas esas están dentro de una misma rama

In [23]:
# Visualización gráfica del árbol

from sklearn.tree import export_graphviz
import os
from graphviz import Source 
os.environ["PATH"] += os.pathsep + 'C:/Users/irene/.anaconda/navigator/Library/bin/graphviz'

In [28]:
#para visualizar grafos

#Necesitamos crear un archivo .dot

with open("/Users/irene/Documents/GitHub/curso-machine-learning-python/resources/regtree_boston.dot", "w") as dotfile:
    export_graphviz(regtree, out_file = dotfile, feature_names = predictors)
    dotfile.close()

In [29]:
file = open("/Users/irene/Documents/GitHub/curso-machine-learning-python/resources/regtree_boston.dot", "r")
text = file.read()
text

'digraph Tree {\nnode [shape=box] ;\n0 [label="rm <= 6.941\\nmse = 84.42\\nsamples = 506\\nvalue = 22.533"] ;\n1 [label="lstat <= 14.4\\nmse = 40.273\\nsamples = 430\\nvalue = 19.934"] ;\n0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;\n2 [label="lstat <= 4.91\\nmse = 26.009\\nsamples = 255\\nvalue = 23.35"] ;\n1 -> 2 ;\n3 [label="mse = 47.187\\nsamples = 20\\nvalue = 31.565"] ;\n2 -> 3 ;\n4 [label="lstat <= 9.715\\nmse = 17.974\\nsamples = 235\\nvalue = 22.651"] ;\n2 -> 4 ;\n5 [label="age <= 87.6\\nmse = 22.287\\nsamples = 122\\nvalue = 24.393"] ;\n4 -> 5 ;\n6 [label="rm <= 6.125\\nmse = 11.111\\nsamples = 112\\nvalue = 23.787"] ;\n5 -> 6 ;\n7 [label="mse = 4.865\\nsamples = 29\\nvalue = 20.624"] ;\n6 -> 7 ;\n8 [label="rm <= 6.611\\nmse = 8.576\\nsamples = 83\\nvalue = 24.893"] ;\n6 -> 8 ;\n9 [label="tax <= 332.5\\nmse = 6.848\\nsamples = 60\\nvalue = 23.99"] ;\n8 -> 9 ;\n10 [label="age <= 63.7\\nmse = 6.345\\nsamples = 50\\nvalue = 24.366"] ;\n9 -> 10 ;\n11 [label="dis 

In [None]:
#NO VA 

Source(text) #Profundidad = 3 decisiones

# 3. Validar y podar el árbol

Usamos el método del Cross Validation para la poda

In [52]:
from sklearn.model_selection import KFold #Para hacer un crossvalidation (cv)
from sklearn.model_selection import cross_val_score #Para conocer la media de las eficacias del cv
from sklearn.metrics import SCORERS

In [53]:
 cv = KFold(n_splits = 10, shuffle = True, random_state = 1)

In [58]:
scores = cross_val_score(regtree, X, Y, scoring = "neg_mean_squared_error", cv = cv, n_jobs=1)
print(scores)
score = np.mean(scores)
print("")
print("El score es de " + str(score)) #eficacia muy alta

#El error promedio es de 20 puntos arriba/abajo en la predicción

[-13.64925886 -17.28987161 -16.98569707 -47.56954086  -9.26202865
 -17.23057023 -15.41541493 -31.33011027 -22.79877067  -9.54180723]

El score es de -20.107307036443846


In [61]:
#Para saber las vars. más importantes:

list(zip(predictors,regtree.feature_importances_))

[('crim', 0.03421203230639308),
 ('zn', 0.0),
 ('indus', 0.0011605887788380146),
 ('chas', 0.0),
 ('nox', 0.01856163073811432),
 ('rm', 0.6308568014337028),
 ('age', 0.01725115143448847),
 ('dis', 0.0013745115995791378),
 ('rad', 0.0),
 ('tax', 0.0023698305298803803),
 ('ptratio', 0.009333247332530954),
 ('black', 0.0),
 ('lstat', 0.28488020584647283)]

# Random Forests para Regresión

In [80]:
from sklearn.ensemble import RandomForestRegressor

In [87]:
forest = RandomForestRegressor(n_jobs = 2, oob_score=True, n_estimators=500)
forest.fit(X, Y)

#n_jobs = nº trabajos simultáneos -> nº tareas en paralelo
#oob = muestreo aleatorio
#n_estimators = nº de árboles que tendrá nuestro bosque

RandomForestRegressor(n_estimators=500, n_jobs=2, oob_score=True)

In [88]:
df["Predicción Bosque"] = forest.oob_prediction_

#Para visualizar la predicción y el valor actual conjuntamente
df[["Predicción Bosque","medv"]]

Unnamed: 0,Predicción Bosque,medv
0,28.709091,24.0
1,22.829670,21.6
2,34.812432,34.7
3,34.075843,33.4
4,34.377949,36.2
...,...,...
501,24.681176,22.4
502,18.294000,20.6
503,27.659195,23.9
504,25.983696,22.0


Para calcular el error cuadrático medio, no necesitamos la validación cruzada, podemos comparar la predicción con el valor actual

In [90]:
df["Error Bosque 2"] = (df["Predicción Bosque"]-df["medv"])**2
print("El error será del " + str(sum(df["Error Bosque 2"])/len(df)) + "%")

#El error ha bajado de 20,10% a 10,39%

El error será del 10.391002965784644%


In [91]:
forest.oob_score_ #similar al R2, más eficaz

0.8769123715059268