## Tema: Árboles de decisión
Nombre y apellido:


In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.dpi"] = 300
np.set_printoptions(precision=3)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

In [None]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [None]:
print(cancer.DESCR)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=0)

### Visualización de un árbol de decisión

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2)
tree.fit(X_train, y_train)

 Uso de del archivo *tree_plotting.py*.

In [None]:
import warnings
warnings.filterwarnings("ignore")

from tree_plotting import plot_tree
plt.figure(dpi=200)
plot_tree(tree, feature_names=cancer.feature_names, filled=True)

### Ajuste de parámetros

In [None]:
tree = DecisionTreeClassifier().fit(X_train, y_train)
plt.figure(figsize=(15, 5))
plot_tree(tree, feature_names=cancer.feature_names, filled=True)

In [None]:
tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)
plt.figure(figsize=(15, 5))
plot_tree(tree, feature_names=cancer.feature_names)

In [None]:
tree = DecisionTreeClassifier(max_leaf_nodes=8).fit(X_train, y_train)
plot_tree(tree, feature_names=cancer.feature_names, filled=True)

In [None]:
tree = DecisionTreeClassifier(min_samples_split=50).fit(X_train, y_train)
plot_tree(tree, feature_names=cancer.feature_names, filled=True)

In [None]:
tree = DecisionTreeClassifier(min_impurity_decrease=.01).fit(X_train, y_train)
plot_tree(tree, feature_names=cancer.feature_names, filled=True)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth':range(1, 7)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
param_grid = {'max_depth':range(1, 7)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,
                    cv=StratifiedShuffleSplit(100), return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
scores = pd.DataFrame(grid.cv_results_)
scores.plot(x='param_max_depth', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())
plt.legend(loc=(1, 0))

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'max_leaf_nodes': range(2, 20)}
grid = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=param_grid,
                    cv=StratifiedShuffleSplit(100, random_state=1),
                   return_train_score=True)
grid.fit(X_train, y_train)

scores = pd.DataFrame(grid.cv_results_)
scores.plot(x='param_max_leaf_nodes', y=['mean_train_score', 'mean_test_score'], ax=plt.gca())
plt.legend(loc=(1, 0))

In [None]:
scores = pd.DataFrame(grid.cv_results_)
scores.plot(x='param_max_leaf_nodes', y='mean_train_score', yerr='std_train_score', ax=plt.gca())
scores.plot(x='param_max_leaf_nodes', y='mean_test_score', yerr='std_test_score', ax=plt.gca())

In [None]:
grid.best_params_

In [None]:
plot_tree(grid.best_estimator_, feature_names=cancer.feature_names, filled=True)

In [None]:
pd.Series(grid.best_estimator_.feature_importances_,
          index=cancer.feature_names).plot(kind="barh")

### Ejercicios

1. Presenta un informe de cada uno de los parámetros que encuentras en el archivo *tree_plotting*. Explica lo que hacen.

2. Aplica un árbol de decisiones al conjunto de datos **adult** y visualízalo. Ajusta los parámetros con búsqueda grid. Utiliza  `max_leaf_nodes` y `max_depth`, pero por separado. Visualice el árbol resultante y sus características importantes.

In [None]:
## Tus respuestas

In [38]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.dpi"] = 300
np.set_printoptions(precision=3)
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from collections import Counter
from sklearn import svm 
from sklearn.naive_bayes import GaussianNB 
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

data = pd.read_csv("datos/adult.csv",  index_col=0)
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   education       32561 non-null  object
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      32561 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   gender          32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  32561 non-null  object
 13  income          32561 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.7+ MB


Análisis de datos

In [10]:
data.isin(['?']).sum(axis=0) #https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.isin.html

age               0
workclass         0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [11]:
data.describe()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,10.080679,1077.648844,87.30383,40.437456
std,13.640433,2.57272,7385.292085,402.960219,12.347429
min,17.0,1.0,0.0,0.0,1.0
25%,28.0,9.0,0.0,0.0,40.0
50%,37.0,10.0,0.0,0.0,40.0
75%,48.0,12.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


In [13]:
print('workclass',data.workclass.unique())
print('education',data.education.unique())
print('marital-status',data['marital-status'].unique())
print('occupation',data.occupation.unique())
print('relationship',data.relationship.unique())
print('race',data.race.unique())
print('gender',data.gender.unique())
print('native-country',data['native-country'].unique())
print('income',data.income.unique())

workclass [' State-gov' ' Self-emp-not-inc' ' Private' ' Federal-gov' ' Local-gov'
 ' ?' ' Self-emp-inc' ' Without-pay' ' Never-worked']
education [' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']
marital-status [' Never-married' ' Married-civ-spouse' ' Divorced'
 ' Married-spouse-absent' ' Separated' ' Married-AF-spouse' ' Widowed']
occupation [' Adm-clerical' ' Exec-managerial' ' Handlers-cleaners' ' Prof-specialty'
 ' Other-service' ' Sales' ' Craft-repair' ' Transport-moving'
 ' Farming-fishing' ' Machine-op-inspct' ' Tech-support' ' ?'
 ' Protective-serv' ' Armed-Forces' ' Priv-house-serv']
relationship [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
race [' White' ' Black' ' Asian-Pac-Islander' ' Amer-Indian-Eskimo' ' Other']
gender [' Male' ' Female']
native-country [' United-States' ' Cuba' ' Jamaica' ' India

In [15]:
data.age = data.age.astype(float)
data['hours-per-week'] = data['hours-per-week'].astype(float)

Es útil a veces hacer esto.

In [16]:
data1 = data.dropna()

In [20]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32561 non-null  float64
 1   workclass       32561 non-null  object 
 2   education       32561 non-null  object 
 3   education-num   32561 non-null  int64  
 4   marital-status  32561 non-null  object 
 5   occupation      32561 non-null  object 
 6   relationship    32561 non-null  object 
 7   race            32561 non-null  object 
 8   gender          32561 non-null  object 
 9   capital-gain    32561 non-null  int64  
 10  capital-loss    32561 non-null  int64  
 11  hours-per-week  32561 non-null  float64
 12  native-country  32561 non-null  object 
 13  predclase       32561 non-null  object 
dtypes: float64(2), int64(3), object(9)
memory usage: 3.7+ MB


In [21]:
data1.loc[data['income'] == ' >50K', 'predclase'] = 1
data1.loc[data['income'] == ' <=50K', 'predclase'] = 0

In [22]:
data1['education'].replace('Preschool', 'dropout',inplace=True)
data1['education'].replace('10th', 'dropout',inplace=True)
data1['education'].replace('11th', 'dropout',inplace=True)
data1['education'].replace('12th', 'dropout',inplace=True)
data1['education'].replace('1st-4th', 'dropout',inplace=True)
data1['education'].replace('5th-6th', 'dropout',inplace=True)
data1['education'].replace('7th-8th', 'dropout',inplace=True)
data1['education'].replace('9th', 'dropout',inplace=True)
data1['education'].replace('HS-Grad', 'HighGrad',inplace=True)
data1['education'].replace('HS-grad', 'HighGrad',inplace=True)
data1['education'].replace('Some-college', 'CommunityCollege',inplace=True)
data1['education'].replace('Assoc-acdm', 'CommunityCollege',inplace=True)
data1['education'].replace('Assoc-voc', 'CommunityCollege',inplace=True)
data1['education'].replace('Bachelors', 'Bachelors',inplace=True)
data1['education'].replace('Masters', 'Masters',inplace=True)
data1['education'].replace('Prof-school', 'Masters',inplace=True)
data1['education'].replace('Doctorate', 'Doctorate',inplace=True)

In [23]:
data1['marital-status'].replace('Never-married', 'NotMarried',inplace=True)
data1['marital-status'].replace(['Married-AF-spouse'], 'Married',inplace=True)
data1['marital-status'].replace(['Married-civ-spouse'], 'Married',inplace=True)
data1['marital-status'].replace(['Married-spouse-absent'], 'NotMarried',inplace=True)
data1['marital-status'].replace(['Separated'], 'Separated',inplace=True)
data1['marital-status'].replace(['Divorced'], 'Separated',inplace=True)
data1['marital-status'].replace(['Widowed'], 'Widowed',inplace=True)

No olvidar  asignar valores numéricos a las variables de tipo de cadena.

In [25]:
numero = LabelEncoder()
data1['workclass'] = numero.fit_transform(data1['workclass'])
data1['education'] = numero.fit_transform(data1['education'])
data1['marital-status'] = numero.fit_transform(data1['marital-status'])
data1['occupation'] = numero.fit_transform(data1['occupation'])
data1['relationship'] = numero.fit_transform(data1['relationship'])
data1['race'] = numero.fit_transform(data1['race'])
data1['gender'] = numero.fit_transform(data1['gender'])
data1['native-country'] = numero.fit_transform(data1['native-country'])
data['predclase'] = numero.fit_transform(data1['predclase'])

In [26]:
data1['age_bin'] = pd.cut(data1['age'], 20)

In [27]:
data1['hours-per-week_bin'] = pd.cut(data1['hours-per-week'], 10)
data1['hours-per-week'] = data1['hours-per-week']


In [28]:
data1[['predclase', 'age']].groupby(['predclase'], as_index=False).mean().sort_values(by='age', ascending=False)

Unnamed: 0,predclase,age
1,1,44.249841
0,0,36.783738


### Modelos

In [29]:
from sklearn.svm import SVR
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 

data1 = data1.apply(LabelEncoder().fit_transform)
data1.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,predclase,age_bin,hours-per-week_bin
0,22,7,9,12,4,1,1,4,1,25,0,39,39,0,6,3
1,33,6,9,12,2,4,0,4,1,0,0,12,39,0,9,1
2,21,4,11,8,0,6,1,4,1,0,0,39,39,0,5,3
3,36,4,1,6,2,6,0,2,1,0,0,39,39,0,9,3
4,11,4,9,12,2,10,5,2,0,0,0,39,5,0,3,3


In [30]:
data1['age-hours'] = data1['age']*data1['hours-per-week']
data1['age-hours_bin'] = pd.cut(data1['age-hours'], 10)

Podemos quitar algunos elementos

In [32]:
elementos_ = ['education', 'native-country', 'predclase', 'age_bin', 'age-hours_bin','hours-per-week_bin']
y = data1["predclase"]
X = data1.drop(elementos_, axis=1)
X.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,age-hours
0,22,7,12,4,1,1,4,1,25,0,39,858
1,33,6,12,2,4,0,4,1,0,0,12,396
2,21,4,8,0,6,1,4,1,0,0,39,819
3,36,4,6,2,6,0,2,1,0,0,39,1404
4,11,4,12,2,10,5,2,0,0,0,39,429


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4,train_size=0.6, random_state=0)

Uso de Standard Scaler para escalar los datos

In [34]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)

#### Árboles de decisión

In [35]:
DT = DecisionTreeClassifier()
DT.fit(X_train,y_train)
y_pred = DT.predict(X_test)
score_DT = DT.score(X_test,y_test)
print("La exactitus del modelo de arbol de decision es:",score_DT)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))

La exactitus del modelo de arbol de decision es: 0.8128982725527831
              precision    recall  f1-score   support

       <=50k       0.88      0.88      0.88      9873
        >50k       0.61      0.61      0.61      3152

    accuracy                           0.81     13025
   macro avg       0.74      0.74      0.74     13025
weighted avg       0.81      0.81      0.81     13025



#### Naive Bayes Gaussiana

In [36]:
GNB = GaussianNB()
GNB.fit(X_train, y_train)
y_pred = GNB.predict(X_test)
score_GNB = GNB.score(X_test,y_test)
print('La exactitud del modelo Naive Bayes Gaussiana es:', score_GNB)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))

La exactitud del modelo Naive Bayes Gaussiana es: 0.8192706333973129
              precision    recall  f1-score   support

       <=50k       0.85      0.92      0.89      9873
        >50k       0.66      0.51      0.58      3152

    accuracy                           0.82     13025
   macro avg       0.76      0.71      0.73     13025
weighted avg       0.81      0.82      0.81     13025



#### K-neighbors 

In [39]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
score_knn = knn.score(X_test,y_test)
print('La exactitud del modelo KNN Model es',score_knn)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))

La exactitud del modelo KNN Model es 0.8425335892514395
              precision    recall  f1-score   support

       <=50k       0.89      0.91      0.90      9873
        >50k       0.69      0.63      0.66      3152

    accuracy                           0.84     13025
   macro avg       0.79      0.77      0.78     13025
weighted avg       0.84      0.84      0.84     13025



#### SVC

In [40]:
from sklearn.svm import SVC
svc = SVC(gamma=0.22)
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
score_svc = svc.score(X_test,y_test)
print('La exactitud del modelo SVC es', score_svc)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))

La exactitud del modelo SVC es 0.8530518234165068
              precision    recall  f1-score   support

       <=50k       0.87      0.95      0.91      9873
        >50k       0.78      0.55      0.65      3152

    accuracy                           0.85     13025
   macro avg       0.82      0.75      0.78     13025
weighted avg       0.85      0.85      0.84     13025



#### Regresión logística

In [41]:
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
score_LR = LR.score(X_test,y_test)
print('La exactitus del modelo de regresión logísticas es:', score_LR)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))

La exactitus del modelo de regresión logísticas es: 0.8234932821497121
              precision    recall  f1-score   support

       <=50k       0.84      0.94      0.89      9873
        >50k       0.71      0.46      0.56      3152

    accuracy                           0.82     13025
   macro avg       0.78      0.70      0.72     13025
weighted avg       0.81      0.82      0.81     13025



#### Árboles aleatorios

In [42]:
RF = RandomForestClassifier()
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
score_RF = RF.score(X_test,y_test)
print('La exactitud del modelo de árbol aleatorio es', score_RF)
targets = ['<=50k' , '>50k']
print(classification_report(y_test, y_pred,target_names=targets))


La exactitud del modelo de árbol aleatorio es 0.8517466410748561
              precision    recall  f1-score   support

       <=50k       0.88      0.93      0.90      9873
        >50k       0.73      0.62      0.67      3152

    accuracy                           0.85     13025
   macro avg       0.81      0.77      0.79     13025
weighted avg       0.85      0.85      0.85     13025



In [44]:
forma_tabular = {'Clasificacion':['LogisticRegression','SupportVectorClassifier','RandomForestClassifier','DecisionTree','GaussianNaiveBayes','K-NearestNeighbors'],
                'Exactitud':[score_LR,score_svc,score_RF,score_DT,score_GNB,score_knn]
                }
forma_tabular = pd.DataFrame(forma_tabular,columns= ['Clasificacion','Exactitud'])
print(forma_tabular)

             Clasificacion  Exactitud
0       LogisticRegression   0.823493
1  SupportVectorClassifier   0.853052
2   RandomForestClassifier   0.851747
3             DecisionTree   0.812898
4       GaussianNaiveBayes   0.819271
5       K-NearestNeighbors   0.842534
