<a href="https://colab.research.google.com/github/jrmaza/machine-learning/blob/main/MLA_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decision Tree (árbol de decisión) Machine Learning
Un árbol de decisión es un algoritmo de aprendizaje supervisado con una estructura jerárquica en forma de árbol, que consta de un nodo raíz (root node), ramas (branches), nodos internos (internal nodes) y nodos hoja (leaf nodes).

![](https://www.ibm.com/content/dam/connectedassets-adobe-cms/worldwide-content/cdp/cf/ul/g/10/3c/Decision-Tree-Example.component.xl.ts=1640801899950.png/content/adobe-cms/us/en/topics/decision-trees/jcr:content/root/table_of_contents/intro/complex_narrative/items/content_group_1441304462/image)

El algoritmo de aprendizaje del árbol de decisión emplea una estrategia de divide y vencerás: realiza una búsqueda voraz (greedy search) para identificar los puntos de división óptimos dentro del árbol.

Este proceso de división se repite de forma recursiva de arriba hacia abajo hasta que todos o la mayoría de los registros se hayan clasificado bajo etiquetas de clase específicas. Que todos los puntos de datos se clasifiquen o no como conjuntos homogéneos depende en gran medida de la complejidad del árbol de decisión.

In [None]:
!pip install session_info

In [40]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns

In [41]:
# Location of the breast-cancer CSV (raw file served from GitHub).
url_breastcancer = 'https://raw.githubusercontent.com/jrmaza/machine-learning/main/breast_cancer_dataset.csv'

In [42]:
# Download and parse the CSV into a DataFrame (needs network access).
cancerdf = pd.read_csv(url_breastcancer)

In [43]:
# Preview the first rows to check which columns were loaded.
cancerdf.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [44]:
# Working copy without the row identifier and the 'Unnamed: 32' column
# (which shows no values in the preview of the raw data).
cancer = cancerdf.drop(columns=['Unnamed: 32', 'id'])

In [None]:
# Class balance of the target column.
cancer['diagnosis'].value_counts()

In [46]:
# Number of rows per diagnosis label ('M' / 'B'), used by the bar chart of the class balance.
is_malignant = cancer['diagnosis'] == 'M'
is_benign = cancer['diagnosis'] == 'B'
maligno = int(is_malignant.sum())
benigno = int(is_benign.sum())

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart with one bar per diagnosis class and its row count.
plt.rcdefaults()
labels = ('Maligno', 'Benigno')
counts = (maligno, benigno)
positions = np.arange(len(labels))

fig, ax = plt.subplots()
ax.barh(positions, counts, align='center')
ax.set_yticks(positions)
ax.set_yticklabels(labels)
ax.invert_yaxis()  # first label is drawn at the top
ax.set_xlabel('Count')
ax.set_title('Diagnóstico')
# Write each count just past the end of its bar.
for row, count in enumerate(counts):
    ax.text(count + 10, row, str(count), color='black', va='center', fontweight='normal')
plt.show()

In [48]:
# Split the columns by cardinality: a column with at most 8 distinct values is treated as
# qualitative, every other column as quantitative. Column order is preserved in both lists.
distinct_counts = [(feature, len(cancer[feature].unique())) for feature in cancer.columns]
qualitative = [feature for feature, n_distinct in distinct_counts if n_distinct <= 8]
quantitative = [feature for feature, n_distinct in distinct_counts if not n_distinct <= 8]

In [None]:
# Columns with at most 8 distinct values.
qualitative

In [None]:
# Columns with more than 8 distinct values.
quantitative

In [51]:
# Encode the target as integers: malignant ('M') -> 1, benign ('B') -> 0.
# The mapping reads the untouched labels in `cancerdf` (same row index as `cancer`), which
# keeps this cell idempotent: mapping the already-encoded `cancer['diagnosis']` a second
# time would turn every value into NaN.
# sklearn.preprocessing.LabelEncoder would produce the same codes, because it sorts the
# class labels ('B' -> 0, 'M' -> 1).
cancer['diagnosis'] = cancerdf['diagnosis'].map({'M': 1, 'B': 0})

In [None]:
# Check the encoded target on a small preview instead of rendering the whole frame.
cancer.head()

In [None]:
# Heatmap of the pairwise correlations among the 15 columns most positively correlated with
# 'diagnosis' (the target itself is part of that selection).
n_top = 15
corr_matrix = cancer.corr()
top_columns = corr_matrix.nlargest(n_top, 'diagnosis').index
top_corr = cancer[top_columns].corr()

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    top_corr,
    square=True,
    ax=ax,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    annot_kws={'size': 12},
)
plt.title('Top correlated features of dataset', size=16)
plt.show()

In [None]:
# Distribution of 'concave points_worst': density-normalised histogram plus KDE curve.
# histplot(kde=True, stat='density') is the supported replacement for the deprecated distplot.
ax = sns.histplot(cancer['concave points_worst'], kde=True, stat='density')
ax.set_title('Distribution of concave points_worst');

In [55]:
# Feature matrix: every column except the target.
x = cancer.drop(columns=['diagnosis'])

In [56]:
# Target vector: the 'diagnosis' column.
y = cancer['diagnosis']

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
# Hold out 15% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state=89)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a tree limited to a single split (max_depth=1).
# random_state pins the feature permutation scikit-learn applies at every split, so ties
# between equally good splits are broken the same way on each run and the scores repeat.
decision_tree = DecisionTreeClassifier(max_depth=1, random_state=89)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
score = decision_tree.score(x_train, y_train)
print('Training Score:', score)
score = decision_tree.score(x_test, y_test)
print('Testing Score:', score)

In [None]:
print('DecisionTreeClassifier')
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Tree limited to depth 5; random_state makes the fitted tree, and therefore every score
# printed below, reproducible between runs.
decision_tree = DecisionTreeClassifier(max_depth=5, random_state=89)
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(x_test)
score = decision_tree.score(x_train, y_train)
print('Training Score:', score)
score = decision_tree.score(x_test, y_test)
print('Testing Score:', score)

output = pd.DataFrame({'Predicted': y_pred})  # 1 = malignant, 0 = benign
print(output.head())

# Share of test samples predicted as malignant (class 1). rate_people stays a 0-1 fraction
# and is formatted as a percentage only when printed; it is 0 when there are no predictions.
people = output.loc[output.Predicted == 1, 'Predicted']
rate_people = len(people) / len(output) if len(output) > 0 else 0
print("% de personas con cancer maligno (predicción):", f"{rate_people:.1%}")

# Kept under model-specific names so later cells can compare models.
score_dtc = score
out_dtc = output
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions in `y_pred` (expected to hold the decision
# tree's predictions). Rows are actual labels, columns are predicted labels.
# The result gets its own name so the imported `confusion_matrix` function is not shadowed,
# and `labels=` guarantees a 2x2 matrix even if one class is missing from the predictions.
class_names = [0, 1]  # 0 = benign, 1 = malignant
cm_tree = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; the heatmap is drawn on `ax` explicitly.
sns.heatmap(
    pd.DataFrame(cm_tree, index=class_names, columns=class_names),
    annot=True, cmap='Greens', fmt='g', ax=ax,
)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for decision tree')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve of the decision tree on the test split.
# Column 1 of predict_proba is taken as the score of the positive class (label 1).
y_probabilities = decision_tree.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_probabilities)
auc_value = roc_auc_score(y_test, y_probabilities)

plt.figure(figsize=(10, 6))
plt.title('ROC para árbol de decisión')
plt.plot(fpr, tpr, linewidth=5, color='green')
plt.plot([0, 1], ls='--', linewidth=5)  # chance diagonal
plt.plot([0, 0], [1, 0], c='.5')        # left edge (x = 0)
plt.plot([1, 1], c='.5')                # top edge (y = 1)
plt.text(0.2, 0.6, f'AUC: {auc_value:.2f}', size=16)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

In [None]:
print('RandomForestClassifier')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Forest of 100 trees, each limited to depth 5. random_state fixes the bootstrap samples and
# the per-split feature sub-sampling, so the scores below are reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
score = random_forest.score(x_train, y_train)
print('Training Score:', score)
score = random_forest.score(x_test, y_test)
print('Testing Score:', score)

output = pd.DataFrame({'Predicted': y_pred})  # 1 = malignant, 0 = benign
print(output.head())

# Share of test samples predicted as malignant (class 1). rate_people stays a 0-1 fraction
# and is formatted as a percentage only when printed; it is 0 when there are no predictions.
people = output.loc[output.Predicted == 1, 'Predicted']
rate_people = len(people) / len(output) if len(output) > 0 else 0
print("% of people predicted with a malignant tumor:", f"{rate_people:.1%}")

# Kept under model-specific names so later cells can compare models.
score_rfc = score
out_rfc = output
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions in `y_pred` (expected to hold the random
# forest's predictions). Rows are actual labels, columns are predicted labels.
# The result gets its own name so the imported `confusion_matrix` function is not shadowed,
# and `labels=` guarantees a 2x2 matrix even if one class is missing from the predictions.
class_names = [0, 1]  # 0 = benign, 1 = malignant
cm_forest = confusion_matrix(y_test, y_pred, labels=class_names)

fig, ax = plt.subplots()
# Tick labels come from the DataFrame's index/columns; the heatmap is drawn on `ax` explicitly.
sns.heatmap(
    pd.DataFrame(cm_forest, index=class_names, columns=class_names),
    annot=True, cmap='Greens', fmt='g', ax=ax,
)
ax.xaxis.set_label_position('top')
plt.tight_layout()
ax.set_title('Confusion matrix for random forest')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()

In [None]:
# Start of the Support Vector Machine section: the classifier and the evaluation helpers it uses.
print('Support Vector Machine')
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix


In [None]:
# Baseline SVM: SVC with its default hyper-parameters, fitted on x_train as produced by the split.
svc_model=SVC()
svc_model.fit(x_train,y_train)

In [102]:
# Test-set predictions of the baseline SVC.
y_predict=svc_model.predict(x_test)

In [None]:
# Confusion matrix of the predictions in `y_predict` (expected to come from the SVC fitted on
# unscaled features). Rows are actual labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_predict)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)  # fmt='g' prints plain counts
ax.set_title('Confusion matrix for SVM (unscaled features)')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()  # renders the figure and keeps the Axes repr out of the cell output

In [None]:
# Per-class metrics for the predictions currently held in y_predict.
print(classification_report(y_test,y_predict))

In [105]:
from sklearn.preprocessing import MinMaxScaler

# Scale every feature to [0, 1]. The scaler learns each column's min/max from the training
# split only, and the test split is transformed with those same parameters. Re-fitting on the
# test split would scale train and test inconsistently and leak test-set statistics into the
# evaluation. Scaled test values may therefore fall slightly outside [0, 1]; that is expected.
sc = MinMaxScaler(feature_range=(0, 1))
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

In [119]:
# Linear-kernel SVC (C=1) fitted on the min-max scaled training features.
svc_model_scaled=SVC(kernel='linear',C=1)
# Alternative kernels kept for experimentation; enable one in place of the line above.
#svc_model_scaled=SVC(kernel='poly',C=1)
#svc_model_scaled=SVC(kernel='sigmoid',C=1)
svc_model_scaled.fit(x_train_scaled,y_train)

In [120]:
# Test-set predictions of the scaled-feature SVC; this rebinds y_predict from the baseline SVC.
y_predict=svc_model_scaled.predict(x_test_scaled)

In [None]:
# Confusion matrix of the predictions in `y_predict` (expected to come from the SVC fitted on
# min-max scaled features). Rows are actual labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_predict)
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)  # fmt='g' prints plain counts
ax.set_title('Confusion matrix for SVM (scaled features)')
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
plt.show()  # renders the figure and keeps the Axes repr out of the cell output

In [None]:
# Per-class metrics for the predictions currently held in y_predict.
print(classification_report(y_test,y_predict))

In [122]:
# Record the package, Python and OS versions used for this run, as plain text (html=False).
import session_info
session_info.show(html=False)

-----
matplotlib          3.5.3
numpy               1.22.4
pandas              1.3.5
seaborn             0.11.2
session_info        1.0.0
sklearn             1.2.1
-----
IPython             7.9.0
jupyter_client      6.1.12
jupyter_core        5.2.0
notebook            6.3.0
-----
Python 3.8.10 (default, Nov 14 2022, 12:59:47) [GCC 9.4.0]
Linux-5.10.147+-x86_64-with-glibc2.29
-----
Session information updated at 2023-03-06 17:34
