# **Random Forest and Decision Tree**


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the Pokémon CSV (a pinned revision of a public gist).
url = "https://gist.githubusercontent.com/armgilles/194bcff35001e7eb53a2a8b441e8b2c6/raw/92200bc0a673d5ce2110aaad4544ed6c4010f687/pokemon.csv"

# Read the CSV straight from the URL into a DataFrame.
pokemon = pd.read_csv(filepath_or_buffer=url)
# Left as the last expression so the notebook renders the table.
pokemon

In [None]:
# Columns excluded from the feature matrix; this includes the target 'Legendary'.
non_feature_columns = ['#', 'Name', 'Type 1', 'Type 2', 'Generation', 'Legendary']

# Y is the target column; X is every column that is not excluded above.
Y = pokemon['Legendary']
X = pokemon.drop(columns=non_feature_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation (scikit-learn's default test size).
# `stratify` keeps the share of Legendary rows equal in both splits, so the
# test set always contains the positive class; `random_state` makes the split,
# and every metric computed from it, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, Y.values, stratify=Y.values, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier as RF

# Random forest with bootstrap resampling; every other hyper-parameter is
# left at its default.
classificator = RF(bootstrap=True)
# Fit on the training split. Left as the last expression so the notebook
# shows the fitted estimator.
classificator.fit(X=X_train, y=y_train)

In [None]:
# Show the names of the feature columns kept in X.
X.columns

In [None]:
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Create the figure BEFORE drawing so the tree is rendered on it. Creating it
# after plot_tree() leaves the tree on a default-sized figure and adds an
# empty second one.
fig = plt.figure(figsize=(150, 150))

# Draw the first tree of the forest.
# - feature_names is passed as a plain list of column names.
# - class_names must be a list with one label per class in ascending class
#   order (False, True); a bare string would be indexed character by character.
plot_tree(classificator.estimators_[0],
          feature_names=list(X.columns),
          class_names=['Not legendary', 'Legendary'],
          filled=True, impurity=True,
          rounded=True)
# Uncomment to write the rendered tree to disk:
#fig.savefig('pokemon_randomforest.png')
plt.show()

In [None]:
# Install dtreeviz into the notebook environment (`!` runs a shell command).
# NOTE(review): the version is not pinned; confirm the installed release still
# exposes `dtreeviz.trees.dtreeviz`, which the import in the next cell relies on.
!pip install dtreeviz

In [None]:
from dtreeviz.trees import dtreeviz

In [None]:
# Render the first tree of the forest (estimators_[0]) with dtreeviz.
# The title now matches the tree that is actually drawn, and the class names
# are spelled out in ascending class order (False, True).
viz = dtreeviz(classificator.estimators_[0], X, Y,
               feature_names=list(X.columns),
               class_names=['Not legendary', 'Legendary'],
               title="First decision tree - Pokemon")

viz.save("decision_pokemon.svg")

# Offer the SVG as a browser download when running in Google Colab; elsewhere
# the file simply stays on disk instead of the cell failing on the import.
try:
    from google.colab import files
except ImportError:
    print("Saved decision_pokemon.svg")
else:
    files.download("decision_pokemon.svg")

In [None]:
# Score the fitted forest on the held-out test split (mean accuracy for classifiers).
classificator.score(X_test,y_test)

## Matriz de confusión
![](https://www.researchgate.net/profile/Arda-Aras-2/publication/350487701/figure/fig1/AS:1007018756280322@1617103389957/Confusion-Matrix-for-Binary-Classification-7.ppm)

In [None]:
from sklearn.metrics import confusion_matrix

# Predict the test split, then tabulate true labels against predictions.
y_pred = classificator.predict(X_test)
cf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cf_matrix)

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap.
# fmt='d' prints the cell counts as plain integers; seaborn's default
# annotation format ('.2g') would show a count such as 190 as 1.9e+02.
ax = sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues')

ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# Tick labels must follow the sorted class order used by confusion_matrix
# (False first, then True).
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])
plt.show()

In [None]:
# Flatten the 2x2 confusion matrix row by row: TN, FP, FN, TP.
counts = confusion_matrix(y_test, y_pred).ravel()
tn, fp, fn, tp = counts

# Sensitivity: share of actual positives that were predicted positive.
sensitivity = tp / (tp + fn)
print('Sensitivity', sensitivity)

# Specificity: share of actual negatives that were predicted negative.
specificity = tn / (fp + tn)
print('Specificity', specificity)

# Validación cruzada

In [None]:
from sklearn.model_selection import cross_val_score

# Second forest whose trees are limited to depth 3.
classificator2 = RF(bootstrap=True, max_depth=3)
# 10-fold cross-validation over the full feature matrix; the per-fold scores
# are the cell output.
cross_val_score(estimator=classificator2, X=X, y=Y, cv=10)

In [None]:
# Fit the depth-limited forest on the complete dataset.
classificator2.fit(X=X, y=Y)
# Keep a handle on the last tree of the fitted ensemble.
estimator = classificator2.estimators_[-1]

In [None]:
# Hypothetical creature to classify with the depth-limited forest.
# The values follow the order of the training feature columns listed below.
# It is wrapped in a DataFrame carrying those column names because
# classificator2 was fitted on a DataFrame: predicting on a bare NumPy array
# makes scikit-learn warn that "X does not have valid feature names".
feature_columns = ['Total', 'HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']
agumon = pd.DataFrame([[635, 450, 68, 45, 37, 37, 20]], columns=feature_columns)
classificator2.predict(agumon)

# Principal Component Analysis (PCA)
No es una técnica diferenciada de aprendizaje de máquina, pero es útil para reducir la dimensionalidad de los datos

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import load_breast_cancer

In [None]:
# Load scikit-learn's bundled breast-cancer dataset.
cancer = load_breast_cancer()

In [None]:
# List the fields available on the loaded dataset object.
cancer.keys()

In [None]:
# Print the dataset's description text.
description = cancer['DESCR']
print(description)

In [None]:
# Put the feature matrix in a DataFrame labelled with the feature names.
cancerdf = pd.DataFrame(data=cancer['data'], columns=cancer['feature_names'])
# Preview the first two rows.
cancerdf.head(n=2)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Learn the per-column scaling statistics from the feature table.
scaler = StandardScaler()
scaler.fit(X=cancerdf)

In [None]:
# Apply the fitted scaler; the result is what PCA is run on below.
scaled_data = scaler.transform(X=cancerdf)
scaled_data

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Keep only the first two principal components.
pca = PCA(n_components=2)

In [None]:
# Fit the projection on the scaled features.
pca.fit(X=scaled_data)

In [None]:
# Project the scaled features onto the two fitted components.
x_pca = pca.transform(X=scaled_data)

In [None]:
# Shape of the scaled data before the PCA projection.
scaled_data.shape

In [None]:
# Shape after the projection (PCA was created with n_components=2).
x_pca.shape

In [None]:
# Scatter the samples in the plane of the first two principal components,
# coloured by the dataset's target label.
plt.figure(figsize=(8, 6))
plt.scatter(x_pca[:, 0], x_pca[:, 1], c=cancer['target'])
# Axis labels corrected: the term is "principal" component, not "principle".
plt.xlabel('First principal component')
plt.ylabel('Second principal component')

# **K-Means clustering**
Es un algoritmo de aprendizaje de máquina no supervisado que agrupa datos para predicciones rápidas y toma de decisiones. Estas predicciones se basan en el número de clústeres o agrupaciones (k) y en los valores medios más cercanos (distancias euclidianas).

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Location of the wine CSV (a pinned revision of a public gist).
url = 'https://gist.githubusercontent.com/tijptjik/9408623/raw/b237fa5848349a14a14e5d4107dc7897c21951f5/wine.csv'

# Read the table and print its column/dtype summary.
wine = pd.read_csv(filepath_or_buffer=url)
wine.info()

In [None]:
# Preview the first five rows of the raw table.
wine.head(5)

In [None]:
# Feature table for clustering: every column except 'Wine'.
var_wine = wine.drop(columns=['Wine'])

In [None]:
# Summary statistics of the unscaled feature columns.
var_wine.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Min-max scaler for the wine features. This rebinds `scaler`, which the PCA
# section used for a StandardScaler.
scaler = MinMaxScaler()
scaler.fit(X=var_wine)

In [None]:
# Apply the fitted min-max scaling to the feature table.
wine_scaled = scaler.transform(X=var_wine)
wine_scaled

In [None]:
# Wrap the scaled array in a DataFrame, reusing the original column names.
winedf = pd.DataFrame(data=wine_scaled, columns=var_wine.columns)
# Preview the first three rows.
winedf.head(n=3)

In [None]:
# Replace the column names with short abbreviations, keeping the column order.
short_names = ['Alc', 'Mal', 'Ash', 'Acl', 'Mg', 'Phe', 'Fla',
               'NFla', 'PAnt', 'Col', 'Hue', 'Od', 'Pro']
winedf.columns = short_names

In [None]:
# Preview the first three rows to confirm the abbreviated column names.
winedf.head(3)

In [None]:
import seaborn as sns

In [None]:
# Pair plot of the scaled features, coloured by the class label stored in the
# original table's 'Wine' column (dropped before scaling, so it is re-attached
# here on a temporary copy).
# 'Alc' is a continuous scaled feature: using it as `hue` makes one colour
# group per distinct value, which is slow to draw and shows no class structure.
sns.pairplot(winedf.assign(Wine=wine['Wine'].values), hue='Wine')

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squares).
# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state makes the curve reproducible between runs.
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    kmeans = KMeans(n_clusters=i, max_iter=200, n_init=10, random_state=0)
    kmeans.fit(winedf)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.title('Jambu s Elbow')
plt.show()

In [None]:
# Final clustering with 3 clusters on the scaled features.
# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state keeps the cluster labels stable between runs.
kmeansclustering = KMeans(n_clusters=3, max_iter=300, n_init=10, random_state=0)
kmeansclustering.fit(winedf)

In [None]:
# Attach each row's cluster label to the original (unscaled) table, in place.
wine['KMeans_cluster'] = kmeansclustering.labels_
# Left as the last expression so the notebook renders the table.
wine

In [None]:
# Pair plot of three unscaled columns, coloured by the 'Wine' label.
# NOTE(review): the 'KMeans_cluster' column added above is not plotted here;
# confirm whether hue was meant to be the cluster label.
plotted_columns = ['Alcohol', 'Malic.acid', 'Ash']
sns.pairplot(data=wine, vars=plotted_columns, hue='Wine')

# **Support Vector machine**
El objetivo del algoritmo SVM es crear la mejor línea o límite de decisión que pueda segregar el espacio n-dimensional en clases para que podamos colocar fácilmente el nuevo punto de datos en la categoría correcta en el futuro. Este límite de mejor decisión se llama hiperplano.

SVM elige los puntos/vectores extremos que ayudan a crear el hiperplano. Estos casos extremos se denominan vectores de soporte y, por lo tanto, el algoritmo se denomina Máquina de vectores de soporte. Considere el siguiente diagrama en el que hay dos categorías diferentes que se clasifican utilizando un límite de decisión o hiperplano:
![](https://static.javatpoint.com/tutorial/machine-learning/images/support-vector-machine-algorithm.png)

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import svm

In [None]:
# Load scikit-learn's bundled digits dataset.
digitos = datasets.load_digits()

# Train on every sample except the last 10.
# NOTE: this rebinds X, which the Pokémon cells use for their feature table.
X, y = digitos.data[:-10], digitos.target[:-10]

svmclass = svm.SVC(gamma=0.0001, C=1000)
# Left as the last expression so the notebook shows the fitted estimator.
svmclass.fit(X, y)

In [None]:
# Predict the 10 samples that were held out of training (the last 10) instead
# of the training rows themselves, and print the true labels next to the
# predictions so they can be compared.
print('Predicted:', svmclass.predict(digitos.data[-10:]))
print('Actual:   ', digitos.target[-10:])

In [None]:
# Display the image at index 6 of the digits dataset, without smoothing.
sample_image = digitos.images[6]
plt.imshow(sample_image, interpolation='nearest')

## **Tarea**
Aplique por lo menos dos técnicas aprendidas de machine learning al conjunto de moléculas publicadas en: A novel series of highly potent 2,6,9-trisubstituted purine cyclin-dependent kinase inhibitors (PMID: 23829517 DOI: 10.1021/jm4006884) https://pubmed.ncbi.nlm.nih.gov/23829517/, y emplee DataWarrior para calcular las propiedades que usted considere relevantes.