# Maestría en Ciencia de Datos e Inteligencia Artificial
#### 8. Machine Learning and Deep Learning
#### Docente: Msc. Renzo Claure Aracena.

### Creando una base desbalanceada

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
# Load the dataset: fields are separated by ';' and decimals are written with ','.
cancer = pd.read_csv(
    'cancer.csv',
    sep=';',
    decimal=',',
)

In [None]:
# Relative frequency of each value of the 'Tipo' label in the full dataset.
cancer['Tipo'].value_counts(normalize=True)

In [None]:
# Build the imbalanced working set: order the rows by label and keep the first 400.
cancer_db = cancer.sort_values(by='Tipo').head(400)
# Relative frequency of each label in the reduced set.
cancer_db[['Tipo']].value_counts(normalize=True)


In [None]:
# Features: every column except the label ('Tipo') and the 'ID' column.
X = cancer_db.drop(columns=['Tipo', 'ID'])

# Target: encode the label as integers (M -> 1, B -> 0).
# An explicit mapping is used instead of Series.replace(['M', 'B'], [1, 0])
# because that call depends on pandas implicitly downcasting the object column
# to integers, which is deprecated (FutureWarning in pandas 2.2). The cast
# also fails loudly if a label other than 'M' or 'B' is present.
y = cancer_db['Tipo'].map({'M': 1, 'B': 0}).astype('int64')

# Seeded hold-out split so the rest of the notebook is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
from sklearn.svm import SVC

In [None]:
# First model: an RBF-kernel SVC fitted on the training split.
clasif = SVC(kernel='rbf').fit(X_train, y_train)

In [None]:
# Score of the RBF SVC on the held-out test split.
clasif.score(X_test, y_test)

### Clasificador Base o Clasificador tonto (Dummy)

In [None]:
# Class proportions in the training target; the reference point for the dummy baselines.
y_train.value_counts(normalize=True)

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline classifier configured with the 'most_frequent' strategy.
dummy_mayor = DummyClassifier(strategy='most_frequent')
dummy_mayor.fit(X_train, y_train)
# Its predictions for the test split, displayed as the cell output.
y_dummy_predic = dummy_mayor.predict(X_test)
y_dummy_predic

In [None]:
# Score of the dummy baseline on the test split, for comparison with the real models.
dummy_mayor.score(X_test, y_test)

In [None]:
# Linear-kernel SVC with C=1, fitted on the training split and scored on the test split.
svm = SVC(C=1, kernel='linear')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

### Matriz de confusión

In [None]:
from sklearn.metrics import confusion_matrix

# Refit the 'most_frequent' baseline and build its confusion matrix on the test split.
dummy_mayor = DummyClassifier(strategy='most_frequent')
dummy_mayor.fit(X_train, y_train)
y_mayor_pred = dummy_mayor.predict(X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=y_mayor_pred)

In [None]:
# Shape of the test target, to compare against the counts in the confusion matrix.
y_test.shape

In [None]:
# Display the most recently computed confusion matrix.
confusion

In [None]:
# Baseline with the 'stratified' strategy (random predictions that follow the
# training class distribution).
# It gets its own name so the most-frequent baseline `dummy_mayor` is not
# overwritten: the report titled 'Dummy_mayor' further down reads that
# variable and would otherwise silently describe this stratified model.
# random_state is fixed so the random predictions are reproducible.
dummy_estrat = DummyClassifier(strategy='stratified', random_state=0).fit(X_train, y_train)
y_estrat_pred = dummy_estrat.predict(X_test)
confusion = confusion_matrix(y_test, y_estrat_pred)

In [None]:
# Display the most recently computed confusion matrix.
confusion

In [None]:
# Linear SVC with C=100: fit, predict on the test split, and show its confusion matrix.
svm = SVC(C=100, kernel='linear')
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=svm_pred)
confusion

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (up to 500 solver iterations) and its test confusion matrix.
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
confusion = confusion_matrix(y_true=y_test, y_pred=lr_pred)
confusion

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth 2). random_state is fixed because scikit-learn
# randomly permutes the features at every split, so ties between equally good
# splits could otherwise produce a different tree on each run. The train/test
# split is seeded as well, so this keeps the notebook reproducible end to end.
ad = DecisionTreeClassifier(max_depth=2, random_state=0).fit(X_train, y_train)
tree_predicted = ad.predict(X_test)
confusion = confusion_matrix(y_test, tree_predicted)

print('Decision tree classifier (max_depth = 2)\n', confusion)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# Confusion matrix of the decision tree, with rows/columns ordered by the tree's classes.
confusion = confusion_matrix(
    y_true=y_test,
    y_pred=tree_predicted,
    labels=ad.classes_,
)
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion,
    display_labels=ad.classes_,
)
disp.plot()

### Medidas de efectividad (Binarias)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Predict once and reuse the result for every metric.
svm_pred = svm.predict(X_test)

# The estimator's own score, followed by the metrics from sklearn.metrics.
print(f'Score: {svm.score(X_test, y_test):.3f}')
for label, metric_fn in (
    ('Accuracy_score', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
):
    print(f'{label}: {metric_fn(y_test, svm_pred):.3f}')

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
# Confusion matrix of the linear SVM on the test split.
# Label order and tick labels come from the SVM's own `classes_` (they were
# previously borrowed from the decision tree `ad`), so the plot always matches
# the model being shown, whatever `ad` holds at the time.
confusion = confusion_matrix(y_test, svm.predict(X_test), labels=svm.classes_)
disp = ConfusionMatrixDisplay(confusion, display_labels=svm.classes_)
disp.plot()

### Matriz con todos los indicadores

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 report for the decision tree on the test split.
tree_report = classification_report(
    y_test,
    ad.predict(X_test),
    target_names=['B', 'M'],
)
print("Árboles de Dec.:\n", tree_report)

In [None]:
# Print one classification report per fitted model, each preceded by its title.
for title, model in (
    ('Dummy_mayor:\n', dummy_mayor),
    ('Arboles de dec.', ad),
    ('Regresion Log.', lr),
    ('Maq. Soporte Vectorial', svm),
):
    print(title, classification_report(y_test, model.predict(X_test), target_names=['B', 'M']))


### Balanceo, lo haremos solo para SVM

In [None]:
# The cells below import `imblearn`; install the imbalanced-learn package first if it is missing:
#pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Balancing must be applied to the training sample only; the test split is left as-is.
# Seeded random over-sampler, so the resampled training set is reproducible.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)

In [None]:
# Fit a linear SVM on the oversampled training set.
# It is stored under its own name rather than `ad`, which holds the decision
# tree fitted earlier: reusing `ad` would replace the tree with an SVM, and
# any tree report cell re-run afterwards would describe the wrong model.
svm_ros = SVC(kernel='linear').fit(X_train_res, y_train_res)
# Predictions on the resampled training set and on the untouched test split.
predicted_train = svm_ros.predict(X_train_res)
predicted_test = svm_ros.predict(X_test)

In [None]:
# Training metrics: report computed on the resampled training set.
print(classification_report(y_train_res, predicted_train, target_names = ['B', 'M']))

In [None]:
# Test metrics: report computed on the original (non-resampled) test split.
print(classification_report(y_test, predicted_test, target_names = ['B', 'M']))

In [None]:
# Confusion matrix on the resampled training set.
confusion_train = confusion_matrix(y_train_res, predicted_train)

In [None]:
# Plot the training confusion matrix, labelling the axes with the classes in `ad.classes_`.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_train,
    display_labels=ad.classes_,
)
disp.plot()

In [None]:
# Confusion matrix on the test split, then its plot.
confusion_test = confusion_matrix(y_true=y_test, y_pred=predicted_test)
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_test,
    display_labels=ad.classes_,
)
disp.plot()

In [None]:
# Disabled alternative kept for reference: an RBF-kernel SVC using class_weight='balanced'.
#clf = SVC(class_weight='balanced', kernel='rbf', C=1.0)

In [None]:
# Reference run: linear SVC (C=100) trained on the original, non-resampled training split.
svm = SVC(C=100, kernel='linear')
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
# Reuse the stored predictions for the report.
print('Maq. Soporte Vectorial', classification_report(y_test, svm_pred, target_names=['B', 'M']))

In [None]:
# Same linear SVC (C=100), trained on a randomly over-sampled training set.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
svm = SVC(C=100, kernel='linear')
svm.fit(X_train_res, y_train_res)
svm_pred = svm.predict(X_test)
# Reuse the stored predictions for the report.
print('Maq. Soporte Vectorial', classification_report(y_test, svm_pred, target_names=['B', 'M']))

In [None]:
# Same linear SVC (C=100), using class_weight='balanced' on the original training split
# instead of resampling.
svm = SVC(C=100, kernel='linear', class_weight='balanced')
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
# Reuse the stored predictions for the report.
print('Maq. Soporte Vectorial', classification_report(y_test, svm_pred, target_names=['B', 'M']))

In [None]:
from imblearn.over_sampling import SMOTE
# Same linear SVC (C=100), trained on a SMOTE-resampled training set (seeded).
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
svm = SVC(C=100, kernel='linear').fit(X_train_res, y_train_res)
y_pred = svm.predict(X_test)
print('Maq. Soporte Vectorial con SMOTE')
print(classification_report(y_test, y_pred, target_names=['B', 'M']))