## Cargamos datos

In [None]:
import pandas as pd
import numpy as np
 
# Load the unbalanced dataset; the path is relative to the working directory
df = pd.read_csv(filepath_or_buffer='datos_no_balanceados.csv')

# Preview the first five rows
df.head(n=5)


## Comprobamos cuántos clientes hay de cada clase

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many rows fall into each value of the 'client' column
df.client.value_counts().plot(kind='bar')


In [None]:
# Same per-class row counts as the bar chart, printed as text
print(df.client.value_counts())


## Dividimos en train y test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# The 'client' column is the target; every remaining column is a feature
X = df.drop(columns='client')
y = df['client']
# Hold-out split with sklearn's default proportions, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [None]:
# Fit a baseline random forest on the original (unbalanced) training data
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the reported scores do not change from run to run
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predicted vs. real labels side by side. Printed explicitly because a bare
# expression in the middle of a cell is evaluated but never displayed.
predictions = pd.DataFrame(data={"predict": clf.predict(X_test), "real": y_test})
print(predictions.head())

# The labels name the model that is actually being scored (a random forest)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

# Per-feature importance scores of the fitted forest (last expression, so displayed)
clf.feature_importances_

## ¿Está bien la accuracy que hemos obtenido?
Hemos obtenido un 92% de acierto sobre los datos de test.
¿Hemos cumplido con nuestro cometido?

In [None]:
# Compare the test-set predictions of the current model with the true labels
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_true, y_pred)
print(cm)

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()
for label, count in (("True Negative", tn),
                     ("False Positive", fp),
                     ("False Negative", fn),
                     ("True Positive", tp)):
    print(label + ": " + str(count))

## Absolutamente no hemos cumplido nuestros objetivos

## Opción 1: Upsample la categoría minoritaria

In [None]:
from sklearn.utils import resample
# Split the rows by class: client == 0 is the majority, client == 1 the minority
df_majority = df[df.client == 0]
df_minority = df[df.client == 1]

# Upsample the minority class with replacement. The target size is read from
# the data instead of being hard-coded, so the classes stay balanced even if
# the CSV changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match majority class size
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts (both classes now have len(df_majority) rows)
df_upsampled.client.value_counts()

In [None]:
from sklearn.utils import resample

# Split the ORIGINAL data first. Upsampling before the split would put copies
# of the same minority rows in both train and test, leaking training data
# into the evaluation and inflating the test score.
y = df.client
X = df.drop('client', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Upsample the minority class inside the training partition only;
# the test set keeps its original, untouched rows.
train = X_train.assign(client=y_train)
train_majority = train[train.client == 0]
train_minority = train[train.client == 1]
train_minority_upsampled = resample(train_minority,
                                    replace=True,                   # sample with replacement
                                    n_samples=len(train_majority),  # match majority class size
                                    random_state=123)               # reproducible results
train_upsampled = pd.concat([train_majority, train_minority_upsampled])
y_train = train_upsampled.client
X_train = train_upsampled.drop('client', axis=1)

# Train model (fixed seed so the scores are reproducible)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict on test set
pred_y_1 = clf.predict(X_test)

# The labels name the model that is actually being scored (a random forest)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

In [None]:
# Score the current model on the test set and break the result down by outcome
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = clf.predict(X_test)

matrix = confusion_matrix(y_true, y_pred)
print(matrix)

# Flattened row by row, the 2x2 matrix reads TN, FP, FN, TP
tn, fp, fn, tp = matrix.ravel()
outcome_names = ["True Negative", "False Positive", "False Negative", "True Positive"]
for outcome, total in zip(outcome_names, (tn, fp, fn, tp)):
    print("{}: {}".format(outcome, total))

## Opción 2: Downsample la categoría mayoritaria


In [None]:
# Split the rows by class: client == 0 is the majority, client == 1 the minority
df_majority = df[df.client == 0]
df_minority = df[df.client == 1]

# Downsample the majority class without replacement. The target size is read
# from the data instead of being hard-coded, so the classes stay balanced even
# if the CSV changes.
df_majority_downsampled = resample(df_majority,
                                   replace=False,               # sample without replacement
                                   n_samples=len(df_minority),  # match minority class size
                                   random_state=123)            # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts (both classes now have len(df_minority) rows)
df_downsampled.client.value_counts()

In [None]:
# Separate input features (X) and target variable (y) of the downsampled data
y = df_downsampled.client
X = df_downsampled.drop('client', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Train model (fixed seed so the scores are reproducible)
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict on test set
pred_y_1 = clf.predict(X_test)

# The labels name the model that is actually being scored (a random forest)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

In [None]:
# Predict the test set with the current model and tabulate hits and misses
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = clf.predict(X_test)

conf_mat = confusion_matrix(y_true, y_pred)
print(conf_mat)

# Unpack the flattened 2x2 matrix (row-major order: TN, FP, FN, TP)
tn, fp, fn, tp = conf_mat.ravel()
print("\n".join([
    "True Negative: " + str(tn),
    "False Positive: " + str(fp),
    "False Negative: " + str(fn),
    "True Positive: " + str(tp),
]))

## Otras técnicas: ROC Curve, darle pesos al algoritmo

In [None]:
# Go back to the original (unbalanced) frame: 'client' is the target,
# all other columns are features
X = df.drop(columns='client')
y = df['client']
# Same seeded hold-out split as used for the baseline model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


In [None]:
# Train model: errors on class 1 are weighted 20 times as much as errors on
# class 0. The seed is fixed so the scores are reproducible.
clf = RandomForestClassifier(class_weight={0:1,1:20}, random_state=0).fit(X_train, y_train)

# Predict on test set
pred_y_1 = clf.predict(X_test)

# The labels name the model that is actually being scored (a random forest)
print('Accuracy of Random Forest classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Random Forest classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

# Keep the test-set predictions and compare them with the true labels
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = clf.predict(X_test)

# Compute the matrix once and reuse it for both the printout and the unpacking
cm = confusion_matrix(y_true, y_pred)
print(cm)

# ravel() flattens the 2x2 matrix row by row: TN, FP, FN, TP
tn, fp, fn, tp = cm.ravel()
print("True Negative: " + str(tn))
print("False Positive: " + str(fp))
print("False Negative: " + str(fn))
print("True Positive: " + str(tp))