<a href="https://colab.research.google.com/github/ivonnics/Machine-Learning/blob/master/How_to_create_and_deal_with_Unbalanced_Datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Taken from: https://machinelearningmastery.com/imbalanced-classification-is-hard/

## Very good references: https://medium.com/strands-tech-corner/unbalanced-datasets-what-to-do-144e0552d9cd & https://elitedatascience.com/imbalanced-classes

https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/

https://www.kdnuggets.com/2017/06/7-techniques-handle-imbalanced-data.html

https://towardsdatascience.com/handling-imbalanced-datasets-in-machine-learning-7a0e84220f28

https://www.analyticsvidhya.com/blog/2017/03/imbalanced-data-classification/

In [0]:
# vary the dataset size for a 1:100 imbalanced dataset
from collections import Counter
from sklearn.datasets import make_classification
from matplotlib import pyplot
from numpy import where
# Dataset sizes to compare; every dataset is generated with the same class weighting.
sizes = [100, 1000, 10000, 100000]
# Create and plot a dataset for each size. Panels are numbered from 1 because
# pyplot.subplot uses 1-based positions on the 2x2 grid.
for panel, n in enumerate(sizes, start=1):
    # create the dataset (two informative features, one cluster per class, no label noise)
    X, y = make_classification(n_samples=n, n_features=2, n_redundant=0,
        n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
    # summarize class distribution
    counter = Counter(y)
    print('Size=%d, Ratio=%s' % (n, counter))
    # define subplot
    pyplot.subplot(2, 2, panel)
    pyplot.title('n=%d' % n)
    pyplot.xticks([])
    pyplot.yticks([])
    # Scatter plot of examples by class label. Iterate over the Counter's keys
    # directly: unpacking into `_` at notebook top level would shadow IPython's
    # "last output" variable.
    for label in counter:
        row_ix = where(y == label)[0]
        pyplot.scatter(X[row_ix, 0], X[row_ix, 1], label=str(label))
    pyplot.legend()
# show the figure
pyplot.show()

In [0]:
pip install -U imbalanced-learn

In [0]:
# https://imbalanced-learn.readthedocs.io/en/stable/api.html
# https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/index.html
import pandas as pd
import imblearn.under_sampling as under
from sklearn.datasets import load_breast_cancer
# Load scikit-learn's bundled breast-cancer dataset and tabulate how many
# samples carry each target label.
breast_cancer = load_breast_cancer()
target_labels = pd.Series(breast_cancer.target)
target_labels.value_counts()


In [0]:
# Under-sample with ClusterCentroids, requesting 300 samples of class 1 and
# 212 samples of class 0; the seed keeps the result repeatable.
target_class_sizes = {1: 300, 0: 212}
UnderSampling = under.ClusterCentroids(
    sampling_strategy=target_class_sizes,
    random_state=83,
    voting='hard',
)
x_resampled, y_resampled = UnderSampling.fit_resample(
    breast_cancer.data,
    breast_cancer.target,
)
# Class counts after resampling
pd.Series(y_resampled).value_counts()

In [0]:
# https://stackoverflow.com/questions/38105539/how-to-convert-a-scikit-learn-dataset-to-a-pandas-dataset/46379878#46379878
# https://stackoverflow.com/questions/48769682/how-do-i-convert-data-from-a-scikit-learn-bunch-object-to-a-pandas-dataframe

def answer_one():
    """Return the scikit-learn breast-cancer dataset as one DataFrame.

    The feature columns are labelled with the dataset's ``feature_names``
    and the class labels are appended as a final column named ``"target"``.
    """
    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_breast_cancer

    bunch = load_breast_cancer()
    # Column labels: every feature name followed by the label column.
    column_names = np.append(bunch.feature_names, ["target"])
    # Glue the label vector onto the feature matrix as an extra column.
    table = np.c_[bunch.data, bunch.target]
    return pd.DataFrame(table, columns=column_names)


answer_one()

In [0]:
# From: https://elitedatascience.com/imbalanced-classes
import pandas as pd
import numpy as np
 
# Read dataset; the column names are supplied explicitly via `names`
df = pd.read_csv('https://raw.githubusercontent.com/ivonnics/Machine-Learning/master/DATA/balance-scale.data', 
                 names=['balance', 'var1', 'var2', 'var3', 'var4'])
 
# Summarise columns and dtypes. DataFrame.info() prints its own report and
# returns None, so wrapping it in print() only appended a stray "None".
df.info()
# Display example observations
df

In [0]:
# Frequency table for every column, in the same order as the original cell.
for column_name in ('balance', 'var1', 'var2', 'var3', 'var4'):
    print(df[column_name].value_counts())

In [0]:
# Turn `balance` into a binary target: 1 for class 'B', 0 for everything else.
# Accepting an already-converted 1 as well makes the cell idempotent: the
# original comparison against 'B' alone mapped every label to 0 when the cell
# was executed a second time.
df['balance'] = [1 if b in ('B', 1) else 0 for b in df.balance]
df['balance'].value_counts()

In [0]:
# Print the DataFrame in its plain-text form.
print(df)

## The Danger of Imbalanced Classes:

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [0]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
 
# Train model on the original (imbalanced) data
clf_0 = LogisticRegression().fit(X, y)
 
# Predict on training set
pred_y_0 = clf_0.predict(X)
# How's the accuracy? Arguments follow the (y_true, y_pred) order used by the
# other evaluation cells in this notebook.
print( accuracy_score(y, pred_y_0) )

In [0]:
# Should we be excited? List the distinct classes the baseline model predicted.
distinct_predictions_0 = np.unique(pred_y_0)
print(distinct_predictions_0)

In [0]:
from sklearn.utils import resample

In [0]:
# Separate majority and minority classes.
# NOTE: DataFrame.info() prints its own report and returns None, so it is
# called directly below instead of being wrapped in print(), which only
# appended a stray "None" after each report.
print('DF Majority:')
df_majority = df[df.balance==0]
print(df_majority)
df_majority.info()
print('__________________________________________________________')
print('DF Minority:')
df_minority = df[df.balance==1]
print(df_minority)
df_minority.info()
print('__________________________________________________________')
# Label corrected from 'UNSAMPLED': this frame is the up-sampled minority class.
print('DF Minority UPSAMPLED:')
# Upsample minority class: sample with replacement until it matches the
# majority class in size; the fixed seed gives reproducible results.
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=123) 
print(df_minority_upsampled)
df_minority_upsampled.info()
print('__________________________________________________________')
# Label corrected from 'UNSAMPLED': this frame is the combined up-sampled data.
print('DF UPSAMPLED:')
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
print(df_upsampled)
df_upsampled.info()
print('__________________________________________________________')
 
# Display new class counts
df_upsampled.balance.value_counts()

In [0]:
# Build the label vector and feature matrix from the up-sampled frame
y = df_upsampled['balance']
print(y)
X = df_upsampled.drop(columns='balance')
print(X)

# Fit a logistic regression on the up-sampled data
clf_1 = LogisticRegression()
clf_1.fit(X, y)

# Predictions for the same rows the model was fitted on
pred_y_1 = clf_1.predict(X)

# Is our model still predicting just one class?
predicted_classes_1 = np.unique(pred_y_1)
print(predicted_classes_1)
# [0 1]

# How's our accuracy?
accuracy_1 = accuracy_score(y, pred_y_1)
print(accuracy_1)
# 0.513888888889

In [0]:
# Separate majority and minority classes
df_majority = df[df.balance==0]
df_minority = df[df.balance==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=len(df_minority),     # to match minority class
                                 random_state=123) # reproducible results

print(df_majority_downsampled)
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" that print(...info()) produced.
df_majority_downsampled.info()
print('______________________________________________________________')
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
print(df_downsampled)
df_downsampled.info()
print('______________________________________________________________')
# Display new class counts
df_downsampled.balance.value_counts()

In [0]:
# Build the label vector and feature matrix from the down-sampled frame
y = df_downsampled['balance']
X = df_downsampled.drop(columns='balance')

# Fit a logistic regression on the down-sampled data and show the estimator
clf_2 = LogisticRegression()
clf_2.fit(X, y)
print(clf_2)
print('__________________________________________')

# Predictions for the same rows the model was fitted on
pred_y_2 = clf_2.predict(X)

# Is our model still predicting just one class?
predicted_classes_2 = np.unique(pred_y_2)
print(predicted_classes_2)
# [0 1]

# How's our accuracy?
accuracy_2 = accuracy_score(y, pred_y_2)
print(accuracy_2)
# 0.581632653061

In [0]:
from sklearn.metrics import roc_auc_score

In [0]:
# Per-class probabilities from the down-sampled model (one row per sample)
class_probabilities_2 = clf_2.predict_proba(X)
print(class_probabilities_2)
print('______________________________________________________________')

# Keep only the second column, i.e. the probability of the positive class
prob_y_2 = [row[1] for row in class_probabilities_2]

# Example: first five positive-class probabilities
prob_y_2[:5]
# [0.45419197226479618,
#  0.48205962213283882,
#  0.46862327066392456,
#  0.47868378832689096,
#  0.58143856820159667]

In [0]:
# Area under the ROC curve for the down-sampled model's positive-class scores
auroc_2 = roc_auc_score(y, prob_y_2)
print(auroc_2)
# 0.568096626406

In [0]:
# Score the first model (clf_0) on whatever X / y currently hold; when the
# cells run top to bottom that is the down-sampled data assigned just before.
prob_y_0 = [row[1] for row in clf_0.predict_proba(X)]

auroc_0 = roc_auc_score(y, prob_y_0)
print(auroc_0)
# 0.530718537415

In [0]:
	from sklearn.svm import SVC

In [0]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
 
# Train model. probability=True makes the fit stochastic (the probability
# estimates involve shuffling the data), so the seed is fixed to keep the
# AUROC below reproducible across runs.
clf_3 = SVC(kernel='linear', 
            class_weight='balanced', # penalize
            probability=True,
            random_state=123)
 
clf_3.fit(X, y)
 
# Predict on training set
pred_y_3 = clf_3.predict(X)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_3 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y, pred_y_3) )
# 0.688
 
# What about AUROC? Keep only the positive-class column of predict_proba.
prob_y_3 = clf_3.predict_proba(X)
prob_y_3 = [p[1] for p in prob_y_3]
print( roc_auc_score(y, prob_y_3) )
# 0.5305236678

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
# Separate input features (X) and target variable (y)
y = df.balance
X = df.drop('balance', axis=1)
 
# Train model. A random forest is stochastic, so the seed is fixed to make the
# metrics below repeatable from run to run.
clf_4 = RandomForestClassifier(random_state=123)
clf_4.fit(X, y)
 
# Predict on training set (these scores are measured on the data the model was fitted on)
pred_y_4 = clf_4.predict(X)
 
# Is our model still predicting just one class?
print( np.unique( pred_y_4 ) )
# [0 1]
 
# How's our accuracy?
print( accuracy_score(y, pred_y_4) )
# 0.9744
 
# What about AUROC? Keep only the positive-class column of predict_proba.
prob_y_4 = clf_4.predict_proba(X)
prob_y_4 = [p[1] for p in prob_y_4]
print( roc_auc_score(y, prob_y_4) )
# 0.999078798186