# <a>How to use oversampling without overfitting!</a>



In [1]:
!pip install notebook pandas scikit-learn imblearn



You should consider upgrading via the 'F:\leon\venvs\ml\Scripts\python.exe -m pip install --upgrade pip' command.





In [2]:
import pandas as pd

In [3]:
# PIMA indians
df = pd.read_csv('./data/diabetes.csv')

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
268/len(df) * 100

34.89583333333333

In [6]:
100 - (268/len(df) * 100)

65.10416666666667

In [7]:
X = df.drop('Outcome', axis=1)
y = df[['Outcome']]

X.shape, y.shape

((768, 8), (768, 1))

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((576, 8), (192, 8), (576, 1), (192, 1))

In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

def cross_validation(model, X, y, oversampling=False):
    kfold = KFold(n_splits=10)

    fold_accuracies = []
    
    for idx, (idx_train, idx_validation) in enumerate(kfold.split(X)):
        X_train_split = X.iloc[idx_train, :]
        y_train_split = y.iloc[idx_train, :]
    
        if oversampling:
            sm = SMOTE(random_state=42)
            X_train_split, y_train_split = sm.fit_resample(X_train_split, y_train_split)
        
        model.fit(X_train_split, y_train_split.values.flatten())
    
        X_validation_split = X.iloc[idx_validation, :]
        y_validation_split = y.iloc[idx_validation, :]
        
        predictions = model.predict(X_validation_split)
        
        fold_accuracy = accuracy_score(y_validation_split, predictions)
        
        fold_accuracies.append(fold_accuracy)
        
        print(f'{idx} fold accuracy: {fold_accuracy}')
    
    return fold_accuracies
    

In [10]:
from sklearn.ensemble import HistGradientBoostingClassifier
from statistics import mean

# Criando o modelo
hgb = HistGradientBoostingClassifier()

In [11]:
no_smote_accuracy = mean(cross_validation(hgb, X_train, y_train, oversampling=False)) * 100

0 fold accuracy: 0.7586206896551724
1 fold accuracy: 0.7586206896551724
2 fold accuracy: 0.7241379310344828
3 fold accuracy: 0.7758620689655172
4 fold accuracy: 0.7586206896551724
5 fold accuracy: 0.8275862068965517
6 fold accuracy: 0.7192982456140351
7 fold accuracy: 0.7719298245614035
8 fold accuracy: 0.6842105263157895
9 fold accuracy: 0.7719298245614035


In [12]:
no_smote_accuracy

75.50816696914701

In [13]:
smote_accuracy = mean(cross_validation(hgb, X_train, y_train, oversampling=True)) * 100

0 fold accuracy: 0.7931034482758621
1 fold accuracy: 0.7758620689655172
2 fold accuracy: 0.7413793103448276
3 fold accuracy: 0.7068965517241379
4 fold accuracy: 0.7068965517241379
5 fold accuracy: 0.7758620689655172
6 fold accuracy: 0.6666666666666666
7 fold accuracy: 0.7894736842105263
8 fold accuracy: 0.631578947368421
9 fold accuracy: 0.7192982456140351


In [14]:
smote_accuracy

73.07017543859648

In [15]:
f'No smote: {no_smote_accuracy:.02f}, smote: {smote_accuracy:.02f}'

'No smote: 75.51, smote: 73.07'

## Watch out! The code below is WRONG, just to see how easily mistaken one can be ;)

In [17]:
from sklearn.model_selection import cross_val_score

# WRONG! Don't balance before train test split
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Splits created by cross_val_score have balaenced validation splits, which is WRONG :)
mean(cross_val_score(hgb, X_train, y_train.values.flatten())) * 100

80.16912751677853