# Import modules

In [None]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.over_sampling  import RandomOverSampler, SMOTE
from imblearn.combine        import SMOTEENN, SMOTETomek

import seaborn as sns

In [None]:
# Seed passed as random_state to the samplers below that accept one,
# so their resampling is repeatable between runs.
RANDOM_STATE = 123

# Load data

In [None]:
# Read the dataset from CSV, using the 'UDI' column as the row index.
data = pd.read_csv('../../data/ai4i2020.csv', index_col='UDI')

# Clean data

In [None]:
def clean_data(data, drop_type=True, encode_mf=False):
    ''' Drop non-feature columns and optionally re-encode the target.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF'
        and 'RNF' (and 'Type' when ``drop_type`` is true).
    drop_type : bool, default True
        Drop the 'Type' column.
    encode_mf : bool, default False
        Overwrite 'Machine failure' with a multi-class code:
        0 = no failure flag set, 1 = TWF, 2 = HDF, 3 = PWF, 4 = OSF, 5 = RNF.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after cleaning.

    Raises
    ------
    KeyError
        If a column that should be dropped or read is missing (for example
        when the function is applied twice to the same frame).
    '''

    if drop_type:
        data.drop(columns='Type', inplace=True)

    if encode_mf:
        # TODO: needs improvement for rows with several failure modes set
        # (e.g. encode combinations). For now the mode listed last wins.
        failure_modes = ('TWF', 'HDF', 'PWF', 'OSF', 'RNF')
        data['Machine failure'] = 0
        for code, mode in enumerate(failure_modes, start=1):
            # Single .loc assignment: chained indexing
            # (data[col][mask] = v) may write to a temporary copy and is
            # a no-op under pandas copy-on-write.
            data.loc[data[mode] == 1, 'Machine failure'] = code

    data.drop(columns=['Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'], inplace=True)

    return data

In [None]:
# Apply the default cleaning: drops 'Type', 'Product ID' and the five
# failure-mode flag columns. clean_data modifies `data` in place.
data = clean_data(data)

In [None]:
# Separate the target column from the features; `data` itself is left intact.
y = data['Machine failure'].copy()
X = data.drop(columns='Machine failure')

In [None]:
# Bar chart of how many rows fall in each 'Machine failure' class.
y.value_counts().plot(kind='bar')

# Imbalance

In [None]:
# Row count per 'Machine failure' class.
data['Machine failure'].value_counts()

## Under-sampling

In [None]:
# Random under-sampling, seeded with RANDOM_STATE and with replacement=True.
rus = RandomUnderSampler(random_state=RANDOM_STATE, replacement=True)
X_rus, y_rus = rus.fit_resample(X, y)

In [None]:
# Class counts after random under-sampling.
y_rus.value_counts().plot(kind='bar')

## Tomek

In [None]:
# Under-sample with TomekLinks, using sampling_strategy='majority'.
tomek = TomekLinks(sampling_strategy='majority')
X_tom, y_tom = tomek.fit_resample(X, y)

In [None]:
# Class counts after TomekLinks resampling.
y_tom.value_counts().plot(kind='bar')

## NearMiss

In [None]:
# Under-sample with NearMiss, using its default settings.
near = NearMiss()
X_near, y_near = near.fit_resample(X, y)

In [None]:
# Class counts after NearMiss resampling.
y_near.value_counts().plot(kind='bar')

## Over-sampling

In [None]:
# Random over-sampling, seeded with RANDOM_STATE.
ros = RandomOverSampler(random_state=RANDOM_STATE)
X_ros, y_ros = ros.fit_resample(X, y)

In [None]:
# Class counts after random over-sampling.
y_ros.value_counts().plot(kind='bar')

## SMOTE

In [None]:
# Over-sample with SMOTE, seeded with RANDOM_STATE.
smote = SMOTE(random_state=RANDOM_STATE)
X_smo, y_smo = smote.fit_resample(X, y)

In [None]:
# Class counts after SMOTE resampling.
y_smo.value_counts().plot(kind='bar')

# Resampling helper function

In [None]:
def imbalance(X, y, method='RUS', random_state=None):
    ''' Resample (X, y) with the chosen imbalanced-learn sampler.

    Parameters
    ----------
    X, y
        Features and target, passed straight to ``fit_resample``.
    method : str, default 'RUS'
        One of 'RUS', 'Tomek', 'NearMiss', 'ROS', 'SMOTE', 'SMOTEENN',
        'SMOTETomek'.
    random_state : int or None, default None
        Seed forwarded to the samplers that accept one. 'Tomek' and
        'NearMiss' are built without it.

    Returns
    -------
    tuple
        ``((X_resampled, y_resampled), sampler)`` where ``sampler`` is the
        very instance that produced the resampled data.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    '''

    # Lambdas defer construction so only the requested sampler is built.
    factories = {
        'RUS': lambda: RandomUnderSampler(random_state=random_state),
        'Tomek': lambda: TomekLinks(),
        'NearMiss': lambda: NearMiss(),
        'ROS': lambda: RandomOverSampler(random_state=random_state),
        'SMOTE': lambda: SMOTE(random_state=random_state),
        'SMOTEENN': lambda: SMOTEENN(random_state=random_state),
        'SMOTETomek': lambda: SMOTETomek(random_state=random_state),
    }

    try:
        make_sampler = factories[method]
    except KeyError:
        raise ValueError(
            f"unknown method {method!r}; expected one of {sorted(factories)}"
        ) from None

    # Build a single sampler and return that same instance, rather than a
    # second, unfitted one.
    sampler = make_sampler()
    return sampler.fit_resample(X, y), sampler

In [None]:
# Resample with the 'SMOTETomek' method; only the resampled data is kept,
# the returned sampler is ignored.
data_bal = imbalance(X, y, method='SMOTETomek')[0]
X_b, y_b = data_bal
# Class counts after resampling.
y_b.value_counts().plot(kind='bar')