# Step 02a: SMOTE

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from imblearn.over_sampling import SMOTE

In [2]:
# Load the cleaned churn table, using the first CSV column as the row index.
churn = pd.read_csv(
    '../data/churn_cleaned.csv',
    index_col=0,
)

In [3]:
# Show the column names of the loaded table.
churn.columns

Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'churn', 'phone_prefix'],
      dtype='object')

In [4]:
# Inspect each column's dtype before building the feature matrix.
churn.dtypes

state                       int64
account_length              int64
area_code                   int64
international_plan          int64
voice_mail_plan             int64
number_vmail_messages       int64
total_day_minutes         float64
total_day_calls             int64
total_day_charge          float64
total_eve_minutes         float64
total_eve_calls             int64
total_eve_charge          float64
total_night_minutes       float64
total_night_calls           int64
total_night_charge        float64
total_intl_minutes        float64
total_intl_calls            int64
total_intl_charge         float64
customer_service_calls      int64
churn                       int64
phone_prefix                int64
dtype: object

### Time to SMOTE

In [5]:
# Feature matrix: every column except the `churn` target.
X = churn.drop(columns='churn')

In [6]:
# Target vector: the `churn` column.
y = churn['churn']

In [7]:
# Oversample with SMOTE, using a fixed seed so the resampling is repeatable.
# NOTE(review): fit_resample is applied to the full X/y here, so the resampled
# rows are derived from every original row — confirm they are not used for
# held-out evaluation.
sm = SMOTE(random_state=3)
X_sm, y_sm = sm.fit_resample(X, y)

In [8]:
# Wrap the resampled features in a DataFrame labelled with the original column names.
X_sm_df = pd.DataFrame(data=X_sm, columns=X.columns)

In [9]:
# Compare feature-matrix shapes before and after resampling.
X.shape, X_sm.shape

((3333, 20), (5700, 20))

In [10]:
# Compare target shapes before and after resampling.
y.shape, y_sm.shape

((3333,), (5700,))

In [11]:
# Class counts in the original (pre-resampling) target.
y.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [12]:
# Put the resampled target in a one-column frame and tally each class.
y_sm_df = pd.DataFrame(data=y_sm, columns=['churn'])
y_sm_df['churn'].value_counts()

1    2850
0    2850
Name: churn, dtype: int64

### Train test split data with SMOTE

In [18]:
# Split the ORIGINAL rows first, then oversample only the training portion.
# Splitting data that has already been resampled lets synthetic rows that were
# interpolated from (what become) test rows land in the training set, and puts
# synthetic rows in the test set — both leak information and inflate scores.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=3,
)

# Balance the classes in the training data only; the test set keeps the
# original rows and class distribution so evaluation reflects real data.
X_train, y_train = SMOTE(random_state=3).fit_resample(X_train_raw, y_train_raw)

# Keep the original column labels on the resampled training features.
X_train = pd.DataFrame(X_train, columns=X.columns)

In [19]:
# Persist the feature splits as CSV (the DataFrame index is written as well).
X_train.to_csv('../data/X_train_sm.csv')
X_test.to_csv('../data/X_test_sm.csv')

# Persist the targets as pickles. The `with` blocks close each file as soon as
# it has been written, so the data is flushed to disk and no handle is leaked
# for the lifetime of the kernel.
with open('../data/y_train_sm.pickle', 'wb') as pickle_out:
    pickle.dump(y_train, pickle_out)

with open('../data/y_test_sm.pickle', 'wb') as pickle_out:
    pickle.dump(y_test, pickle_out)