# Step 02a: Train/Test Split, Then SMOTE on the Training Data

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from imblearn.over_sampling import SMOTE

In [18]:
# Load the cleaned churn dataset; the first CSV column becomes the row index.
churn = pd.read_csv(
    '../data/churn_cleaned.csv',
    index_col=0,
)

In [19]:
# Show the column labels of the loaded frame (features plus the `churn` target).
churn.columns

Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'churn', 'phone_prefix', 'total_minutes', 'total_calls',
       'total_charge'],
      dtype='object')

In [20]:
# Show each column's dtype; the captured output lists only int64/float64 columns.
churn.dtypes

state                       int64
account_length              int64
area_code                   int64
international_plan          int64
voice_mail_plan             int64
number_vmail_messages       int64
total_day_minutes         float64
total_day_calls             int64
total_day_charge          float64
total_eve_minutes         float64
total_eve_calls             int64
total_eve_charge          float64
total_night_minutes       float64
total_night_calls           int64
total_night_charge        float64
total_intl_minutes        float64
total_intl_calls            int64
total_intl_charge         float64
customer_service_calls      int64
churn                       int64
phone_prefix                int64
total_minutes             float64
total_calls                 int64
total_charge              float64
dtype: object

### Train/test split, stratified with respect to y

In [21]:
# Feature matrix: every column except the `churn` target.
# `columns=` already selects the column axis, so the redundant `axis=1` is omitted.
X = churn.drop(columns='churn')

In [22]:
# Target vector: the `churn` column (bracket access, since the frame shares the name).
y = churn['churn']

In [23]:
# Hold out 30% of the rows for testing, stratifying on y; the seed is fixed so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=3,
)

### Time to SMOTE the training data

In [24]:
# Oversample with SMOTE (seeded for reproducibility). Only the training
# partition is resampled; X_test / y_test are left untouched.
# NOTE(review): columns such as state / area_code / international_plan look like
# integer-encoded categoricals; plain SMOTE interpolates feature values, so
# SMOTENC may be the better fit — confirm.
sm = SMOTE(random_state=3)
X_train_sm, y_train_sm = sm.fit_resample(X=X_train, y=y_train)

In [25]:
# Wrap the resampled features in a DataFrame carrying the original feature names.
X_train_sm_df = pd.DataFrame(data=X_train_sm, columns=X.columns)

In [26]:
# Compare feature-matrix shapes before and after resampling.
X_train.shape, X_train_sm.shape

((2333, 23), (3990, 23))

In [27]:
# Compare target shapes before and after resampling (row counts should match the features).
y_train.shape, y_train_sm.shape

((2333,), (3990,))

In [28]:
# Class counts in the full, pre-split target.
y.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [29]:
# Put the resampled target in a one-column frame and count rows per class.
y_sm_df = pd.DataFrame(data=y_train_sm, columns=['churn'])
y_sm_df['churn'].value_counts()

1    1995
0    1995
Name: churn, dtype: int64

### Export data

In [30]:
### Export X_train (original), X_train_sm (smote), and X_test
X_train.to_csv('../data/X_train.csv')
X_train_sm_df.to_csv('../data/X_train_sm.csv')
X_test.to_csv('../data/X_test.csv')

### Export y_train (original), y_train_sm (smote), and y_test
# Each pickle is written inside a `with` block so the file handle is flushed
# and closed as soon as the dump finishes, instead of being left open.
with open('../data/y_train.pickle', 'wb') as pickle_out:
    pickle.dump(y_train, pickle_out)

with open('../data/y_train_sm.pickle', 'wb') as pickle_out:
    pickle.dump(y_train_sm, pickle_out)

with open('../data/y_test.pickle', 'wb') as pickle_out:
    pickle.dump(y_test, pickle_out)