# Step 02a: Train, Test, Split and Then SMOTE of training data

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from imblearn.over_sampling import SMOTENC

In [29]:
churn = pd.read_csv('../data/churn_cleaned.csv', index_col=0)

In [30]:
churn.columns

Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'churn', 'phone_prefix', 'total_minutes', 'total_calls',
       'total_charge'],
      dtype='object')

In [31]:
churn.dtypes

state                      object
account_length              int64
area_code                   int64
international_plan         object
voice_mail_plan            object
number_vmail_messages       int64
total_day_minutes         float64
total_day_calls             int64
total_day_charge          float64
total_eve_minutes         float64
total_eve_calls             int64
total_eve_charge          float64
total_night_minutes       float64
total_night_calls           int64
total_night_charge        float64
total_intl_minutes        float64
total_intl_calls            int64
total_intl_charge         float64
customer_service_calls      int64
churn                       int64
phone_prefix                int64
total_minutes             float64
total_calls                 int64
total_charge              float64
dtype: object

In [32]:
churn.phone_prefix = churn.phone_prefix.apply(lambda p: str(p))

churn.phone_prefix

0       382
1       371
2       358
3       375
4       330
       ... 
3328    414
3329    370
3330    328
3331    364
3332    400
Name: phone_prefix, Length: 3333, dtype: object

In [33]:
categorical = []
i = 0
for c in churn.columns:
    if churn[c].dtypes == 'object':
        categorical.append(i)
        i += 1
    else:
        i += 1
        
categorical

[0, 3, 4, 20]

### Train test split with stratifiction in respect to y

In [34]:
X = churn.drop(columns = 'churn', axis = 1)

In [35]:
y = churn.churn

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.3, 
                                                    stratify = y, 
                                                    random_state = 3)

In [37]:
X_train

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,phone_prefix,total_minutes,total_calls,total_charge
1263,MA,40,408,no,yes,31,224.7,69,38.20,134.5,...,104,5.41,7.5,5,2.03,1,351,487.0,259,57.07
5,AL,118,510,yes,no,0,223.4,98,37.98,220.6,...,118,9.18,6.3,6,1.70,0,391,654.2,323,67.61
2214,CT,90,415,no,no,0,175.9,111,29.90,285.2,...,122,6.79,13.0,7,3.51,1,347,624.9,355,64.44
130,NE,90,415,no,no,0,145.5,92,24.74,217.7,...,123,6.61,10.9,2,2.94,3,353,521.0,331,52.79
3196,OR,124,510,no,no,0,169.3,108,28.78,178.6,...,82,10.90,12.2,3,3.29,1,337,602.4,284,58.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202,MO,86,415,no,no,0,83.5,96,14.20,221.1,...,75,15.74,12.6,3,3.40,0,385,666.9,237,52.13
417,AZ,117,408,no,no,0,239.9,84,40.78,174.8,...,93,9.43,9.8,2,2.65,0,417,634.0,285,67.72
1458,MD,125,408,no,no,0,137.1,94,23.31,209.8,...,114,10.73,8.6,4,2.32,1,349,593.9,295,54.19
1004,DE,64,415,no,yes,27,201.3,101,34.22,143.8,...,127,6.76,12.3,3,3.32,1,402,507.6,320,56.52


### Time to SMOTE the training data

In [38]:
sm = SMOTENC(categorical_features = categorical, random_state=3, n_jobs = -1)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [46]:
pd.DataFrame(X_train_sm, columns = X_train.columns)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,...,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,phone_prefix,total_minutes,total_calls,total_charge
0,MA,40,408,no,yes,31,224.7,69,38.2,134.5,...,104,5.41,7.5,5,2.03,1,351,487,259,57.07
1,AL,118,510,yes,no,0,223.4,98,37.98,220.6,...,118,9.18,6.3,6,1.7,0,391,654.2,323,67.61
2,CT,90,415,no,no,0,175.9,111,29.9,285.2,...,122,6.79,13,7,3.51,1,347,624.9,355,64.44
3,NE,90,415,no,no,0,145.5,92,24.74,217.7,...,123,6.61,10.9,2,2.94,3,353,521,331,52.79
4,OR,124,510,no,no,0,169.3,108,28.78,178.6,...,82,10.9,12.2,3,3.29,1,337,602.4,284,58.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3985,PA,104.727,510,no,no,0,234.759,90.9659,39.9121,275.443,...,107.83,9.9984,8.8,3.67803,2.38,0.96592,361.727,663.6,319.814,75.706
3986,VT,112.728,415,no,no,0,272.161,111.272,46.2702,268.843,...,104.49,9.68359,8.58162,9.89798,2.32177,1,418.524,781.6,343.15,81.1248
3987,NM,147.736,415,no,no,0,137.24,113.093,23.3293,208.713,...,78.2342,7.41785,12.2859,2.54648,3.31342,3.68773,383.654,448,274.952,51.802
3988,NC,57.692,510,yes,no,0,207.084,102.231,35.2053,221.831,...,99.154,4.4131,14.3692,3.61538,3.8769,3.15387,378.693,534.6,313.077,62.3507


In [39]:
X_train_sm_df = pd.DataFrame(X_train_sm, columns = X.columns)

In [40]:
X_train.shape, X_train_sm.shape

((2333, 23), (3990, 23))

In [41]:
y_train.shape, y_train_sm.shape

((2333,), (3990,))

In [42]:
y.value_counts()

0    2850
1     483
Name: churn, dtype: int64

In [43]:
y_sm_df = pd.DataFrame(y_train_sm, columns=['churn'])
y_sm_df.churn.value_counts()

1    1995
0    1995
Name: churn, dtype: int64

### Export data

In [44]:
### Export X_train (original), X_train_sm (smote), and X_test
X_train.to_csv('../data/X_train.csv')
X_train_sm_df.to_csv('../data/X_train_sm.csv')
X_test.to_csv('../data/X_test.csv')

### Export y_train (original), y_train_sm (smote), and y_test
pickle_out = open('../data/y_train.pickle', 'wb')
pickle.dump(y_train, pickle_out)

pickle_out = open('../data/y_train_sm.pickle', 'wb')
pickle.dump(y_train_sm, pickle_out)

pickle_out = open('../data/y_test.pickle', 'wb')
pickle.dump(y_test, pickle_out)