# Train, Test, Split

In [11]:
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [12]:
# Load the cleaned churn data; the first CSV column is used as the row index.
churn = pd.read_csv(
    '../data/churn_cleaned.csv',
    index_col=0,
)

In [13]:
# Display the column labels of the loaded frame.
churn.columns

Index(['state', 'account_length', 'area_code', 'international_plan',
       'voice_mail_plan', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'churn', 'phone_prefix', 'total_minutes', 'total_calls',
       'total_charge'],
      dtype='object')

In [14]:
# Display the dtype of every column; object columns are the ones later treated
# as categorical.
churn.dtypes

state                      object
account_length              int64
area_code                   int64
international_plan         object
voice_mail_plan            object
number_vmail_messages       int64
total_day_minutes         float64
total_day_calls             int64
total_day_charge          float64
total_eve_minutes         float64
total_eve_calls             int64
total_eve_charge          float64
total_night_minutes       float64
total_night_calls           int64
total_night_charge        float64
total_intl_minutes        float64
total_intl_calls            int64
total_intl_charge         float64
customer_service_calls      int64
churn                       int64
phone_prefix                int64
total_minutes             float64
total_calls                 int64
total_charge              float64
dtype: object

In [15]:
# phone_prefix and area_code are loaded as int64; cast them to strings so they
# get object dtype and are picked up as categorical columns further down.
# Bracket assignment is used instead of attribute assignment (which only works
# for existing, attribute-safe column names), and astype(str) replaces the
# per-element lambda.
churn['phone_prefix'] = churn['phone_prefix'].astype(str)
churn['area_code'] = churn['area_code'].astype(str)

### Train/test split, stratified with respect to y

In [16]:
# Feature matrix: every column except the target.
X = churn.drop(columns=['churn'])

In [17]:
# Target vector: the churn column.
y = churn['churn']

In [18]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions of the target the same in both parts, and the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=3,
)

In [19]:
# Export the untransformed split: X_train, X_test, y_train, y_test.

# Features are written as CSV (the row index becomes the first column).
X_train.to_csv('../data/X_train.csv')
X_test.to_csv('../data/X_test.csv')

# Targets are pickled. Each file is opened in a `with` block so the handle is
# closed (and the bytes flushed) as soon as the dump finishes, instead of being
# left open when the name is rebound.
with open('../data/y_train.pickle', 'wb') as pickle_out:
    pickle.dump(y_train, pickle_out)

with open('../data/y_test.pickle', 'wb') as pickle_out:
    pickle.dump(y_test, pickle_out)

### Define categorical and numeric columns

In [131]:
# Partition the training columns by dtype: object columns are categorical,
# everything else is numeric. Column order follows X_train.
categorical = [
    name for name, dtype in X_train.dtypes.items() if dtype == 'object'
]
numeric = [
    name for name, dtype in X_train.dtypes.items() if not dtype == 'object'
]

# Trailing semicolon keeps the tuple from being echoed as cell output.
categorical, numeric;

### OHE

In [132]:
# ss = StandardScaler()
# ohe = OneHotEncoder(drop='first', sparse=False)

In [133]:
# ct = ColumnTransformer(
#     transformers=[
#         ('num', ss, numeric),
#         ('cat', ohe, categorical)], 
#     remainder='passthrough')

In [134]:
# X_ct = ct.fit_transform(X_train)

In [135]:
# pd.DataFrame(X_ct)

In [136]:
# Keep only the categorical columns (X_train minus the numeric ones) as input
# for one-hot encoding.
X_ohe = X_train[[col for col in X_train.columns if col not in numeric]]

In [137]:
# One-hot encoder that drops the first level of each feature and returns a
# dense array instead of a sparse matrix.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`;
# confirm the installed version before upgrading.
ohe = OneHotEncoder(
    sparse=False,
    drop='first',
)

In [138]:
# Fit the encoder on the categorical training columns and display the encoded
# array as a sanity check; the result is not stored in this cell.
ohe.fit_transform(X_ohe)

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [139]:
# Display the list of columns classified as categorical.
categorical

['state', 'area_code', 'international_plan', 'voice_mail_plan', 'phone_prefix']

In [140]:
# Encode the categorical columns and wrap the resulting array in a DataFrame
# whose columns are the encoder's generated feature names. No index is passed,
# so the frame gets a fresh positional index.
# NOTE(review): get_feature_names() is called without input names, so the
# labels presumably use generic prefixes rather than the original column
# names; verify against the exported CSV.
encoded_values = ohe.fit_transform(X_ohe)
encoded_labels = ohe.get_feature_names()
X_ohe_df = pd.DataFrame(encoded_values, columns=encoded_labels)

### StandardScaler

In [141]:
# Keep only the numeric columns (X_train minus the categorical ones) as input
# for standard scaling.
X_ss = X_train[[col for col in X_train.columns if col not in categorical]]

In [142]:
# Display the numeric columns that will be scaled.
X_ss.columns

Index(['account_length', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'total_minutes', 'total_calls', 'total_charge'],
      dtype='object')

In [143]:
# Fit a StandardScaler on the numeric training columns and rebuild a DataFrame
# with the original column names. No index is passed, so the frame gets a
# fresh positional index.
ss = StandardScaler()
scaled_values = ss.fit_transform(X_ss)
X_ss_df = pd.DataFrame(scaled_values, columns=X_ss.columns)

In [144]:
# Display the scaled numeric features. Rows carry a positional index because
# the frame was built from the scaler's array output without an index.
X_ss_df

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,customer_service_calls,total_minutes,total_calls,total_charge
0,-1.508366,1.658110,0.832457,-1.530204,0.832508,-1.310525,-0.972401,-1.311142,-1.616872,0.193381,-1.618393,-0.987060,0.224309,-0.981136,-0.426884,-1.152825,-1.331638,-0.219875
1,0.432612,-0.592293,0.808653,-0.097455,0.808811,0.379882,0.040214,0.379595,0.056732,0.913424,0.058700,-1.414198,0.633467,-1.416169,-1.191800,0.685933,0.526322,0.774870
2,-0.264149,-0.592293,-0.061110,0.544812,-0.061490,1.648178,0.749044,1.647648,-1.006287,1.119150,-1.004497,0.970658,1.042625,0.969925,-0.426884,0.363710,1.455301,0.475691
3,-0.264149,-0.592293,-0.617759,-0.393886,-0.617276,0.322946,0.698414,0.321851,-1.084361,1.170582,-1.084570,0.223166,-1.003165,0.218503,1.102947,-0.778915,0.758566,-0.623814
4,0.581918,-0.592293,-0.181961,0.396596,-0.182125,-0.444707,-0.466094,-0.444986,0.825468,-0.938114,0.823847,0.685899,-0.594007,0.679902,-0.426884,0.116270,-0.605873,-0.117947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2328,-0.363687,-0.592293,-1.753029,-0.196265,-1.752545,0.389698,-1.883755,0.388834,2.975528,-1.298135,2.976932,0.828279,-0.594007,0.824914,-1.191800,0.825599,-1.970312,-0.686103
2329,0.407728,-0.592293,1.110782,-0.789127,1.110401,-0.519313,0.293368,-0.518898,0.168839,-0.372367,0.169913,-0.168378,-1.003165,-0.163799,-1.191800,0.463786,-0.576842,0.785251
2330,0.606802,-0.592293,-0.771569,-0.295075,-0.771302,0.167845,-0.871140,0.167098,0.747393,0.707697,0.748222,-0.595516,-0.184849,-0.598833,-0.426884,0.022792,-0.286536,-0.491684
2331,-0.911142,1.367735,0.403984,0.050760,0.403820,-1.127938,-0.567355,-1.128672,-1.018298,1.376308,-1.017843,0.721494,-0.594007,0.719451,-0.426884,-0.926279,0.439230,-0.271783


In [145]:
# Join the scaled numeric features and the one-hot encoded categorical features
# side by side. Both frames were built from arrays in X_train's row order and
# share the same positional index, so concat pairs the rows correctly.
X_train_trans = pd.concat([X_ss_df, X_ohe_df], axis=1)

# Restore X_train's row labels: without this the transformed features carry a
# 0..n-1 index and can no longer be aligned by label with X_train or y_train.
# Assigning the index also fails loudly if the row counts ever differ.
X_train_trans.index = X_train.index

### Export data

In [148]:
# Export X_train (original), X_train_trans (transformed), X_test, y_train and
# y_test.

# Features are written as CSV (the row index becomes the first column).
X_train.to_csv('../data/X_train.csv')
X_train_trans.to_csv('../data/X_train_trans.csv')
X_test.to_csv('../data/X_test.csv')

# Targets are pickled. Each file is opened in a `with` block so the handle is
# closed (and the bytes flushed) as soon as the dump finishes, instead of being
# left open when the name is rebound.
with open('../data/y_train.pickle', 'wb') as pickle_out:
    pickle.dump(y_train, pickle_out)

with open('../data/y_test.pickle', 'wb') as pickle_out:
    pickle.dump(y_test, pickle_out)