# Step 2b: One-hot encoding (OHE) and standard scaling (SS), fitted on X_train

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [24]:
# Load the training features; the first CSV column is used as the row index.
X_train = pd.read_csv(
    filepath_or_buffer='../data/X_train.csv',
    index_col=0,
)

### Define categorical and numeric columns

In [25]:
# Cast phone_prefix and area_code to strings so the dtype check in the next
# cell classifies them as categorical (object dtype) rather than numeric.
X_train['phone_prefix'] = X_train['phone_prefix'].apply(str)
X_train['area_code'] = X_train['area_code'].apply(str)

In [26]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, every other column as numeric. Column order is preserved.
categorical = [col for col in X_train.columns if X_train[col].dtypes == 'object']
numeric = [col for col in X_train.columns if not X_train[col].dtypes == 'object']

# Display both lists as the cell output.
categorical, numeric

(['state',
  'area_code',
  'international_plan',
  'voice_mail_plan',
  'phone_prefix'],
 ['account_length',
  'number_vmail_messages',
  'total_day_minutes',
  'total_day_calls',
  'total_day_charge',
  'total_eve_minutes',
  'total_eve_calls',
  'total_eve_charge',
  'total_night_minutes',
  'total_night_calls',
  'total_night_charge',
  'total_intl_minutes',
  'total_intl_calls',
  'total_intl_charge',
  'customer_service_calls',
  'total_minutes',
  'total_calls',
  'total_charge'])

### OHE

In [27]:
# Keep only the categorical columns by dropping every numeric one.
X_ohe = X_train.drop(numeric, axis='columns')

In [28]:
# One-hot encoder that drops the first category of each feature and returns
# a dense array instead of a sparse matrix.
ohe = OneHotEncoder(
    drop='first',
    sparse=False,
)

In [29]:
# Fit the encoder on the categorical columns and display the encoded array
# as the cell output (the result is not stored here).
ohe.fit_transform(X_ohe)

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
# Display the list of categorical column names.
categorical

['state', 'area_code', 'international_plan', 'voice_mail_plan', 'phone_prefix']

In [31]:
# Fit the encoder and wrap the encoded array in a DataFrame. The encoder must
# be fitted before get_feature_names() is called, so `data` is evaluated first.
# NOTE(review): get_feature_names() is called without input_features, so the
# columns presumably get generic x0_/x1_/... prefixes rather than the original
# column names; consider passing `categorical` if nothing downstream relies on
# the generic names.
X_ohe_df = pd.DataFrame(
    data=ohe.fit_transform(X_ohe),
    columns=ohe.get_feature_names(),
)

### StandardScaler

In [32]:
# Keep only the numeric columns by dropping every categorical one.
X_ss = X_train.drop(categorical, axis='columns')

In [33]:
# Display the names of the columns that will be scaled.
X_ss.columns

Index(['account_length', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'total_minutes', 'total_calls', 'total_charge'],
      dtype='object')

In [34]:
# Fit a StandardScaler on the numeric training columns and wrap the scaled
# array in a DataFrame that reuses the original column names.
ss = StandardScaler()
X_ss_df = pd.DataFrame(
    data=ss.fit_transform(X_ss),
    columns=X_ss.columns,
)

In [35]:
# Evaluate the scaled frame; the trailing semicolon suppresses the cell output.
X_ss_df;

In [36]:
# Combine the scaled numeric columns with the one-hot encoded columns.
# Both frames were built from bare NumPy arrays, so they carry a default
# 0..n-1 index in the same row order as X_train and line up positionally.
X_train_trans = pd.concat([X_ss_df, X_ohe_df], axis=1)

# Restore the original row labels that were loaded with X_train (index_col=0).
# Without this, the transformed frame silently loses them and can no longer be
# aligned by label with X_train.
# NOTE(review): presumably y_train shares these labels -- confirm against the
# notebook that performs the split.
X_train_trans.index = X_train.index

### Export data

In [37]:
# Datasets: X_train (original), X_train_trans (transformed), X_test, y_train, y_test.
# This cell writes only X_train_trans; its index is saved as the first CSV column.
X_train_trans.to_csv(path_or_buf='../data/X_train_trans.csv')