# Import packages

In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Load X_test

In [25]:
# Read the held-out feature table; the first CSV column becomes the row index.
X_test = pd.read_csv(filepath_or_buffer='../data/X_test.csv', index_col=0)

In [26]:
# Display the column labels of the loaded test frame.
X_test.columns

Index(['account_length', 'area_code', 'international_plan', 'voice_mail_plan',
       'number_vmail_messages', 'total_day_minutes', 'total_day_calls',
       'total_day_charge', 'total_eve_minutes', 'total_eve_calls',
       'total_eve_charge', 'total_night_minutes', 'total_night_calls',
       'total_night_charge', 'total_intl_minutes', 'total_intl_calls',
       'total_intl_charge', 'customer_service_calls', 'total_minutes',
       'total_calls', 'total_charge'],
      dtype='object')

In [27]:
# Turn every area code into its string form so the column is picked up as
# object-typed (categorical) by the dtype check that follows.
X_test['area_code'] = X_test['area_code'].map(str)

# Define categorical and numeric columns

In [28]:
# Partition the columns by dtype: object-typed columns are treated as
# categorical, every other column as numeric. Column order is preserved.
categorical = [col for col in X_test.columns if X_test[col].dtypes == 'object']
numeric = [col for col in X_test.columns if col not in categorical]

categorical, numeric

(['area_code', 'international_plan', 'voice_mail_plan'],
 ['account_length',
  'number_vmail_messages',
  'total_day_minutes',
  'total_day_calls',
  'total_day_charge',
  'total_eve_minutes',
  'total_eve_calls',
  'total_eve_charge',
  'total_night_minutes',
  'total_night_calls',
  'total_night_charge',
  'total_intl_minutes',
  'total_intl_calls',
  'total_intl_charge',
  'customer_service_calls',
  'total_minutes',
  'total_calls',
  'total_charge'])

# One-hot encoding (OHE) of the categorical columns

In [29]:
# Keep only the categorical columns by discarding every numeric one.
X_ohe = X_test.drop(numeric, axis='columns')

In [30]:
# Dense output (sparse=False) with the first level of each feature dropped.
ohe = OneHotEncoder(sparse=False, drop='first')

In [31]:
# Fit the encoder on the categorical columns and show the encoded array.
ohe.fit(X_ohe).transform(X_ohe)

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 1., 0.],
       ...,
       [1., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [32]:
# Display the list of categorical column names again.
categorical

['area_code', 'international_plan', 'voice_mail_plan']

In [33]:
# Encode the categorical columns and wrap the resulting array in a DataFrame
# whose columns are the encoder's generated feature names.
# NOTE(review): the encoder is fitted on the test split here; if an encoder
# fitted on the training data is available, confirm whether it should be
# reused so both splits share the same categories.
# NOTE(review): get_feature_names() was removed in newer scikit-learn releases
# in favour of get_feature_names_out() - confirm the pinned version.
ohe_values = ohe.fit_transform(X_ohe)
X_ohe_df = pd.DataFrame(ohe_values, columns=ohe.get_feature_names())

# Standard scaling of the numeric columns (StandardScaler)

In [34]:
# Keep only the numeric columns by discarding every categorical one.
X_ss = X_test.drop(categorical, axis='columns')

In [35]:
# Display the column labels of the numeric-only frame.
X_ss.columns

Index(['account_length', 'number_vmail_messages', 'total_day_minutes',
       'total_day_calls', 'total_day_charge', 'total_eve_minutes',
       'total_eve_calls', 'total_eve_charge', 'total_night_minutes',
       'total_night_calls', 'total_night_charge', 'total_intl_minutes',
       'total_intl_calls', 'total_intl_charge', 'customer_service_calls',
       'total_minutes', 'total_calls', 'total_charge'],
      dtype='object')

In [36]:
# Standardize the numeric columns and rebuild a DataFrame that keeps the
# original column names.
# NOTE(review): the scaler is fitted on the test split here; if a scaler
# fitted on the training data is available, confirm whether it should be
# reused so both splits are scaled with the same statistics.
ss = StandardScaler()
scaled_values = ss.fit(X_ss).transform(X_ss)
X_ss_df = pd.DataFrame(scaled_values, columns=X_ss.columns)

In [37]:
# Evaluate the scaled frame; the trailing semicolon suppresses the notebook's
# rendering of the result.
X_ss_df;

In [39]:
# Put the scaled numeric columns and the one-hot encoded columns side by side.
X_test_trans = pd.concat([X_ss_df, X_ohe_df], axis=1)
# Both pieces were rebuilt from bare NumPy arrays, so they carry a fresh
# 0..n-1 index. Restore the row labels of X_test (rows are still in the same
# order) so the transformed rows can be matched back to the source rows.
X_test_trans.index = X_test.index

# Export data

In [40]:
### Datasets in this project: X_train (original), X_train_trans (transformed), X_test, y_train, y_test
# Write the transformed test features to disk (the row index is written as
# the first CSV column).
X_test_trans.to_csv(path_or_buf='../data/X_test_trans.csv')