# Data Prep

This notebook builds on the data prep notebook from Assignment 4 and covers:
* Initial data prep
* [Column Transformation](#Column-Transformation)
* Feature selection

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set the default figure face colour to white ('w') for all matplotlib figures.
plt.rcParams['figure.facecolor']='w'

In [6]:

# Load the source CSV into `teledf`; the path is relative to the working directory.
teledf = pd.read_csv('data/telcomarketing.csv')

In [3]:
# Basic discovery: preview five randomly drawn rows.
# No random_state is passed, so a different set of rows is shown on each run.

teledf.sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
3128,CO,90,408,No,No,0,109.9,102,18.68,220.8,114,18.77,104.0,133,4.68,10.9,6,2.94,0,False
3306,NY,209,415,No,No,0,153.7,105,26.13,188.6,87,16.03,200.8,95,9.04,10.7,2,2.89,0,False
2218,OH,134,415,No,No,0,7.8,86,1.33,171.4,100,14.57,186.5,80,8.39,12.9,2,3.48,2,False
995,ND,81,408,Yes,Yes,37,237.1,76,40.31,264.2,125,22.46,271.3,120,12.21,7.9,3,2.13,1,False
3329,AL,106,408,No,Yes,29,83.6,131,14.21,203.9,131,17.33,229.5,73,10.33,8.1,3,2.19,1,False


In [4]:
# (rows, columns) of the loaded frame.
teledf.shape

(3333, 20)

In [5]:
# Distinct-value count per column; low counts (e.g. 'Area code' with 3 values)
# point to columns that behave as categories even when stored as numbers.
teledf.nunique()

State                       51
Account length             212
Area code                    3
International plan           2
Voice mail plan              2
Number vmail messages       46
Total day minutes         1667
Total day calls            119
Total day charge          1667
Total eve minutes         1611
Total eve calls            123
Total eve charge          1440
Total night minutes       1591
Total night calls          120
Total night charge         933
Total intl minutes         162
Total intl calls            21
Total intl charge          162
Customer service calls      10
Churn                        2
dtype: int64

In [6]:
# Storage dtype of every column (object columns will need encoding).
teledf.dtypes

State                      object
Account length              int64
Area code                   int64
International plan         object
Voice mail plan            object
Number vmail messages       int64
Total day minutes         float64
Total day calls             int64
Total day charge          float64
Total eve minutes         float64
Total eve calls             int64
Total eve charge          float64
Total night minutes       float64
Total night calls           int64
Total night charge        float64
Total intl minutes        float64
Total intl calls            int64
Total intl charge         float64
Customer service calls      int64
Churn                        bool
dtype: object

In [7]:
# Number of missing values in each column.
teledf.isna().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [5]:
# QC: spot-check five random rows.
# NOTE(review): the saved output of this cell lists 'Total minutes',
# 'Total calls' and 'Total charge', which no visible cell creates, and its
# execution count is out of sequence. The output looks stale -- re-run from a
# fresh kernel to confirm.

teledf.sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,...,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn,Total minutes,Total calls,Total charge
355,OH,100,510,No,No,0,278.0,76,47.26,176.7,...,126,9.88,8.3,4,2.24,0,True,682.5,280,74.4
2240,IN,108,415,No,No,0,201.1,101,34.19,170.7,...,113,10.68,11.6,3,3.13,3,False,620.8,303,62.51
3194,OK,84,510,No,No,0,203.4,125,34.58,182.9,...,121,9.62,13.8,2,3.73,1,False,613.8,336,63.48
3222,WV,87,415,No,No,0,58.0,125,9.86,67.5,...,136,8.37,11.5,3,3.11,0,False,322.9,380,27.08
31,MA,78,415,No,No,0,130.8,64,22.24,223.7,...,108,10.25,10.0,5,2.7,1,False,592.3,293,54.2


## Column Transformation

In [7]:
# Group the columns of teledf by role so that each group can be encoded differently.
target_var = ['Churn']
bin_var = ['International plan', 'Voice mail plan']
cat_var = ['State', 'Area code']

# All non-continuous columns, ordered nominal -> binary -> target.
non_cont = [*cat_var, *bin_var, *target_var]

# Every remaining column is handled as continuous.
cont_var = teledf.drop(columns=non_cont).columns.tolist()

In [10]:
# QC: show the dtypes of the nominal group and of the continuous group.
# 'Area code' is stored as int64 but is placed in the nominal group.

teledf[cat_var].dtypes, teledf[cont_var].dtypes

(State        object
 Area code     int64
 dtype: object, Account length              int64
 Number vmail messages       int64
 Total day minutes         float64
 Total day calls             int64
 Total day charge          float64
 Total eve minutes         float64
 Total eve calls             int64
 Total eve charge          float64
 Total night minutes       float64
 Total night calls           int64
 Total night charge        float64
 Total intl minutes        float64
 Total intl calls            int64
 Total intl charge         float64
 Customer service calls      int64
 dtype: object)

In [11]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [12]:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the new keyword first and fall back to the
# old one on releases that do not accept it (they raise TypeError), so this
# cell runs on both old and current scikit-learn versions.
try:
    nominal_encoder = OneHotEncoder(sparse_output = False)
except TypeError:
    nominal_encoder = OneHotEncoder(sparse = False)

# Output columns are stacked in the order the transformers are listed:
# continuous -> target -> binary -> one-hot nominal.
# Columns not named in any group are dropped (remainder = 'drop').
data_transformer = ColumnTransformer(transformers = [
    ('cont', 'passthrough', cont_var),
    ('target', 'passthrough', target_var),
    ('binary', OrdinalEncoder(), bin_var),
    ('nominal', nominal_encoder, cat_var)],
                                      remainder = 'drop')

In [14]:

# Fit the transformer on the full frame so the encoders learn their categories.
data_transformer.fit(teledf)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cont', 'passthrough',
                                 ['Account length', 'Number vmail messages',
                                  'Total day minutes', 'Total day calls',
                                  'Total day charge', 'Total eve minutes',
                                  'Total eve calls', 'Total eve charge',
                                  'Total night minutes', 'Total night calls',
                                  'Total night charge', 'Total intl minutes',
                                  'Tot...ls', 'Total intl charge',
                                  'Customer service calls']),
                                ('target', 'passthrough', ['Churn']),
                                ('binary',
                                 OrdinalEncoder(categories='auto',
                                                dtype=<class 'num

In [15]:
# Category values learned by the fitted one-hot encoder: one array per column in cat_var.
nom_name=data_transformer.named_transformers_['nominal'].categories_

In [16]:

# Label each one-hot output column as "<source column>_<category>", keeping
# the encoder's order: all categories of the first column, then the next.
transformed_nomcat = [
    col + '_' + str(category)
    for col, categories in zip(cat_var, nom_name)
    for category in categories
]

In [17]:

# Inspect the generated one-hot column labels.
transformed_nomcat

['State_AK',
 'State_AL',
 'State_AR',
 'State_AZ',
 'State_CA',
 'State_CO',
 'State_CT',
 'State_DC',
 'State_DE',
 'State_FL',
 'State_GA',
 'State_HI',
 'State_IA',
 'State_ID',
 'State_IL',
 'State_IN',
 'State_KS',
 'State_KY',
 'State_LA',
 'State_MA',
 'State_MD',
 'State_ME',
 'State_MI',
 'State_MN',
 'State_MO',
 'State_MS',
 'State_MT',
 'State_NC',
 'State_ND',
 'State_NE',
 'State_NH',
 'State_NJ',
 'State_NM',
 'State_NV',
 'State_NY',
 'State_OH',
 'State_OK',
 'State_OR',
 'State_PA',
 'State_RI',
 'State_SC',
 'State_SD',
 'State_TN',
 'State_TX',
 'State_UT',
 'State_VA',
 'State_VT',
 'State_WA',
 'State_WI',
 'State_WV',
 'State_WY',
 'Area code_408',
 'Area code_415',
 'Area code_510']

In [18]:
# Apply the fitted transformer to the same frame; column labels are attached
# separately when the result is wrapped in a DataFrame.
teledf_trans = data_transformer.transform(teledf)

In [20]:
# Labels for the transformed columns. The order must mirror the transformer
# order in data_transformer (cont, target, binary, nominal); otherwise the
# labels are attached to the wrong columns.
trans_col = cont_var + target_var + bin_var  + transformed_nomcat

In [25]:
# Wrap the transformed values in a DataFrame with the rebuilt column labels.
teledf2 = pd.DataFrame(data = teledf_trans, columns=trans_col)

In [26]:
# Preview the result; formerly integer columns (e.g. 'Account length') now show as floats.
teledf2.head()

Unnamed: 0,Account length,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,...,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Area code_408,Area code_415,Area code_510
0,128.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,107.0,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,137.0,0.0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,84.0,0.0,299.4,71.0,50.9,61.9,88.0,5.26,196.9,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,75.0,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [28]:
# Write the transformed frame to CSV, omitting the row index.
teledf2.to_csv('data/teledf.csv', index = False)