# Data Prep

Building off of the data prep notebook from Assignment 4.
* Initial data prep
* [Column Transformation](#Column-Transformation)
* [Feature selection](#Feature-Selection)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['figure.facecolor']='w'

In [2]:

teledf = pd.read_csv('data/telcomarketing.csv')

In [3]:
#basic discovery

teledf.sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
1362,AZ,100,415,No,No,0,188.5,152,32.05,148.3,115,12.61,179.8,88,8.09,15.2,5,4.1,2,False
1806,KS,148,510,No,No,0,239.3,84,40.68,195.7,85,16.63,232.6,104,10.47,10.9,3,2.94,1,False
599,NC,27,408,No,No,0,201.2,128,34.2,227.2,100,19.31,145.8,91,6.56,8.4,3,2.27,2,False
484,MI,53,415,No,No,0,57.5,95,9.78,265.5,131,22.57,244.3,128,10.99,11.6,6,3.13,3,False
1530,NY,142,415,No,No,0,145.4,93,24.72,209.1,98,17.77,214.0,96,9.63,10.9,1,2.94,1,False


In [4]:
teledf.shape

(3333, 20)

In [5]:
teledf.nunique()

State                       51
Account length             212
Area code                    3
International plan           2
Voice mail plan              2
Number vmail messages       46
Total day minutes         1667
Total day calls            119
Total day charge          1667
Total eve minutes         1611
Total eve calls            123
Total eve charge          1440
Total night minutes       1591
Total night calls          120
Total night charge         933
Total intl minutes         162
Total intl calls            21
Total intl charge          162
Customer service calls      10
Churn                        2
dtype: int64

In [6]:
teledf.dtypes

State                      object
Account length              int64
Area code                   int64
International plan         object
Voice mail plan            object
Number vmail messages       int64
Total day minutes         float64
Total day calls             int64
Total day charge          float64
Total eve minutes         float64
Total eve calls             int64
Total eve charge          float64
Total night minutes       float64
Total night calls           int64
Total night charge        float64
Total intl minutes        float64
Total intl calls            int64
Total intl charge         float64
Customer service calls      int64
Churn                        bool
dtype: object

In [7]:
teledf.isna().sum()

State                     0
Account length            0
Area code                 0
International plan        0
Voice mail plan           0
Number vmail messages     0
Total day minutes         0
Total day calls           0
Total day charge          0
Total eve minutes         0
Total eve calls           0
Total eve charge          0
Total night minutes       0
Total night calls         0
Total night charge        0
Total intl minutes        0
Total intl calls          0
Total intl charge         0
Customer service calls    0
Churn                     0
dtype: int64

In [8]:
#QC

teledf.sample(5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
1746,OK,180,415,No,No,0,143.5,121,24.4,189.3,111,16.09,174.9,82,7.87,8.8,5,2.38,3,False
2282,WV,87,415,No,No,0,124.3,91,21.13,173.4,105,14.74,256.3,109,11.53,7.5,5,2.03,3,False
2559,ID,103,415,No,No,0,174.7,151,29.7,148.0,56,12.58,168.2,109,7.57,15.8,3,4.27,6,True
666,AL,102,408,No,No,0,224.7,81,38.2,129.4,112,11.0,167.6,109,7.54,15.8,6,4.27,1,False
1472,IL,116,510,No,No,0,164.6,110,27.98,270.6,103,23.0,230.4,109,10.37,8.0,3,2.16,0,False


## Column Transformation

In [9]:
cat_var = ['State','Area code']
bin_var = ['International plan', 'Voice mail plan']
target_var = ['Churn']
non_cont = cat_var + bin_var + target_var

cont_var = list(teledf.drop(columns = non_cont).columns)

In [10]:
#QC

teledf[cat_var].dtypes, teledf[cont_var].dtypes

(State        object
 Area code     int64
 dtype: object, Account length              int64
 Number vmail messages       int64
 Total day minutes         float64
 Total day calls             int64
 Total day charge          float64
 Total eve minutes         float64
 Total eve calls             int64
 Total eve charge          float64
 Total night minutes       float64
 Total night calls           int64
 Total night charge        float64
 Total intl minutes        float64
 Total intl calls            int64
 Total intl charge         float64
 Customer service calls      int64
 dtype: object)

In [11]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [12]:
data_transformer = ColumnTransformer(transformers = [
    ('cont', 'passthrough', cont_var),
    ('target', 'passthrough', target_var),
    ('binary', OrdinalEncoder(), bin_var),
    ('nominal', OneHotEncoder(sparse = False), cat_var)],
                                      remainder = 'drop')

In [13]:

data_transformer.fit(teledf)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cont', 'passthrough',
                                 ['Account length', 'Number vmail messages',
                                  'Total day minutes', 'Total day calls',
                                  'Total day charge', 'Total eve minutes',
                                  'Total eve calls', 'Total eve charge',
                                  'Total night minutes', 'Total night calls',
                                  'Total night charge', 'Total intl minutes',
                                  'Tot...ls', 'Total intl charge',
                                  'Customer service calls']),
                                ('target', 'passthrough', ['Churn']),
                                ('binary',
                                 OrdinalEncoder(categories='auto',
                                                dtype=<class 'num

In [14]:
nom_name=data_transformer.named_transformers_['nominal'].categories_

In [15]:

transformed_nomcat = []

for col, name in zip(cat_var, nom_name):
    for i in name:
        transformed_nomcat.append('_'.join([col,str(i)]))

In [16]:

transformed_nomcat

['State_AK',
 'State_AL',
 'State_AR',
 'State_AZ',
 'State_CA',
 'State_CO',
 'State_CT',
 'State_DC',
 'State_DE',
 'State_FL',
 'State_GA',
 'State_HI',
 'State_IA',
 'State_ID',
 'State_IL',
 'State_IN',
 'State_KS',
 'State_KY',
 'State_LA',
 'State_MA',
 'State_MD',
 'State_ME',
 'State_MI',
 'State_MN',
 'State_MO',
 'State_MS',
 'State_MT',
 'State_NC',
 'State_ND',
 'State_NE',
 'State_NH',
 'State_NJ',
 'State_NM',
 'State_NV',
 'State_NY',
 'State_OH',
 'State_OK',
 'State_OR',
 'State_PA',
 'State_RI',
 'State_SC',
 'State_SD',
 'State_TN',
 'State_TX',
 'State_UT',
 'State_VA',
 'State_VT',
 'State_WA',
 'State_WI',
 'State_WV',
 'State_WY',
 'Area code_408',
 'Area code_415',
 'Area code_510']

In [17]:
teledf_trans = data_transformer.transform(teledf)

In [18]:
trans_col = cont_var + target_var + bin_var  + transformed_nomcat

In [19]:
teledf2 = pd.DataFrame(data = teledf_trans, columns=trans_col)

In [33]:
teledf2.head()

Unnamed: 0,Account length,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,...,State_UT,State_VA,State_VT,State_WA,State_WI,State_WV,State_WY,Area code_408,Area code_415,Area code_510
0,128.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,107.0,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,137.0,0.0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,84.0,0.0,299.4,71.0,50.9,61.9,88.0,5.26,196.9,89.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,75.0,0.0,166.7,113.0,28.34,148.3,122.0,12.61,186.9,121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
teledf2.to_csv('data/teledf.csv', index = False)

# Feature Selection

* Variance Threshold
* Correlation
* ANOVA


In [22]:
# variance threshold

from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

In [35]:
# categorical variables

cat_vt = VarianceThreshold(threshold=0.1)

cat_var_df = teledf2[transformed_nomcat + bin_var]
cat_var_df.head()


Unnamed: 0,State_AK,State_AL,State_AR,State_AZ,State_CA,State_CO,State_CT,State_DC,State_DE,State_FL,...,State_VT,State_WA,State_WI,State_WV,State_WY,Area code_408,Area code_415,Area code_510,International plan,Voice mail plan
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [36]:
cat_vt.fit_transform(cat_var_df)
cat_vt.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True,  True,  True,
       False,  True])

In [65]:
vt_features = cat_var_df.loc[:,list(cat_vt.get_support())]
vt_features = list(vt_features.columns)
vt_features

['Area code_408', 'Area code_415', 'Area code_510', 'Voice mail plan']

In [38]:
scaler = MinMaxScaler()

tele_cont = teledf2[cont_var]

tele_cont_scaled = scaler.fit_transform(tele_cont)

In [44]:
#vt_cont = VarianceThreshold(threshold = 0.1)

#vt_cont.fit_transform(tele_cont_scaled)

#none meet threshold

In [48]:
# correlation


top_5_corr = teledf2.corr()['Churn'].abs().sort_values(ascending = False)[:5]
list(top_5_corr.index), top_5_corr

(['Churn',
  'International plan',
  'Customer service calls',
  'Total day minutes',
  'Total day charge'],
 Churn                     1.000000
 International plan        0.259852
 Customer service calls    0.208750
 Total day minutes         0.205151
 Total day charge          0.205151
 Name: Churn, dtype: float64)

In [67]:
corr_features = list(top_5_corr.index)

In [49]:
# ANOVA

from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [52]:

X_ANOVA = tele_cont
y_ANOVA = teledf2.Churn

In [53]:

X_ANOVA_train, X_ANOVA_test, y_ANOVA_train, y_ANOVA_test = train_test_split(X_ANOVA, y_ANOVA, 
                                                                            test_size = 0.2, random_state = 42)

In [54]:
anovascaler = StandardScaler()

X_ANOVA_train_scaled = anovascaler.fit_transform(X_ANOVA_train)
X_ANOVA_test_scaled = anovascaler.transform(X_ANOVA_test)

In [55]:
anovaselector = SelectKBest(score_func = f_classif)
anovaselector.fit(X_ANOVA_train_scaled, y_ANOVA_train)

SelectKBest(k=10, score_func=<function f_classif at 0x00000281C658A840>)

In [60]:
tele_anova = tele_cont.loc[:,list(anovaselector.get_support())]

In [66]:
anova_features = list(tele_anova.columns)

In [62]:
anovaselector.pvalues_.round(3)

array([0.464, 0.   , 0.   , 0.127, 0.   , 0.   , 0.722, 0.   , 0.283,
       0.848, 0.283, 0.   , 0.003, 0.   , 0.   ])

In [63]:
anovaselector.get_support()

array([False,  True,  True,  True,  True,  True, False,  True, False,
       False, False,  True,  True,  True,  True])

In [68]:
teledf_final = teledf2[vt_features + corr_features + anova_features]
teledf_final.head()

Unnamed: 0,Area code_408,Area code_415,Area code_510,Voice mail plan,Churn,International plan,Customer service calls,Total day minutes,Total day charge,Number vmail messages,Total day minutes.1,Total day calls,Total day charge.1,Total eve minutes,Total eve charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls.1
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,265.1,45.07,25.0,265.1,110.0,45.07,197.4,16.78,10.0,3.0,2.7,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,161.6,27.47,26.0,161.6,123.0,27.47,195.5,16.62,13.7,3.0,3.7,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,243.4,41.38,0.0,243.4,114.0,41.38,121.2,10.3,12.2,5.0,3.29,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,2.0,299.4,50.9,0.0,299.4,71.0,50.9,61.9,5.26,6.6,7.0,1.78,2.0
4,0.0,1.0,0.0,0.0,0.0,1.0,3.0,166.7,28.34,0.0,166.7,113.0,28.34,148.3,12.61,10.1,3.0,2.73,3.0


In [69]:
teledf_final.to_csv('data/teledf_final.csv')