# Feature Engineering

In [2]:
import sagemaker
import sys
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import os
import io
import time
from time import strftime, gmtime
from sagemaker.session import Session
from sagemaker import get_execution_role
# Project-level settings: a name prefix for this project and the execution
# role returned by the SageMaker SDK for the current environment.
prefix = 'internet-churn-project'
role = get_execution_role()

# A single SageMaker session supplies both the region name and the
# default bucket used by the rest of the notebook.
sagemaker_session = Session()
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
# Display the current working directory; the relative 'data/...' path used
# when loading the CSV is resolved against it.
os.getcwd()

'/root/AAI-540-Internet-Churn-Project'

In [4]:
# Load the internet service churn data set from the project's data folder
# (path is relative to the current working directory) and preview the
# first rows.
df = pd.read_csv('data/internet_service_churn.csv')
df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,11.95,25,0.14,0,8.4,2.3,0,0
1,18,0,0,8.22,0,,0,0.0,0.0,0,1
2,23,1,0,8.91,16,0.0,0,13.7,0.9,0,1
3,27,0,0,6.87,21,,1,0.0,0.0,0,1
4,34,0,0,6.39,0,,0,0.0,0.0,0,1


In [5]:
# Remove the 'id' column from the frame
df = df.drop(columns=['id'])

In [6]:
# Correct the misspelled column name ('reamining' -> 'remaining') and store
# the column as strings so it can be bucketed into text labels afterwards.
df = (
    df.rename(columns={'reamining_contract': 'remaining_contract'})
      .astype({'remaining_contract': str})
)

In [7]:
# Discretize remaining contract length into labelled buckets.
#
# At this point the column holds strings: numeric text for customers with a
# contract and the literal 'nan' where the value was missing.  The bucketing
# is done in one vectorized pass instead of a per-row loop of in-place
# replaces, which was slow, relied on chained in-place mutation of the
# column, and hid every error behind a bare `except`.
contract_raw = df['remaining_contract']
contract_years = pd.to_numeric(contract_raw, errors='coerce')

# Half-open bins [0, 1), [1, 2), [2, 3) -- identical to the original
# `>= lower and < upper` comparisons.
contract_bucket = pd.cut(
    contract_years,
    bins=[0, 1, 2, 3],
    right=False,
    labels=['0-1 years', '1-2 years', '2-3 years'],
)

# Anything that did not land in a bin (non-numeric text, negative values,
# values >= 3) keeps its original string; the missing-value marker 'nan'
# becomes the explicit 'no contract' category.
df['remaining_contract'] = (
    contract_bucket.astype(object)
    .where(contract_bucket.notna(), contract_raw)
    .replace('nan', 'no contract')
)

df.head()

Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,remaining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,1,0,11.95,25,0-1 years,0,8.4,2.3,0,0
1,0,0,8.22,0,no contract,0,0.0,0.0,0,1
2,1,0,8.91,16,0-1 years,0,13.7,0.9,0,1
3,0,0,6.87,21,no contract,1,0.0,0.0,0,1
4,0,0,6.39,0,no contract,0,0.0,0.0,0,1


In [8]:
# Sanity check: count how many rows fall into each remaining-contract bucket.
df['remaining_contract'].value_counts()

remaining_contract
0-1 years      31708
no contract    21572
1-2 years      18818
2-3 years        176
Name: count, dtype: int64

In [9]:
# One-hot encode the contract bucket: each category becomes its own
# integer (0/1) indicator column and the original column is replaced.
df = pd.get_dummies(
    data=df,
    columns=['remaining_contract'],
    dtype=int,
)
df.head()

Unnamed: 0,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,churn,remaining_contract_0-1 years,remaining_contract_1-2 years,remaining_contract_2-3 years,remaining_contract_no contract
0,1,0,11.95,25,0,8.4,2.3,0,0,1,0,0,0
1,0,0,8.22,0,0,0.0,0.0,0,1,0,0,0,1
2,1,0,8.91,16,0,13.7,0.9,0,1,1,0,0,0
3,0,0,6.87,21,1,0.0,0.0,0,1,0,0,0,1
4,0,0,6.39,0,0,0.0,0.0,0,1,0,0,0,1


In [10]:
# Separate the target ('churn') from the predictors, then hold out 20% of
# the rows for testing.  The fixed random_state keeps the split repeatable.
y = df['churn']
X = df.drop(columns=['churn'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=24
)

In [11]:
# Fill missing download/upload averages with the column medians.
#
# The medians are learned from the TRAINING split only and then applied to
# both splits.  Imputing the test split with its own medians would leak
# test-set statistics into preprocessing and make it inconsistent with what
# the model sees at training time.
impute_cols = ['download_avg', 'upload_avg']
train_medians = X_train[impute_cols].median()

# fillna with a Series keyed by column name only fills those columns and
# returns a new frame, avoiding assignment into a slice of the split data.
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

In [12]:
# Standardize the continuous/count features; every other column is passed
# through unchanged (and placed after the scaled columns in the output).
cols_scale = ['subscription_age','bill_avg','service_failure_count','download_avg','upload_avg','download_over_limit']
SS = ColumnTransformer([('scaler',StandardScaler(),cols_scale)],remainder='passthrough')

# Fit the scaler on the training split only, then reuse the learned
# mean/std for the test split.  Calling fit_transform on the test data
# would re-estimate the statistics from the test set, leaking information
# and putting train and test features on different scales.
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)

In [13]:
# Column labels for the transformed arrays: the six scaled features come
# first, followed by the passthrough columns (binary flags and the
# contract indicator columns, here named with underscores instead of spaces).
new_cols = [
    'subscription_age',
    'bill_avg',
    'service_failure_count',
    'download_avg',
    'upload_avg',
    'download_over_limit',
    'is_tv_subscriber',
    'is_movie_package_subscriber',
    'remaining_contract_0-1_years',
    'remaining_contract_1-2_years',
    'remaining_contract_2-3_years',
    'remaining_contract_no_contract',
]

# Wrap both transformed splits back into labelled DataFrames.
# NOTE(review): the frames get a fresh 0..n-1 index here, so they no longer
# share the row index of y_train / y_test -- confirm downstream code aligns
# features and labels by position rather than by index.
X_train, X_test = (
    pd.DataFrame(split_values, columns=new_cols)
    for split_values in (X_train, X_test)
)
X_train.head()
X_train.head()

Unnamed: 0,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,is_tv_subscriber,is_movie_package_subscriber,remaining_contract_0-1_years,remaining_contract_1-2_years,remaining_contract_2-3_years,remaining_contract_no_contract
0,2.503554,0.227845,-0.334555,-0.121946,-0.195863,-0.209692,1.0,0.0,0.0,1.0,0.0,0.0
1,0.773575,0.003701,-0.334555,0.977917,0.40514,-0.209692,1.0,0.0,0.0,1.0,0.0,0.0
2,1.283257,0.30256,-0.334555,-0.489083,-0.364566,-0.209692,0.0,0.0,0.0,0.0,0.0,1.0
3,0.146274,-0.668732,-0.334555,-0.309387,-0.248583,-0.209692,1.0,1.0,0.0,1.0,0.0,0.0
4,0.136472,-0.071014,-0.334555,-0.451905,-0.332934,-0.209692,1.0,0.0,0.0,0.0,0.0,1.0
