# Feature Engineering

In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import os
import io
import time
from time import strftime, gmtime

In [3]:
# Sagemaker dependencies
from sagemaker.session import Session
from sagemaker.session import Session
from sagemaker import get_execution_role
import sagemaker
import sys
import boto3
# Key prefix inside the default bucket; used later to build the feature group's s3_uri.
prefix = 'internet-churn-project'
# Execution role; passed later as role_arn when the feature group is created.
role = get_execution_role()
# Region of the default boto3 session (overwritten further down with the SageMaker session's region).
region = boto3.Session().region_name

# boto3 session shared by the two clients created below.
boto_session = boto3.Session(region_name=region)

# Clients for the "sagemaker" and "sagemaker-featurestore-runtime" services.
sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

# SageMaker session wired to the Feature Store runtime client; the FeatureGroup below is bound to it.
feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,
)
# Separate default SageMaker session, used here only for the region and default bucket lookups.
sagemaker_session = sagemaker.Session()
# NOTE: `region` is reassigned here; the S3 client below uses this value.
region = sagemaker_session.boto_region_name
s3_bucket_name = sagemaker_session.default_bucket()
s3_client = boto3.client("s3", region_name=region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [4]:
# Show the kernel's working directory; the relative 'data/...' path read below resolves against it.
os.getcwd()

'/root/AAI-540-Internet-Churn-Project'

In [5]:
# Read the raw internet-service churn data (path is relative to the working directory)
# and preview the first rows.
raw_csv_path = 'data/internet_service_churn.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,11.95,25,0.14,0,8.4,2.3,0,0
1,18,0,0,8.22,0,,0,0.0,0.0,0,1
2,23,1,0,8.91,16,0.0,0,13.7,0.9,0,1
3,27,0,0,6.87,21,,1,0.0,0.0,0,1
4,34,0,0,6.39,0,,0,0.0,0.0,0,1


In [6]:
# The source file misspells the column as 'reamining_contract': correct the name and
# store its values as strings (the discretization step below works on the string form).
df = (
    df
    .rename(columns={'reamining_contract': 'remaining_contract'})
    .astype({'remaining_contract': str})
)

In [7]:
# Drop every row that contains a negative value in any numeric column.
# `numeric_only=True` makes the exclusion of non-numeric columns explicit:
# `remaining_contract` was cast to str above, and relying on pandas to silently skip it
# in a row-wise min() is deprecated on pandas 1.x and raises TypeError on pandas >= 2.0.
non_negative_rows = df.min(axis=1, numeric_only=True) >= 0
# .copy() hands later cells an independent frame, so modifying it does not act on a slice.
df = df[non_negative_rows].copy()

  df = df[df[df.columns].min(axis=1) >= 0]


In [8]:
# Discretize remaining contract length into labelled buckets.
# Vectorised replacement for the former per-row loop, which called a whole-column
# `replace(..., inplace=True)` for every row and hid any error behind a bare `except`.
contract_raw = df['remaining_contract']
contract_years = pd.to_numeric(contract_raw, errors='coerce')

# Half-open bins [0, 1), [1, 2), [2, 3) -- the same boundaries the loop used.
contract_bucket = pd.cut(
    contract_years,
    bins=[0, 1, 2, 3],
    right=False,
    labels=['0-1 years', '1-2 years', '2-3 years'],
).astype(object)

# Anything outside [0, 3) or not parseable keeps its original text (as before);
# missing values (NaN, or the string 'nan' produced by the earlier str cast) become 'no contract'.
contract_missing = contract_raw.isna() | (contract_raw == 'nan')
df = df.assign(
    remaining_contract=contract_bucket
    .where(contract_bucket.notna(), contract_raw)
    .mask(contract_missing, 'no contract')
)

df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,remaining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,11.95,25,0-1 years,0,8.4,2.3,0,0
1,18,0,0,8.22,0,no contract,0,0.0,0.0,0,1
2,23,1,0,8.91,16,0-1 years,0,13.7,0.9,0,1
3,27,0,0,6.87,21,no contract,1,0.0,0.0,0,1
4,34,0,0,6.39,0,no contract,0,0.0,0.0,0,1


In [9]:
# Sanity check: number of rows in each remaining-contract bucket after discretization.
df['remaining_contract'].value_counts()

0-1 years      31707
no contract    21572
1-2 years      18818
2-3 years        176
Name: remaining_contract, dtype: int64

In [10]:
# Impute missing download/upload averages with each column's own median.
speed_cols = ['download_avg', 'upload_avg']
speed_medians = df[speed_cols].median()
df[speed_cols] = df[speed_cols].fillna(speed_medians)

In [11]:
# One-hot encode the contract bucket into integer indicator columns, then preview.
df = pd.get_dummies(data=df, columns=['remaining_contract'], dtype=int)
df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,service_failure_count,download_avg,upload_avg,download_over_limit,churn,remaining_contract_0-1 years,remaining_contract_1-2 years,remaining_contract_2-3 years,remaining_contract_no contract
0,15,1,0,11.95,25,0,8.4,2.3,0,0,1,0,0,0
1,18,0,0,8.22,0,0,0.0,0.0,0,1,0,0,0,1
2,23,1,0,8.91,16,0,13.7,0.9,0,1,1,0,0,0
3,27,0,0,6.87,21,1,0.0,0.0,0,1,0,0,0,1
4,34,0,0,6.39,0,0,0.0,0.0,0,1,0,0,0,1


In [12]:
# Swap the spaces in the dummy-column names for underscores ahead of feature group creation.
dummy_column_renames = {
    'remaining_contract_0-1 years': 'remaining_contract_0-1_years',
    'remaining_contract_1-2 years': 'remaining_contract_1-2_years',
    'remaining_contract_2-3 years': 'remaining_contract_2-3_years',
    'remaining_contract_no contract': 'remaining_contract_no_contract',
}
df = df.rename(columns=dummy_column_renames)

In [13]:
# List the final column names; this frame is what is later passed to load_feature_definitions.
df.columns

Index(['id', 'is_tv_subscriber', 'is_movie_package_subscriber',
       'subscription_age', 'bill_avg', 'service_failure_count', 'download_avg',
       'upload_avg', 'download_over_limit', 'churn',
       'remaining_contract_0-1_years', 'remaining_contract_1-2_years',
       'remaining_contract_2-3_years', 'remaining_contract_no_contract'],
      dtype='object')

In [14]:
# Persist the cleaned, one-hot-encoded frame to 'clean_churn.csv' in the working directory.
# NOTE(review): index=False is not passed, so the row index is presumably written as an extra
# unnamed first column -- confirm that downstream readers expect it.
df.to_csv('clean_churn.csv')

In [None]:
# Writing cleaned DF to csv
#set(df['churn'])
#df.to_csv("data/internet_churn_cleaned.csv")

In [None]:
# USING LAB 3 CODE FOR FEATURE STORE CREATION

In [None]:
# Feature group name: fixed stem plus a UTC day-hour-minute-second stamp taken at run time.
creation_stamp = strftime("%d-%H-%M-%S", gmtime())
churn_feature_group_name = f"churn-feature-group-{creation_stamp}"

In [None]:
from sagemaker.feature_store.feature_group import FeatureGroup

# Client-side handle for the feature group, bound to the Feature Store-enabled session.
# The group itself is only created by the later .create() call.
churn_feature_group = FeatureGroup(
    name=churn_feature_group_name, sagemaker_session=feature_store_session
)

In [None]:
import time

# Current Unix time rounded to whole seconds; used below as the EventTime value for every record.
current_time_sec = round(time.time())


def cast_object_to_string(data_frame):
    """Convert every object-dtype column of ``data_frame`` to pandas' ``string`` dtype.

    The frame is modified in place and nothing is returned. Columns whose dtype
    is not ``object`` are left untouched.
    """
    object_columns = [
        column for column, dtype in data_frame.dtypes.items() if dtype == "object"
    ]
    for column in object_columns:
        as_text = data_frame[column].astype("str")
        data_frame[column] = as_text.astype("string")


# Cast object dtype to string. The SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(df)

# Record identifier and event time feature names.
record_identifier_feature_name = "id"
event_time_feature_name = "EventTime"

# Append the EventTime feature as a float column holding the same timestamp for every row.
# A scalar is assigned on purpose: a freshly built pd.Series carries a 0..n-1 index and is
# aligned on labels during assignment, and df's index has gaps after the earlier row
# filtering, so rows whose label falls outside 0..n-1 would silently receive NaN.
df[event_time_feature_name] = float(current_time_sec)

# Load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
churn_feature_group.load_feature_definitions(data_frame=df)
# output is suppressed


In [None]:
def wait_for_feature_group_creation_complete(feature_group):
    """Poll ``feature_group`` every 5 seconds until it leaves the "Creating" status.

    Prints a progress line per poll and a confirmation once the status is
    "Created". Raises ``RuntimeError`` if polling ends on any other status.
    """

    def current_status():
        return feature_group.describe().get("FeatureGroupStatus")

    status = current_status()
    while status == "Creating":
        print("Waiting for Feature Group Creation")
        time.sleep(5)
        status = current_status()

    if status == "Created":
        print(f"FeatureGroup {feature_group.name} successfully created.")
        return
    raise RuntimeError(f"Failed to create feature group {feature_group.name}")


# Create the feature group: its S3 location is the project prefix inside the default bucket,
# and the online store is enabled.
offline_store_s3_uri = f"s3://{s3_bucket_name}/{prefix}"
churn_feature_group.create(
    s3_uri=offline_store_s3_uri,
    record_identifier_name=record_identifier_feature_name,
    event_time_feature_name=event_time_feature_name,
    role_arn=role,
    enable_online_store=True,
)

# Block until the group reports "Created"; any other final status raises RuntimeError.
wait_for_feature_group_creation_complete(churn_feature_group)

In [None]:
# Display the feature group's description (includes FeatureGroupStatus and OfflineStoreConfig, read elsewhere in this notebook).
churn_feature_group.describe()

In [None]:
# Ingest all rows of df into the feature group with 3 workers.
# NOTE(review): wait=True presumably blocks until ingestion finishes -- confirm against the SDK docs.
churn_feature_group.ingest(data_frame=df, max_workers=3, wait=True)

In [None]:
# Look up the caller's AWS account id (printed for reference only; not used below).
account_id = boto3.client("sts").get_caller_identity()["Account"]
print(account_id)

# S3 URI the offline store resolved to for this feature group.
churn_feature_group_resolved_output_s3_uri = (
    churn_feature_group.describe()
    .get("OfflineStoreConfig")
    .get("S3StorageConfig")
    .get("ResolvedOutputS3Uri")
)

# Strip the bucket part so only the key prefix remains for the listing call.
churn_feature_group_s3_prefix = churn_feature_group_resolved_output_s3_uri.replace(
    f"s3://{s3_bucket_name}/", ""
)

# Poll S3 once a minute until more than one object exists under that prefix.
offline_store_contents = None
while offline_store_contents is None:
    objects_in_bucket = s3_client.list_objects(
        Bucket=s3_bucket_name, Prefix=churn_feature_group_s3_prefix
    )
    if "Contents" in objects_in_bucket and len(objects_in_bucket["Contents"]) > 1:
        offline_store_contents = objects_in_bucket["Contents"]
    else:
        print("Waiting for data in offline store...\n")
        # Use time.sleep: a bare `sleep` is never imported in this notebook
        # (only `time`, `strftime` and `gmtime` are), so it raised NameError.
        time.sleep(60)

print("Data available.")

# Train/test/validation split (run after the feature store has been created)

In [None]:
# Split into train / test / validation sets (80 / 10 / 10).
# Besides the target, `id` (record identifier) and `EventTime` (added only when the feature
# store cells have run) are excluded: they are not predictors, and leaving them in makes the
# scaled array wider than the 12 feature names assigned to it afterwards.
non_feature_cols = ['churn', 'id', 'EventTime']
X = df.drop(columns=non_feature_cols, errors='ignore')
y = df['churn']

# Carve off 20 % first, then halve that hold-out into test and validation.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=24
)
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=24
)

In [None]:
# Standardize the continuous columns; all other columns pass through unchanged
# (scaled columns come first in the output, passthrough columns after them).
cols_scale = ['subscription_age','bill_avg','service_failure_count','download_avg','upload_avg','download_over_limit']
SS = ColumnTransformer([('scaler',StandardScaler(),cols_scale)],remainder='passthrough')

# Fit on the training split only, then reuse those statistics for test and validation.
# Re-fitting on each split would scale every split differently and leak hold-out
# statistics into the evaluation data.
X_train = SS.fit_transform(X_train)
X_test = SS.transform(X_test)
X_val = SS.transform(X_val)

In [None]:
# Column order of the transformed arrays: the six scaled columns first,
# followed by the passthrough columns.
new_cols = [
    'subscription_age', 'bill_avg', 'service_failure_count',
    'download_avg', 'upload_avg', 'download_over_limit',
    'is_tv_subscriber', 'is_movie_package_subscriber',
    'remaining_contract_0-1_years', 'remaining_contract_1-2_years',
    'remaining_contract_2-3_years', 'remaining_contract_no_contract',
]

# Wrap each transformed array back into a labelled DataFrame.
X_train, X_test, X_val = (
    pd.DataFrame(split_values, columns=new_cols)
    for split_values in (X_train, X_test, X_val)
)
X_train.head()