# CI/CD Pipeline 

In [None]:
!pip install -U sagemaker

In [2]:
import os
import boto3
import sagemaker
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sagemaker.workflow.pipeline_context import PipelineSession

# Two session handles: a regular one for interactive calls in this notebook,
# and a PipelineSession that is handed to the pipeline step definitions below.
sagemaker_session = sagemaker.session.Session()
pipeline_session = PipelineSession()

# Context read from the interactive session / notebook environment.
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()

# Model package group identifier.
model_package_group_name = "ChurnModelPackageGroupName"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [3]:
# Load the raw churn dataset from the local data directory.
raw_csv_path = 'data/internet_service_churn.csv'
df = pd.read_csv(raw_csv_path)

# Preview the first few rows.
df.head()

Unnamed: 0,id,is_tv_subscriber,is_movie_package_subscriber,subscription_age,bill_avg,reamining_contract,service_failure_count,download_avg,upload_avg,download_over_limit,churn
0,15,1,0,11.95,25,0.14,0,8.4,2.3,0,0
1,18,0,0,8.22,0,,0,0.0,0.0,0,1
2,23,1,0,8.91,16,0.0,0,13.7,0.9,0,1
3,27,0,0,6.87,21,,1,0.0,0.0,0,1
4,34,0,0,6.39,0,,0,0.0,0.0,0,1


In [None]:
# Set aside 10% of the rows for batch transform; the remaining 90% feeds the
# pipeline. The fixed random_state keeps the split reproducible across runs.
X = df.drop(labels="churn", axis=1)
y = df["churn"]
X_df, X_batch, y_df, y_batch = train_test_split(X, y, test_size=0.1, random_state=24)

# Re-attach the label by building new frames (index-aligned) instead of
# assigning a column onto the frames returned by train_test_split.
X_df = X_df.assign(churn=y_df)
X_batch = X_batch.assign(churn=y_batch)

# Save under data/ because the upload cells read data/internet_churn.csv and
# data/batch_churn.csv. index=False stops the shuffled row index from being
# written out as an extra unnamed column.
X_df.to_csv("data/internet_churn.csv", index=False)
X_batch.to_csv("data/batch_churn.csv", index=False)

### Use the Lab 6 code as a guide

In [None]:
# Input data: local file and the S3 prefix it is uploaded under.
local_path = "data/internet_churn.csv"
base_uri = f"s3://{default_bucket}/churn"

s3 = boto3.resource("s3")

# Upload the file; the value returned by the uploader is reused below as the
# default of the InputData pipeline parameter.
input_data_uri = sagemaker.s3.S3Uploader.upload(local_path=local_path, desired_s3_uri=base_uri)
print(input_data_uri)

In [None]:
# Batch data: local hold-out file and the S3 prefix it is uploaded under.
local_path = "data/batch_churn.csv"
base_uri = f"s3://{default_bucket}/churn"

s3 = boto3.resource("s3")

# Upload the file; the value returned by the uploader is reused below as the
# default of the BatchData pipeline parameter.
batch_data_uri = sagemaker.s3.S3Uploader.upload(local_path=local_path, desired_s3_uri=base_uri)
print(batch_data_uri)

In [None]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Data locations (defaults are the URIs produced by the upload cells).
input_data = ParameterString(name="InputData", default_value=input_data_uri)
batch_data = ParameterString(name="BatchData", default_value=batch_data_uri)

# Compute settings.
processing_instance_count = ParameterInteger(
    name="ProcessingInstanceCount",
    default_value=1,
)
instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",
)

# Approval status and accuracy threshold.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)
acc_threshold = ParameterFloat(name="AccThreshold", default_value=0.75)

In [None]:
# Create the directory that will hold the pipeline scripts. Pure Python keeps
# this portable (no shell dependency); exist_ok=True makes re-runs a no-op.
os.makedirs("code", exist_ok=True)

In [None]:
# Ceate feature engineering step
%%writefile code/preprocessing.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import os
import io
import time
from time import strftime, gmtime
if __name__ == "__main__":
    base_dir = "/opt/ml/processing"

    df = pd.read_csv(
        f"{base_dir}/input/internet_churn.csv")
    
    # Fix spelling error in column
    df = df.rename(columns = {'reamining_contract':'remaining_contract'})
    df['remaining_contract'] = df['remaining_contract'].astype(str)
    
    # Fix negative values
    df = df[df[df.columns].min(axis=1) >= 0]
    
    # Discretize column
    df['remaining_contract'].replace('nan', 'no contract', inplace=True)
    for i in df['remaining_contract']:
        try:
            if float(i) >= 0 and float(i) <1:
                df['remaining_contract'].replace(i, '0-1 years', inplace=True)
            elif float(i) >= 1 and float(i) < 2:
                df['remaining_contract'].replace(i, '1-2 years', inplace=True)
            elif float(i) >= 2 and float(i)<3:
                df['remaining_contract'].replace(i, '2-3 years', inplace=True)
        except:
            continue
            
    # Fill na with column median 
    df[['download_avg','upload_avg']] = df[['download_avg','upload_avg']].fillna(df[['download_avg','upload_avg']].median())
    
    # Get dummy variables
    df = pd.get_dummies(df, columns = ['remaining_contract'],dtype = int)
    
    # Rename columns
    df= df.rename({'remaining_contract_0-1 years':'remaining_contract_0-1_years',
                  'remaining_contract_1-2 years': 'remaining_contract_1-2_years',
                  'remaining_contract_2-3 years': 'remaining_contract_2-3_years',
                  'remaining_contract_no contract':'remaining_contract_no_contract'},axis = 1)
    train, validation, test = np.split(df, [int(0.7 * len(df)), int(0.85 * len(df))])

    pd.DataFrame(train).to_csv(f"{base_dir}/train/train.csv", header=False, index=False)
    pd.DataFrame(validation).to_csv(
        f"{base_dir}/validation/validation.csv", header=False, index=False
    )
    pd.DataFrame(test).to_csv(f"{base_dir}/test/test.csv", header=False, index=False)

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor


# Version passed to the scikit-learn processor.
framework_version = "1.2-1"

# Processor used to run code/preprocessing.py; the instance count comes from
# the ProcessingInstanceCount pipeline parameter.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type="ml.m5.xlarge",
    base_job_name="sklearn-churn-process",
    sagemaker_session=pipeline_session,
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Single input: the dataset referenced by the InputData pipeline parameter.
processing_inputs = [
    ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
]

# One output per split; each is read from the matching directory under
# /opt/ml/processing.
processing_outputs = [
    ProcessingOutput(output_name=split_name, source=f"/opt/ml/processing/{split_name}")
    for split_name in ("train", "validation", "test")
]

processor_args = sklearn_processor.run(
    inputs=processing_inputs,
    outputs=processing_outputs,
    code="code/preprocessing.py",
)

step_process = ProcessingStep(name="ChurnProcess", step_args=processor_args)

In [None]:
# Training step definition
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

# S3 prefix under which the trained model artifacts are written.
model_path = f"s3://{default_bucket}/ChurnTrain"

# Container image of the built-in XGBoost algorithm.
image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.7-1",
    instance_type="ml.m5.xlarge",
)

# The estimator is bound to `pipeline_session` (the previously referenced
# `sess` was never defined) so that fit() returns step arguments for the
# TrainingStep instead of launching a training job immediately.
xgb_train = Estimator(
    image_uri=image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size=50,
    input_mode="File",
    output_path=model_path,
    sagemaker_session=pipeline_session,
)
# Keep the earlier variable name available for any cell that refers to it.
sm_estimator = xgb_train

xgb_train.set_hyperparameters(
    objective="binary:logistic",
    num_round=50,
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.7,
)

# Feed the train/validation outputs of the processing step to the matching
# training channels.
train_args = xgb_train.fit(
    inputs={
        channel: TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "validation")
    }
)

In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep


# Pipeline training step built from the arguments returned by xgb_train.fit().
step_train = TrainingStep(name="ChurnTrain", step_args=train_args)