# Collect a data set and prepare it for ML

This is a synthetic data set based on an ML competition dataset.

I have uploaded it to an S3 bucket in the same region as this SageMaker Studio domain:

https://telco-churn-seoul.s3.ap-northeast-2.amazonaws.com/churn_training.csv


In [28]:
import io

import numpy as np
import pandas as pd


In [29]:
import sagemaker
import boto3
from sagemaker import get_execution_role

# Region reported by the default boto3 session; reused for the client below.
region = boto3.Session().region_name

session = sagemaker.Session()

# Intermediate data used by the ML processes needs somewhere to live.
# The session's default bucket is used here; substitute a bucket of your
# own choosing if you prefer.
bucket = session.default_bucket()
prefix = 'sagemaker/telco-churn'

role = get_execution_role()

# Client used to interact with SageMaker AutoPilot.
sm = boto3.Session().client(
    service_name='sagemaker',
    region_name=region,
)

Use the following to load the data from S3 into a pandas DataFrame:

In [30]:
# Location of the source CSV in S3 (the same object as the URL listed at the
# top of the notebook). Named here so the source is easy to change.
SOURCE_BUCKET = 'telco-churn-seoul'
SOURCE_KEY = 'churn_training.csv'

s3 = boto3.client('s3')
obj = s3.get_object(Bucket=SOURCE_BUCKET, Key=SOURCE_KEY)

# Read the whole response body into memory and wrap the bytes in a file-like
# buffer so pandas can parse it. `io` is a standard-library module and must be
# imported before this cell runs.
df = pd.read_csv(io.BytesIO(obj['Body'].read()))

In [31]:
# Preview the first rows of the loaded frame to check the columns.
df.head()

Unnamed: 0,CustomerID,Gender,Senior Citizen,Partner,Dependents,Tenure,Phone Service,Multiple Lines,Internet Service,Online Security,...,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn,date
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,2020-05-18
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,2020-05-18
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,2020-11-03
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,2020-05-18
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,2020-05-18


# Process the data
For the sake of simplicity, we are going to process the data so that it has numeric-only fields.

In [32]:
def process_df(df):
    """Build the modelling frame from the raw churn data.

    The input frame is left untouched, so the function can safely be called
    more than once on the same frame (e.g. when a cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns "Churn" (with the value
        "Yes" marking churned customers), "CustomerID" and "date".

    Returns
    -------
    pandas.DataFrame
        A new frame in which "Churn" is 1 where the raw value was "Yes" and
        0 otherwise, "CustomerID" and "date" are removed, and the remaining
        categorical columns are expanded into indicator columns.

    Raises
    ------
    KeyError
        If "Churn", "CustomerID" or "date" is missing from ``df``.
    """
    # Encode the target on a copy (assign returns a new frame). Writing into
    # `df` directly would make a second call see 0/1 instead of "Yes"/"No"
    # and silently produce an all-zero target.
    df_model_data = df.assign(Churn=np.where(df["Churn"] == "Yes", 1, 0))
    # Remove columns that are not used as model inputs.
    # NOTE(review): an "Unnamed: 0" column, if present in the input, is kept
    # as a feature - it looks like a leftover row index from the CSV; confirm
    # whether it should be dropped as well.
    df_model_data = df_model_data.drop(["CustomerID", "date"], axis=1)
    # Convert categorical columns to indicator columns.
    df_model_data = pd.get_dummies(df_model_data)
    return df_model_data

In [33]:
# Build the processed modelling frame from the raw data.
proc = process_df(df)

In [34]:
# Preview the first rows of the processed frame.
proc.head()

Unnamed: 0,Senior Citizen,Tenure,Monthly Charges,Total Charges,Churn,Gender_Female,Gender_Male,Partner_No,Partner_Yes,Dependents_No,...,Streaming Movies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,Paperless Billing_No,Paperless Billing_Yes,Payment Method_Bank transfer (automatic),Payment Method_Credit card (automatic),Payment Method_Electronic check,Payment Method_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


# Partition the data

Break the data into partitions for training, validation, and scoring.

In [35]:
# Name of the label column; used below to stratify the splits.
target = "Churn"

In [36]:
from sklearn.model_selection import train_test_split
# Fractions of the full data set assigned to training and validation.
train_prop, valid_prop = 0.6, 0.2
# Validation is carved out of what remains after the training rows are taken,
# so its share is re-expressed as a fraction of that remainder.
valid_remainder = valid_prop / (1 - train_prop)

In [37]:
# Fixed seed so that re-running the notebook reproduces the same partitions.
split_seed = 42

# First split: training rows versus everything else. The stratification labels
# are taken from `proc`, the frame actually being split, rather than from the
# raw `df`.
train, rest = train_test_split(
    proc,
    train_size=train_prop,
    stratify=proc[target],
    random_state=split_seed,
)
# Second split: divide the remainder into validation and test partitions,
# again stratified on the target column.
valid, test = train_test_split(
    rest,
    train_size=valid_remainder,
    stratify=rest[target],
    random_state=split_seed,
)

# Write to disk

In [38]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [39]:
# Write each partition to its own CSV file under output_path, with a header
# row and without the row index.
output_path = "data"
for csv_suffix, partition in (
    ("/train.csv", train),
    ("/validation.csv", valid),
    ("/test.csv", test),
):
    partition.to_csv(output_path + csv_suffix, index=False, header=True)