In [1]:
import boto3
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

# Using AWS for CRISP-DM Phases 3-5: Data Prepartion, Modeling, and Evaluation

Now that you are inside a Jupyter Notebook, we assume that most of you should be back within familiar territory. As such, this tutorial will not go into detail about these phases. Rather, we'll quickly breeze through these three phases with a focus on how to get your model prepared for Phase 6, Deployment. In that phase, we'll provide more detail on how to deploy real-time models on the AWS architecture.

Because this tutorial is focused on the cloud computing architecture rather than the Data Science, we'll use a common dataset, iris.csv, from the UCI Machine Learning Repository. [Link to the dataset](https://archive.ics.uci.edu/ml/datasets/bank+marketing). LINK WIP

In [28]:
raw = pd.read_csv('../demo_data/iris.csv')
raw.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Data Preparation

### Split training/test sets

In [29]:
from sklearn.model_selection import train_test_split
X = raw.copy()
y = X.pop('class')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Fit and store feature selector

In [42]:
# Fit a feature selector that only takes the top 3 most important variables
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
feature_selector = SelectKBest(chi2, k=3)
X_train2 = feature_selector.fit_transform(X_train, y_train)

### Transform and store training data

DEPRECATED?# Merge X_train with y_train before storing into S3
df_train = pd.concat([X_train2, y_train.reset_index(drop=True)], axis=1)

s_train = df_train.to_csv(None, index=None)

s3_resource = boto3.resource('s3')
s3_bucket = 'jakechenawspublic'
s3_key = 'tutorials/mlstack_demo/data/intermediary/post_transform.csv'
s3_resource.Object(s3_bucket, s3_key).put(Body=s_train)

## Model Training + Evaluation

### Model training

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix

In [45]:
cfr = DecisionTreeClassifier()
cfr.fit(X_train2, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

### Model evaluation

In [46]:
# run feature selector on test dataset
X_test2 = feature_selector.transform(X_test)

In [47]:
# run model on transformed test dataset
y_pred = cfr.predict(X_test2)

In [49]:
print confusion_matrix(y_test, y_pred)

[[19  0  0]
 [ 0 15  0]
 [ 0  0 16]]


### Prediction pipeline

In [13]:
import boto3
import pickle

# get pickled feature selector from s3
s3_resource = boto3.resource('s3')
s3_bucket = 'jakechenawspublic'
s3_key = 'tutorials/mlstack_demo/transforms/feature_selector.pickle'
fs_string = s3_resource.Object(s3_bucket, s3_key).get()['Body'].read()

In [14]:
# load feature selector from pickle
feature_selector = pickle.loads(fs_string)

## Store Trained Model

If this model looks good then let's store the model

In [18]:
# Pickle and store feature selector to S3
import pickle
cfr_pickle = pickle.dumps(cfr)

s3_resource = boto3.resource('s3')
s3_bucket = 'jakechenawspublic'
s3_key = 'tutorials/mlstack_demo/models/tree_cfr.pickle'
s3_resource.Object(s3_bucket, s3_key).put(Body=cfr_pickle)

{u'ETag': '"4f5e722280946cebfc62235f9da9e396"',
 'ResponseMetadata': {'HTTPHeaders': {'content-length': '0',
   'date': 'Sat, 01 Apr 2017 23:05:38 GMT',
   'etag': '"4f5e722280946cebfc62235f9da9e396"',
   'server': 'AmazonS3',
   'x-amz-id-2': 'q+cyXpx6xGO9kP/cIyX+xuyYnqSaNEtrst6/EUwHe85NNgfWJFu6v3j0nEooeoQ8p5u9TUC+O2M=',
   'x-amz-request-id': '11BE3ACF68587867'},
  'HTTPStatusCode': 200,
  'HostId': 'q+cyXpx6xGO9kP/cIyX+xuyYnqSaNEtrst6/EUwHe85NNgfWJFu6v3j0nEooeoQ8p5u9TUC+O2M=',
  'RequestId': '11BE3ACF68587867',
  'RetryAttempts': 0}}