In [3]:
import logging
import boto3
from botocore.exceptions import ClientError
import sagemaker

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

import pickle
import os

import progressbar
from sys import getsizeof

In [60]:
# `sklearn.externals.joblib` is a deprecated alias that newer scikit-learn
# releases no longer ship; prefer the standalone package and only fall back
# to the bundled copy on old environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [4]:
session = boto3.Session(profile_name='default')

In [6]:
sagemaker_session = sagemaker.Session(boto_session=session)

In [7]:
bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

In [8]:
role

'arn:aws:iam::113516067754:role/service-role/AmazonSageMaker-ExecutionRole-20200513T111076'

In [9]:
bucket

'sagemaker-us-east-2-113516067754'

# Data preparation

In [12]:
ls

README.md    cv_train.pkl            drugsComTest_raw.tsv   main.py
[0m[1;34mcache[0m/       drugs-dataset.ipynb     drugsComTrain_raw.tsv
cv_test.pkl  drugs-review-AWS.ipynb  [01;31mdrugsCom_raw.zip[0m


In [15]:
def read_data(train_path='drugsComTrain_raw.tsv', test_path='drugsComTest_raw.tsv'):
    """Load the raw train and test review tables from tab-separated files.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the two TSV files. The defaults are the file names this
        notebook reads from its working directory, so ``read_data()`` behaves
        exactly as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data_train, data_test)`` as parsed by ``pandas.read_csv``.
    """
    data_train = pd.read_csv(train_path, sep='\t')
    data_test = pd.read_csv(test_path, sep='\t')

    return data_train, data_test

In [16]:
data_train,data_test=read_data()

In [17]:
def df_train_test(train=None, test=None):
    """Build cleaned (condition, review) frames for the train and test splits.

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        Raw frames containing at least ``condition`` and ``review`` columns.
        When omitted, the notebook-level ``data_train`` / ``data_test`` are
        used, which is the original behaviour.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New frames; the inputs are not modified.
    """
    if train is None:
        train = data_train
    if test is None:
        test = data_test

    def _clean(frame):
        # Keep the two modelling columns and drop rows missing either one.
        subset = frame[['condition', 'review']].dropna()
        # Decode the HTML-escaped apostrophe. Literal (non-regex) replacement,
        # done column-wise instead of one Python regex call per row.
        subset = subset.assign(
            review=subset['review'].str.replace('&#039;', "'", regex=False)
        )
        # Discard rows whose condition contains an HTML closing tag
        # (presumably scraping residue rather than a real condition name).
        has_markup = subset['condition'].str.contains('</span>', regex=False)
        return subset[~has_markup]

    return _clean(train), _clean(test)

In [18]:
train_df,test_df=df_train_test()

In [19]:
def reduce_conditions(value, train=None, test=None):
    """Collapse infrequent conditions into a single ``'other'`` label.

    Adds a ``condcopy`` column to both frames **in place** and prints a short
    summary. A condition keeps its name when it occurs more than ``value``
    times in the training frame; every other condition becomes ``'other'``.
    Test conditions that are not among the resulting training labels are also
    mapped to ``'other'``.

    Parameters
    ----------
    value : int
        Frequency threshold (strictly greater-than) measured on the training
        frame.
    train, test : pandas.DataFrame, optional
        Frames with a ``condition`` column. When omitted, the notebook-level
        ``train_df`` / ``test_df`` are used, which is the original behaviour.

    Returns
    -------
    None
    """
    if train is None:
        train = train_df
    if test is None:
        test = test_df

    counts = train['condition'].value_counts()
    frequent = counts.index[counts > value]

    # Whole-column assignment via `where`; this replaces a per-row lookup.
    train['condcopy'] = train['condition'].where(
        train['condition'].isin(frequent), 'other'
    )

    # The previous chained assignment (`test['condcopy'][mask] = ...`) may
    # write to a temporary copy and is a no-op under pandas copy-on-write.
    train_labels = set(train['condcopy'])
    test['condcopy'] = test['condition'].where(
        test['condition'].isin(train_labels), 'other'
    )

    len_train = len(set(train['condcopy']))
    len_test = len(set(test['condcopy']))

    # Mean of a boolean mask gives the share of 'other' and is simply 0 when
    # no row was collapsed (indexing value_counts()['other'] raised KeyError).
    other_train = (train['condcopy'] == 'other').mean() * 100
    other_test = (test['condcopy'] == 'other').mean() * 100
    print('Nr conditions Train: ', len_train, '\nNr conditions Test: ', len_test)
    print('Percentage "other", Train: ', other_train, '%')
    print('Percentate "other", Test: ', other_test, '%')

In [20]:
reduce_conditions(50)

Nr conditions Train:  225 
Nr conditions Test:  225
Percentage "other", Train:  3.524182121405911 %
Percentate "other", Test:  3.505639097744361 %


### Save train_y and test_y

In [22]:
def save():
    """Pickle the reduced label columns of the train and test frames.

    Writes ``train_df.condcopy`` to ``train_y.pkl`` and ``test_df.condcopy``
    to ``test_y.pkl`` in the working directory, then prints a confirmation.
    Reads the notebook-level ``train_df`` and ``test_df``.
    """
    # `with` closes each handle, so the data is flushed to disk before
    # anything reads the files back (the bare open() calls never closed them).
    with open('train_y.pkl', 'wb') as fh:
        pickle.dump(train_df.condcopy, fh)
    with open('test_y.pkl', 'wb') as fh:
        pickle.dump(test_df.condcopy, fh)

    print('data saved.')

In [23]:
save()

data saved.


In [24]:
def data_load():
    """Load the pickled feature matrices and label columns.

    Reads ``cv_train.pkl``, ``cv_test.pkl``, ``train_y.pkl`` and
    ``test_y.pkl`` from the working directory.

    Returns
    -------
    tuple
        ``(cv_train, cv_test, train_y, test_y)`` in that order.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only use this on files
    produced by this project.
    """
    def _load(path):
        # Context manager so the handle is closed even if unpickling fails.
        with open(path, 'rb') as fh:
            return pickle.load(fh)

    cv_train = _load('cv_train.pkl')
    cv_test = _load('cv_test.pkl')

    train_y = _load('train_y.pkl')
    test_y = _load('test_y.pkl')

    return cv_train, cv_test, train_y, test_y

In [62]:
cv_train,cv_test,train_y,test_y=data_load()

# Upload Data to S3

In [11]:
prefix='drugs-review-model'

In [25]:
# exist_ok keeps this cell re-runnable: a plain `mkdir` complains once the
# directory is already there from a previous run.
os.makedirs('train_test', exist_ok=True)

In [26]:
!cp cv_train.pkl cv_test.pkl train_y.pkl test_y.pkl train_test/

In [28]:
ls train_test/

cv_test.pkl  cv_train.pkl  test_y.pkl  train_y.pkl


In [14]:
data_dir='./train_test'

In [29]:
input_data = sagemaker_session.upload_data(path=data_dir, bucket=bucket, key_prefix=prefix)

In [30]:
input_data

's3://sagemaker-us-east-2-113516067754/drugs-review-model'

# Train Model on a SageMaker Instance

In [38]:
from sagemaker.sklearn.estimator import SKLearn

In [78]:
# Entry-point script that the training job runs (the job log reports
# "Invoking user training script" for module `aws_rf`).
script_path = 'aws_rf.py'

# Estimator describing the remote scikit-learn training job.
# The hyperparameters are forwarded to the entry-point script; the loaded
# model's get_params() output later shows n_estimators=50 and max_depth=150.
# NOTE(review): `aws_rf.py` itself is not part of this notebook — confirm how
# it consumes these values.
# NOTE(review): the SDK output recorded here warns about renames in SageMaker
# Python SDK v2; `train_instance_type` looks like the pre-v2 keyword — verify
# before upgrading the SDK.
sklearn_RF = SKLearn(
    entry_point=script_path,
    train_instance_type="ml.m5.2xlarge",
    role=role,
    sagemaker_session=sagemaker_session,
    hyperparameters={'n_estimators': 50,'max_depth':150})

In [79]:
sklearn_RF.fit({'train':input_data})

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2020-07-06 14:25:05 Starting - Starting the training job...
2020-07-06 14:25:06 Starting - Launching requested ML instances......
2020-07-06 14:26:40 Downloading - Downloading input data...
2020-07-06 14:27:09 Training - Training image download completed. Training in progress....[34m2020-07-06 14:27:09,688 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2020-07-06 14:27:09,691 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-07-06 14:27:09,700 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2020-07-06 14:27:09,917 sagemaker-containers INFO     Module aws_rf does not provide a setup.py. [0m
[34mGenerating setup.py[0m
[34m2020-07-06 14:27:09,917 sagemaker-containers INFO     Generating setup.cfg[0m
[34m2020-07-06 14:27:09,917 sagemaker-containers INFO     Generating MANIFEST.in[0m
[34m2020-07-06 14:27:09,917 sagemaker-containers INFO     Installing modul

## Deploy

In [48]:
# Host the trained model on a single-instance endpoint.
# NOTE(review): in the recorded run this call failed with
# UnexpectedStatusException — the endpoint could not extract model.tar.gz
# because the archive was too large for the instance. `predictor` is therefore
# not usable as recorded; the notebook instead downloads the model and
# evaluates it locally.
predictor = sklearn_RF.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------*

UnexpectedStatusException: Error hosting endpoint sagemaker-scikit-learn-2020-07-06-11-22-47-929: Failed. Reason:  Failed to extract model data archive from URL "s3://sagemaker-us-east-2-113516067754/sagemaker-scikit-learn-2020-07-06-11-22-47-929/output/model.tar.gz". The model data archive is too large. Please reduce the size of the model data archive or move to an instance type with more memory..

In [49]:
script_path

'aws_rf.py'

# Download Model

In [50]:
s3 = boto3.client('s3')

In [51]:
bucket

'sagemaker-us-east-2-113516067754'

In [52]:
sklearn_RF.output_path

's3://sagemaker-us-east-2-113516067754/'

In [80]:
sklearn_RF.model_data

's3://sagemaker-us-east-2-113516067754/sagemaker-scikit-learn-2020-07-06-14-25-02-766/output/model.tar.gz'

In [81]:
from urllib.parse import urlparse

# Take bucket and key from the estimator's own artifact URI instead of
# hard-coding one training job's name, so this cell keeps working after the
# model is retrained (every job writes to a new, timestamped prefix).
model_uri = urlparse(sklearn_RF.model_data)
s3.download_file(model_uri.netloc, model_uri.path.lstrip('/'), 'model.tar.gz')

In [82]:
!tar -xzf model.tar.gz

# Load Model

In [83]:
rfmodel=joblib.load("model.joblib")



In [63]:
from sklearn.metrics import accuracy_score

In [64]:
def acc(model, X_train, X_test, y_train, y_test):
    """Print and return the model's accuracy on the train and test sets.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, X_test
        Feature matrices passed to ``model.predict``.
    y_train, y_test
        True labels matching the respective feature matrix.

    Returns
    -------
    tuple of (float, float)
        ``(train_accuracy, test_accuracy)`` as computed by
        ``sklearn.metrics.accuracy_score``.
    """
    preds_train = model.predict(X_train)
    # accuracy_score's documented argument order is (y_true, y_pred).
    acc_train = accuracy_score(y_train, preds_train)
    print('accuracy train done.')

    preds_test = model.predict(X_test)
    acc_test = accuracy_score(y_test, preds_test)
    print('accuracy test done.')

    # These numbers are accuracies (higher is better); the old "error" label
    # invited reading them the wrong way round.
    print('Train accuracy: ', acc_train, '\nTest accuracy: ', acc_test)

    return acc_train, acc_test

In [84]:
rfmodel.get_params()



{'bootstrap': True,
 'ccp_alpha': None,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 150,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 36,
 'verbose': 1,
 'warm_start': False}

In [85]:
acc(rfmodel,cv_train,cv_test,train_y,test_y)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   12.4s
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:   17.9s finished


accuracy train done.


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.9s


accuracy test done.
Train error:  0.9580559003874657 
Test error:  0.7800563909774436


[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:    6.5s finished


In [87]:
%xdel rfmodel

NameError: name 'rfmodel' is not defined


In [86]:
# Release the model's memory. Guarded because the `%xdel rfmodel` cell may
# already have removed the name, in which case a bare `del` raises NameError
# and stops a top-to-bottom run.
if 'rfmodel' in globals():
    del rfmodel