# Should you question an invoice sent by a supplier?


For updates on how SageMaker or AWS now behaves differently from the notebook code, please refer to https://livebook.manning.com/#!/book/machine-learning-for-business/chapter-5/v-5/137

## Part 1: Load and examine the data

To run the code in the notebook cell, change the value of data_bucket from 'ie-mlforbusiness-01' to the name of the S3 bucket holding your data, then click into the cell and press Ctrl+Enter.

In [None]:
# Location of the input data; the dataset is read from
# s3://<data_bucket>/<subfolder>/<dataset>.
data_bucket = 'ie-mlforbusiness-01'  # S3 bucket name
subfolder = 'ch05'                   # key prefix inside the bucket
dataset = 'activities.csv'           # file name of the dataset

In [None]:
import pandas as pd
import boto3
import s3fs
import sagemaker
from sklearn.model_selection import train_test_split
import json
import csv
from time import sleep

import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

# Execution role of this notebook; it is passed to the estimator in Part 4.
role = sagemaker.get_execution_role()
# S3 filesystem handle created with anonymous access switched off.
s3 = s3fs.S3FileSystem(anon=False)

In [None]:
# Read the dataset straight from S3 into a DataFrame.
dataset_uri = f's3://{data_bucket}/{subfolder}/{dataset}'
df = pd.read_csv(dataset_uri)

# Show three sample rows (positions 5, 6 and 7).
display(df.iloc[5:8])

In [None]:
# Count entries with and without errors.
# 'Error' is the label column that the later cells split on and evaluate against.
print(df['Error'].value_counts())

In [None]:
# Summary statistics: the total row count, followed by the frequency of every
# value in the matter-type, resource and activity columns.

print(f'Number of rows in dataset: {df.shape[0]}')

column_headings = (
    ('Matter types:', 'Matter Type'),
    ('Resources:', 'Resource'),
    ('Activities:', 'Activity'),
)
for heading, column in column_headings:
    print()  # blank line between sections
    print(heading)
    print(df[column].value_counts())

## Part 2: Get the data into the right shape

In [None]:
# Transformation of the data set: one-hot encoding.
# Each listed categorical column is replaced by one indicator column per
# distinct value.
categorical_columns = ['Matter Type', 'Resource', 'Activity']
encoded_df = pd.get_dummies(df, columns=categorical_columns)

# Preview the first three encoded rows.
encoded_df.head(3)

## Part 3: Create training and validation datasets

In [None]:
# Create the training and validation data sets.
# 20% of the rows are held out for validation; random_state pins the shuffle.
# The two label outputs of train_test_split are discarded because 'Error' is
# still a column of the returned frames.
train_df, val_df, _, _ = train_test_split(
    encoded_df,
    encoded_df['Error'],
    test_size=0.2,
    random_state=0,
)

# Feature-only versions: the label and the 'Firm Name' column are removed.
excluded_columns = ['Error', 'Firm Name']
train_df_no_result = train_df.drop(columns=excluded_columns)
val_df_no_result = val_df.drop(columns=excluded_columns)

for frame, purpose in ((train_df, 'training'), (val_df, 'validation')):
    print(f'{frame.shape[0]} rows in {purpose} data')

## Part 4: Train the model



In [None]:
# Import RandomCutForest and train it on the feature-only training data.

from sagemaker import RandomCutForest

session = sagemaker.Session()

# SageMaker Python SDK v2 names these arguments `instance_count` and
# `instance_type`; the `train_*` spellings used previously are the deprecated
# v1 names. The serializer classes imported in Part 5 already require SDK v2.
rcf = RandomCutForest(
    role=role,
    instance_count=1,
    instance_type='ml.m4.xlarge',  # instance type for the training job
    # S3 prefix that record_set() uploads the training records to
    data_location=f's3://{data_bucket}/{subfolder}/',
    # S3 prefix for the model output
    output_path=f's3://{data_bucket}/{subfolder}/output',
    # Number of samples per tree; chosen by the author as a middle ground
    # between speed and size.
    num_samples_per_tree=100,
    # Number of trees; the author set this from the error rate (2% = 1/50).
    # NOTE(review): AWS's RCF guidance relates the anomaly ratio to
    # num_samples_per_tree rather than num_trees -- confirm the intent.
    num_trees=50,
)

# Upload the training data to S3 as a record set and run the training job.
rcf.fit(rcf.record_set(train_df_no_result.values))

## Part 5: Host the model

In [None]:
# Hosting the model: clear out any earlier endpoint with the same name.

endpoint_name = 'suspicious-lines'

# Error type raised by the SageMaker API client when a request is rejected,
# e.g. because the resource to delete does not exist (the normal case on a
# first run). Anything else is allowed to surface instead of being swallowed.
client_error = session.sagemaker_client.exceptions.ClientError

# Delete the endpoint and its configuration independently, so that a leftover
# configuration is removed even when the endpoint itself is already gone.
# The previous version called `sess.delete_endpoint_config`; `sess` is not
# defined, and the bare `except` hid the resulting NameError, so the
# configuration was never deleted and the warning and wait never happened.
removed_existing = False
for delete_resource in (session.delete_endpoint, session.delete_endpoint_config):
    try:
        delete_resource(endpoint_name)
    except client_error:
        continue
    removed_existing = True

if removed_existing:
    print('Warning: Existing endpoint deleted to make way for your new endpoint.')
    # Presumably gives SageMaker time to finish the deletion before the
    # endpoint is created again in the next cell.
    sleep(30)

In [None]:
#hosting model - set resources

# Deploy the trained model behind an endpoint with the resources listed here.
hosting_options = {
    'initial_instance_count': 1,
    'instance_type': 'ml.t2.medium',
    'endpoint_name': endpoint_name,
}
rcf_endpoint = rcf.deploy(**hosting_options)

In [None]:
# Configure how the predictor encodes requests and decodes responses.
#
# The serializer/deserializer classes below come from SageMaker Python SDK v2.
# The v1-style import `from sagemaker.predictor import csv_serializer,
# json_deserializer` was removed: neither name is used in this notebook, and
# SDK v2 no longer provides them from that module, so the import made the
# whole cell fail.
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

rcf_endpoint.serializer = CSVSerializer()        # requests are sent as CSV
rcf_endpoint.deserializer = JSONDeserializer()   # responses are parsed as JSON

## Part 6: Test the model

In [None]:
# Calculate anomaly scores for every validation row.
validation_features = val_df_no_result.values
results = rcf_endpoint.predict(validation_features)

# Keep the scores in a new DataFrame.
scores_df = pd.DataFrame(results['scores'])

# Renumber the validation rows from 0 so they line up with scores_df when the
# two frames are placed side by side.
val_df = val_df.reset_index(drop=True)

# Add the scores to the validation data set.
results_df = pd.concat((val_df, scores_df), axis='columns')

# Number of rows with and without errors in the validation data
# (recorded by the author: 20791 without error, 402 with error).
results_df['Error'].value_counts()

In [None]:
# Threshold: the median score of the rows that are labelled as errors.
is_error = results_df['Error'].eq(True)
score_cutoff = results_df.loc[is_error, 'score'].median()
print(f'Score cutoff: {score_cutoff}')

# New DataFrame holding only the rows that score above the threshold.
above_cutoff = results_df['score'].gt(score_cutoff)
results_above_cutoff = results_df[above_cutoff]

# Among the rows above the threshold:
#   True  -> rows that actually represent errors
#   False -> rows flagged as anomalies that are not errors
results_above_cutoff['Error'].value_counts()

In [None]:
# Set the Prediction column to True wherever the score exceeds the threshold.
results_df['Prediction'] = results_df['score'].gt(score_cutoff)

# Show the rows that were flagged.
results_df[results_df['Prediction']]

In [None]:
#Confusion Matrix

# Cross-tabulate the actual 'Error' labels against the thresholded predictions
# and draw the counts as a heatmap.
#
# The names used here are deliberately different from `df` (the raw dataset
# loaded in Part 1) and `confusion_matrix` (the scikit-learn function imported
# at the top of the notebook), so re-running earlier cells still works after
# this cell has been executed.
outcome_df = pd.DataFrame({
    'y_Actual': results_df['Error'],
    'y_Predicted': results_df['Prediction'],
})
confusion_counts = pd.crosstab(
    outcome_df['y_Actual'],
    outcome_df['y_Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
)

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sn.heatmap(confusion_counts, annot=True, linewidths=0.5, fmt='d', ax=ax)
plt.show()

In [None]:
# Calculate precision, recall and F1 of the predictions against the labels.
# The import is repeated in this cell so both names refer to the scikit-learn
# functions when the cell runs.
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

actual_labels = results_df['Error']
predicted_labels = results_df['Prediction']
score = precision_recall_fscore_support(actual_labels, predicted_labels, average='binary')
score

In [None]:
# Alternative threshold. The assignment was left without a value, which made
# the cell a syntax error. The choice below is one reasonable option, not a
# value prescribed by the notebook.
#
# The first threshold is the median (0.5 quantile) of the scores of the rows
# labelled as errors, so at most about half of those rows can lie above it.
# Taking a lower quantile flags more of the labelled errors, at the price of
# more false alarms. Adjust `cutoff_quantile` to explore other trade-offs.
cutoff_quantile = 0.25
error_scores = results_df.loc[results_df['Error'] == True, 'score']
score_cutoff_new = error_scores.quantile(cutoff_quantile)
print(f'Score cutoff: {score_cutoff_new}')

# New DataFrame holding only the rows that score above the new threshold.
results_above_cutoff = results_df[results_df['score'] > score_cutoff_new]

# Among the rows above the threshold:
#   True  -> rows that actually represent errors
#   False -> rows flagged as anomalies that are not errors
results_above_cutoff['Error'].value_counts()

In [None]:
# Set the Prediction column to True wherever the score exceeds the new threshold.
results_df['Prediction'] = results_df['score'].gt(score_cutoff_new)

# Show the rows that were flagged.
results_df[results_df['Prediction']]

In [None]:
#Confusion Matrix

# Cross-tabulate the actual 'Error' labels against the predictions made with
# the new threshold and draw the counts as a heatmap.
#
# The names used here are deliberately different from `df` (the raw dataset
# loaded in Part 1) and `confusion_matrix` (the scikit-learn function imported
# at the top of the notebook), so re-running earlier cells still works after
# this cell has been executed.
outcome_df = pd.DataFrame({
    'y_Actual': results_df['Error'],
    'y_Predicted': results_df['Prediction'],
})
confusion_counts = pd.crosstab(
    outcome_df['y_Actual'],
    outcome_df['y_Predicted'],
    rownames=['Actual'],
    colnames=['Predicted'],
)

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above instead of relying on the implicit current axes.
sn.heatmap(confusion_counts, annot=True, linewidths=0.5, fmt='d', ax=ax)
plt.show()

In [None]:
# Calculate precision, recall and F1 for the predictions made with the new threshold.
# The import is repeated in this cell so both names refer to the scikit-learn
# functions when the cell runs.
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support

actual_labels = results_df['Error']
predicted_labels = results_df['Prediction']
score = precision_recall_fscore_support(actual_labels, predicted_labels, average='binary')
score

## Remove the Endpoint (optional)

Comment out this cell if you want the endpoint to remain available after "Run All".

In [None]:
# Tear down the hosted endpoint and the endpoint configuration of the same
# name, matching the two resources that the cleanup at the start of Part 5
# targets. Removing only the endpoint leaves its configuration behind.
session.delete_endpoint(endpoint_name)
session.delete_endpoint_config(endpoint_name)