In [1]:
# Script mostly inspired by this tutorial: https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb

In [1]:
import os
from datetime import datetime

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.estimator import SKLearn

import pandas as pd

## Process the data

In [2]:
%%writefile jobs/0_process.py

"""
Script to process the raw data
"""
import os
import argparse
import ast
from collections import Counter 
import itertools

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

import pandas as pd

pd.set_option('mode.chained_assignment', None)

def compute_cards_count(decks):
    """
    Count how many times each card appears across a collection of decks.

    Each element of ``decks`` is the string form of a Python literal
    (parsed with ``ast.literal_eval``) that yields the cards of one deck.
    Returns a plain dict mapping card -> number of occurrences, with keys
    in first-seen order.
    """
    tally = {}
    for serialized_deck in decks:
        for card in ast.literal_eval(serialized_deck):
            tally[card] = tally.get(card, 0) + 1
    return tally

def compute_cards_count_v2(decks):
    """
    Count card occurrences across already-parsed decks.

    ``decks`` is an iterable of iterables of cards. Returns a plain dict
    mapping card -> number of occurrences over all decks.
    """
    occurrences = Counter(card for deck in decks for card in deck)
    return dict(occurrences)

def compute_ranking_cards(cards_count, k=5):
    """
    Return the ``k`` most frequent cards of a card -> count mapping.

    Parameters
    ----------
    cards_count : dict
        Mapping of card identifier to its number of occurrences.
    k : int, default 5
        Maximum number of cards to return.

    Returns
    -------
    list
        Card identifiers (the mapping's keys) ordered by decreasing count.
        Cards with equal counts keep their order of appearance in
        ``cards_count``.

    Notes
    -----
    The previous implementation returned the *counts* themselves rather
    than the cards, so the resulting "cards selection" contained numbers
    of occurrences and the deck encoding compared cards against counts.
    """
    # sorted() is stable, and reverse=True keeps the original relative order
    # of equal elements, so ties are resolved deterministically.
    ranked_cards = sorted(cards_count, key=cards_count.get, reverse=True)
    return ranked_cards[:k]

def encode_deck(deck, cards_selection):
    """
    Encode a deck as a vector of counts over the selected cards.

    Position ``i`` of the result holds how many times ``cards_selection[i]``
    occurs in ``deck``. Cards that are not part of the selection are ignored.
    """
    counts = [0 for _ in cards_selection]

    for card in deck:
        if card not in cards_selection:
            continue
        position = cards_selection.index(card)
        counts[position] += 1
    return counts

def prepare_dataset_to_train(dfp, cards_selection, encoder_hero):
    """
    Build the (label, features) dataset used for training and testing.

    Parameters
    ----------
    dfp : pandas.DataFrame
        Frame with at least the columns ``deck``, ``hero`` and ``archetype``.
    cards_selection : list
        Cards used as the encoding vocabulary (see ``encode_deck``).
    encoder_hero : object
        Fitted encoder exposing ``transform``; it is applied to the list of
        ``hero`` values.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``label`` (the original ``archetype``) and ``features``
        (a list made of the encoded hero followed by the encoded deck).

    Notes
    -----
    The input frame is left untouched: the intermediate columns are added to
    a copy rather than to the caller's DataFrame.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    dfp = dfp.copy()
    dfp['encoded_deck'] = dfp['deck'].apply(lambda deck: encode_deck(deck, cards_selection))
    dfp['encoded_hero'] = encoder_hero.transform(dfp['hero'].to_list())
    # Row-wise concatenation: [encoded hero] + per-card counts.
    dfp['features'] = dfp.apply(lambda row: [row['encoded_hero']] + row['encoded_deck'], axis=1)
    return dfp[['archetype', 'features']].rename(columns={'archetype': 'label'})

def prepare_dataset_to_score(dfp, cards_selection, encoder_hero):
    """
    Build the (deckid, features) dataset used for scoring.

    Parameters
    ----------
    dfp : pandas.DataFrame
        Frame with at least the columns ``deck``, ``hero`` and ``deckid``.
    cards_selection : list
        Cards used as the encoding vocabulary (see ``encode_deck``).
    encoder_hero : object
        Fitted encoder exposing ``transform``; it is applied to the list of
        ``hero`` values.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``deckid`` and ``features`` (a list made of the encoded
        hero followed by the encoded deck).

    Notes
    -----
    The input frame is left untouched: the intermediate columns are added to
    a copy rather than to the caller's DataFrame.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    dfp = dfp.copy()
    dfp['encoded_deck'] = dfp['deck'].apply(lambda deck: encode_deck(deck, cards_selection))
    dfp['encoded_hero'] = encoder_hero.transform(dfp['hero'].to_list())
    # Row-wise concatenation: [encoded hero] + per-card counts.
    dfp['features'] = dfp.apply(lambda row: [row['encoded_hero']] + row['encoded_deck'], axis=1)
    return dfp[['deckid', 'features']]

if __name__=='__main__':
    # Entry point of the processing job: reads the raw training and scoring
    # CSV files, builds the train / test / score feature datasets and writes
    # them as CSV files under /opt/ml/processing.
    parser = argparse.ArgumentParser()
    parser.add_argument('--test_size', type=float, default=0.3)
    # parse_known_args: ignore any extra arguments passed to the script.
    args, _ = parser.parse_known_args()
    print('Received arguments {}'.format(args))

    # Collect the data
    input_data_to_train_path = os.path.join('/opt/ml/processing/input/ml', 'dataset_ml.csv')
    input_data_to_score_path = os.path.join('/opt/ml/processing/input/toscore', 'dataset_toscore.csv')
    print('Reading input data to train from {}'.format(input_data_to_train_path))
    dfp_ml = pd.read_csv(input_data_to_train_path)
    # The 'cards' column stores each deck as the string form of a Python literal.
    dfp_ml['deck'] = dfp_ml['cards'].apply(ast.literal_eval)

    # Build the training set and the testing set
    dfp_train, dfp_test = train_test_split(dfp_ml, test_size=args.test_size, random_state=0)
    dfp_train.reset_index(drop=True,inplace=True)
    dfp_test.reset_index(drop=True,inplace=True)

    # Build some encoder
    # NOTE(review): the encoder is fitted on the heroes of the training split
    # only; scikit-learn's LabelEncoder.transform raises ValueError on labels
    # it has not seen, so a hero present only in the test or scoring data
    # would make the job fail - confirm this cannot happen with the real data.
    encoder_hero = preprocessing.LabelEncoder()
    heroes = list(dfp_train['hero'].unique())
    encoder_hero.fit(heroes)

    # Rank the cards for the deck encoding: per archetype, count the cards of
    # its training decks and keep the top-ranked ones; the final selection is
    # the de-duplicated (order-preserving) union over all archetypes.
    dfp_train_agg = dfp_train.groupby(['archetype'])['deck'].apply(list).to_frame()
    dfp_train_agg['cards_count'] = dfp_train_agg['deck'].apply(compute_cards_count_v2)
    dfp_train_agg['cards_selection'] = dfp_train_agg['cards_count'].apply(compute_ranking_cards)
    cards_selection = list(dict.fromkeys(itertools.chain.from_iterable(dfp_train_agg['cards_selection'].tolist())))

    dfp_dataset_train = prepare_dataset_to_train(dfp_train, cards_selection, encoder_hero)
    dfp_dataset_test = prepare_dataset_to_train(dfp_test, cards_selection, encoder_hero)

    print('Reading input data to score from {}'.format(input_data_to_score_path))
    # Score at most 1000 rows drawn from a 10% sample. The sample is seeded
    # (same seed as the train/test split) so that re-running the job scores
    # the same rows.
    dfp_score = pd.read_csv(input_data_to_score_path).sample(frac=0.1, random_state=0).head(1000)
    dfp_score['deck'] = dfp_score['cards'].apply(ast.literal_eval)
    dfp_dataset_score = prepare_dataset_to_score(dfp_score, cards_selection, encoder_hero)

    # Save the data
    print('Saving the data in /opt/ml/processing/train and test and score')
    train_output_path = os.path.join('/opt/ml/processing/train', 'dataset_train.csv')
    test_output_path = os.path.join('/opt/ml/processing/test', 'dataset_test.csv')
    score_output_path = os.path.join('/opt/ml/processing/score', 'dataset_score.csv')
    dfp_dataset_train.to_csv(train_output_path, index=None)
    dfp_dataset_test.to_csv(test_output_path, index=None)
    dfp_dataset_score.to_csv(score_output_path, index=None)
    print('DONE')
    

Overwriting jobs/0_process.py


## Train a model

In [3]:
%%writefile jobs/1_train.py

"""
Script to train the model on the processed training set
"""
import os
import argparse
import ast

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib

import pandas as pd


pd.set_option('mode.chained_assignment', None)


if __name__=="__main__":
    # Entry point of the training job: fits a random forest on the processed
    # training set and stores the fitted model as model.joblib.

    training_data_directory = '/opt/ml/input/data/train'
    print(f'Collect the training set data at {training_data_directory}')
    dfp_training = pd.read_csv(training_data_directory + '/dataset_train.csv')
    # The 'features' column is stored as the string form of a list; parse it back.
    dfp_training['features'] = dfp_training['features'].apply(ast.literal_eval)

    print('Set the random forest model')
    model = RandomForestClassifier(max_depth=2, random_state=0)

    print('Train the model')
    feature_rows = dfp_training['features'].tolist()
    target_labels = dfp_training['label'].tolist()
    model.fit(feature_rows, target_labels)

    # Despite its name, this is the path of the serialized model file.
    model_output_directory = os.path.join('/opt/ml/model', "model.joblib")
    print('Saving model to {}'.format(model_output_directory))
    joblib.dump(model, model_output_directory)
    print('DONE')

Overwriting jobs/1_train.py


## Evaluate a model

In [4]:
%%writefile jobs/2_evaluate.py

"""
Script to evaluate the trained model on the test set
"""
import os
import argparse
import ast

import numpy as np
import math as mth
import pandas as pd
import tarfile

from sklearn.externals import joblib

pd.set_option('mode.chained_assignment', None)

def get_precision_at_k(recommendations, item, k=5):
    """
    Hit indicator at rank ``k``.

    Returns 1 when ``item`` is among the first ``k`` entries of
    ``recommendations`` and 0 otherwise.
    """
    top_k = recommendations[:k]
    return 1 if item in top_k else 0

def get_ndcg_at_k(recommendations, item, k=5):
    """
    Discounted gain of a single relevant item at rank ``k``.

    Looks for the first occurrence of ``item`` within the first ``k``
    recommendations and returns ``log(2) / log(position + 2)`` for its
    0-based position; returns 0 when the item is not found there.
    """
    # Enumerate from 2 so the counter is directly the log argument.
    for discount_base, candidate in enumerate(recommendations[:k], start=2):
        if item == candidate:
            return mth.log(2) / mth.log(discount_base)
    return 0

if __name__=="__main__":
    # Entry point of the evaluation job: loads the trained model, ranks the
    # classes for every test row and writes the averaged ranking metrics.
    model_path = os.path.join('/opt/ml/processing/model', 'model.tar.gz')
    print('Extracting model from path: {}'.format(model_path))
    # The archive is unpacked into the current working directory.
    with tarfile.open(model_path) as tar:
        tar.extractall(path='.')
    print('Loading model')
    model = joblib.load('model.joblib')

    print('Loading test input data')
    test_input_path = os.path.join('/opt/ml/processing/test', 'dataset_test.csv')
    dfp_test = pd.read_csv(test_input_path)
    # The 'features' column is stored as the string form of a list; parse it back.
    dfp_test['features'] = dfp_test['features'].apply(ast.literal_eval)

    # Class probabilities per row, then the class labels ordered from the
    # highest to the lowest probability.
    dfp_test['prediction'] = list(model.predict_proba(dfp_test['features'].tolist()))
    labels = list(model.classes_)
    dfp_test['prediction'] = dfp_test['prediction'].apply(
        lambda probabilities: [labels[idx] for idx in np.argsort(probabilities)[::-1]]
    )

    metrics = []
    for k in [1,3,5]:
        dfp_test[f'precision_at_{k}'] = dfp_test.apply(
            lambda row: get_precision_at_k(row['prediction'], row['label'], k), axis=1
        )
        metrics.append(f'precision_at_{k}')
        # The discounted metric is only computed for cut-offs above 1.
        if k <= 1:
            continue
        dfp_test[f'ndcg_at_{k}'] = dfp_test.apply(
            lambda row: get_ndcg_at_k(row['prediction'], row['label'], k), axis=1
        )
        metrics.append(f'ndcg_at_{k}')

    # One row per metric with its mean over the test set.
    metric_means = dfp_test[metrics].mean()
    dfp_metrics = metric_means.to_frame().reset_index()
    dfp_metrics.columns = ['metric', 'value']
    evaluation_output_path = os.path.join('/opt/ml/processing/evaluation', 'metrics.csv')
    dfp_metrics.to_csv(evaluation_output_path, index=None)
    print('DONE')

Overwriting jobs/2_evaluate.py


## Score some data

In [5]:
%%writefile jobs/3_score.py

import os
import argparse
import ast

import pandas as pd
import numpy as np
import tarfile

from sklearn.externals import joblib

if __name__=="__main__":
    # Entry point of the scoring job: loads the trained model, computes the
    # class probabilities and ranked class labels for every row to score and
    # writes the result as a CSV file.
    model_path = os.path.join('/opt/ml/processing/model', 'model.tar.gz')
    print('Extracting model from path: {}'.format(model_path))
    # The archive is unpacked into the current working directory.
    with tarfile.open(model_path) as tar:
        tar.extractall(path='.')
    print('Loading model')
    model = joblib.load('model.joblib')

    print('Loading test input data')
    score_input_path = os.path.join('/opt/ml/processing/score', 'dataset_score.csv')
    dfp_toscore = pd.read_csv(score_input_path)
    # The 'features' column is stored as the string form of a list; parse it back.
    dfp_toscore['features'] = dfp_toscore['features'].apply(ast.literal_eval)

    feature_rows = dfp_toscore['features'].tolist()
    dfp_toscore['probabilities'] = list(model.predict_proba(feature_rows))

    # Class labels ordered from the highest to the lowest probability.
    labels = list(model.classes_)
    dfp_toscore['predictions'] = dfp_toscore['probabilities'].apply(
        lambda prediction_prob: [labels[idx] for idx in np.argsort(prediction_prob)[::-1]]
    )

    evaluation_output_path = os.path.join('/opt/ml/processing/predictions', 'predictions.csv')
    dfp_toscore.to_csv(evaluation_output_path, index=None)
    print('DONE')

Overwriting jobs/3_score.py
