<div style="text-align:center; display:flex; justify-content:center; margin:16px 0px">
    <span style="color:#ff5500; font-family:Play; font-size:3em; margin:auto 32px">Part II<br />Feature Engineering</span>
</div>

---

# Introduction

This document is a part of the FACEIT Predictor project.

In this notebook, the collected data (stored in the MongoDB database) is preprocessed, and the resulting fields/collections are written back to the same database. These steps therefore only need to be executed once, which makes the feature engineering phase much faster.

In [2]:
import pandas as pd
import numpy as np
import json
from pymongo import MongoClient
from load_faceit_data import read_config
from statistics import mean
from glob import glob
from datetime import datetime, timezone
from collections import defaultdict

# Connect to the Mongo Database

In [3]:
# Load the settings stored under the "local.ingestorDB" key.
# NOTE(review): read_config is a project helper; the result is later unpacked
# as MongoClient keyword arguments, so it is presumably a dict of connection
# options — confirm.
db_cfg = read_config("local.ingestorDB")

In [4]:
# Create the MongoDB client; db_cfg is unpacked as keyword arguments.
client = MongoClient(**db_cfg)
# Handle to the 'faceit_imported' database.
db = client['faceit_imported']

In [5]:
# Handles to the collections of the 'faceit_imported' database used in this notebook
players_coll = db['player']
matches_coll = db['match']  # queried by get_all_previous_matches
lifetime_stats_coll = db['player_lifetime_stats']

# Load Data

In [6]:
featurized_data_path = 'data/dataset/' # use your path
# glob() yields paths in arbitrary, filesystem-dependent order. Sorting makes
# slices such as all_files[:1] pick the same file on every run and machine.
all_files = sorted(glob(featurized_data_path + "*.json"))

In [7]:
# Only the first file of all_files is read here (all_files[:1]); the rest of
# the batches are not part of `dataset`.
chunked_dataframes = [pd.read_json(filename) for filename in all_files[:1]]
# Concatenate the loaded chunks into one frame with a fresh 0..n-1 index.
dataset = pd.concat(chunked_dataframes, ignore_index=True)

In [8]:
# Show the (rows, columns) of the loaded frame, then preview its first rows.
print(dataset.shape)
dataset.head()

(3001, 9)


Unnamed: 0,_id,startTime,mapPlayed,score,parties,entity,entityName,teamA,teamB
0,1-0002441e-bec7-4746-9c01-81560aeb145f,1582309092,de_dust2,16 / 11,{'1c73a556-2da0-46da-a18e-1a827d3a442b': ['824...,matchmaking,CS:GO 5v5 PREMIUM,[{'id': '82432d4b-9823-4f3a-82e6-063d2ab26a1c'...,[{'id': '371e87d2-f5ed-426b-8c27-efbd8bfda1f5'...
1,1-00044d9e-77bd-4a14-bc36-ea864858ac9a,1582210618,de_cache,16 / 12,{'01fd58ef-bd1a-4926-9423-af6f1ad78d31': ['2d0...,matchmaking,CS:GO 5v5,[{'id': '4773b555-5a4f-4a4e-b6a2-8353e315958e'...,[{'id': '2d0acf3e-915c-43d7-adf1-1bad5467943c'...
2,1-00052bb7-c844-4004-b17d-868b6ed7a645,1582201298,de_mirage,16 / 11,{'03b4db45-ff3f-4a93-aade-223fc5aa31e1': ['03b...,matchmaking,CS:GO 5v5,[{'id': '87d5d8f5-7276-4ae6-87be-8b5465be37b4'...,[{'id': 'e8d3abbc-8e60-43d1-92be-69f5a891913b'...
3,1-00055c82-b90a-4395-b7c6-f032eb106230,1582329709,de_mirage,16 / 13,{'5835d91a-3f98-4bb2-a14b-883b5967244b': ['583...,matchmaking,CS:GO 5v5,[{'id': '3518b5ad-4618-4747-9577-7795c48f5cdc'...,[{'id': '2ba15742-0ccb-47a2-bb6e-06a0887a40ed'...
4,1-00061954-f2ac-44c0-a7a8-ace5c2139e65,1581001380,de_dust2,16 / 10,{'36b6772a-21ce-4636-989a-8af9f58e4c02': ['085...,matchmaking,CS:GO 5v5 PREMIUM,[{'id': 'd703c44c-1b38-4668-9cf3-790efa24f726'...,[{'id': '419b372a-d3d2-423b-8a63-ca341c33e6a7'...


In [None]:
# TODO: dataset metadata (number of matches, min and max startTime, number of distinct players, etc.)

In [None]:
# TODO: get prediction accuracy vs time of day
# TODO: get prediction accuracy vs match mean elo
# TODO: get prediction accuracy vs match mean number of matches
# TODO: get prediction accuracy vs mapPlayed
# TODO: get prediction accuracy vs entity
# TODO: get prediction accuracy vs entityName

In [11]:
# Preview the one-hot encoding of the played map (displayed only, not stored).
pd.get_dummies(dataset.mapPlayed)

Unnamed: 0,de_cache,de_dust2,de_inferno,de_mirage,de_nuke,de_overpass,de_train,de_vertigo
0,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0
3,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
2996,0,0,0,0,0,1,0,0
2997,0,1,0,0,0,0,0,0
2998,0,0,0,1,0,0,0,0
2999,0,0,0,1,0,0,0,0


In [10]:
# List the distinct maps present in the loaded dataset.
dataset.mapPlayed.unique()

array(['de_dust2', 'de_cache', 'de_mirage', 'de_overpass', 'de_inferno',
       'de_train', 'de_nuke', 'de_vertigo'], dtype=object)

# Performance indicators statistics

In [81]:
# Module-level accumulators filled by get_average_stats: one value per player
# per match, each normalised by the number of rounds of that match.
kills_pr = []
survived_pr = []
multikills_rating_pr = []
# NOTE(review): `assists` is not appended to by the code visible in this
# section (only assists_pr is) — confirm whether it is still needed.
assists = []
assists_pr = []
mvps_pr = []

In [82]:
def get_average_stats(match):
    """Append the per-round performance indicators of every player in a match.

    The results are appended to the module-level lists ``kills_pr``,
    ``survived_pr``, ``multikills_rating_pr``, ``assists_pr`` and ``mvps_pr``;
    nothing is returned.

    Parameters
    ----------
    match : mapping
        Must expose ``score`` (a string such as ``"16 / 11"``) and the player
        lists ``teamA`` / ``teamB``. Players whose ``playerStats`` is missing
        or ``None`` are skipped.
    """
    total_rounds = sum(int(r) for r in match['score'].split("/"))
    if total_rounds == 0:
        # Per-round ratios are undefined for a match without rounds; skipping
        # it avoids a ZeroDivisionError that would abort the whole batch.
        return

    for team in (match["teamA"], match["teamB"]):
        for player in team:
            # .get() treats an absent key like an explicit None entry
            stats = player.get('playerStats')
            if stats is None:
                continue

            # Triple/quadra/penta kills are weighted by 9/16/25
            multikills = (stats['tripleKills'] * 9
                          + stats['quadraKills'] * 16
                          + stats['pentaKills'] * 25)

            kills_pr.append(stats["kills"] / total_rounds)
            survived_pr.append((total_rounds - stats['deaths']) / total_rounds)
            multikills_rating_pr.append(multikills / total_rounds)
            assists_pr.append(stats['assists'] / total_rounds)
            mvps_pr.append(stats['mvps'] / total_rounds)

In [83]:
# Walk every dataset file and let get_average_stats process each row (match).
# apply() is used only for its side effects; its result is discarded.
for filename in all_files:
    data_batch = pd.read_json(filename)
    data_batch.apply(get_average_stats, axis=1)

In [84]:
# Mean and standard deviation (np.std) of each accumulated indicator list.
# Key suffixes: KPR = kills_pr, SPR = survived_pr, MKPR = multikills_rating_pr,
# APR = assists_pr, MVPPR = mvps_pr.
performance_statistics = {
    'meanKPR': mean(kills_pr),
    'stddevKPR': np.std(kills_pr),
    'meanSPR': mean(survived_pr),
    'stddevSPR': np.std(survived_pr),
    'meanMKPR': mean(multikills_rating_pr),
    'stddevMKPR': np.std(multikills_rating_pr),
    'meanAPR': mean(assists_pr),
    'stddevAPR': np.std(assists_pr),
    'meanMVPPR': mean(mvps_pr),
    'stddevMVPPR': np.std(mvps_pr)}

In [86]:
# Store performance statistics as JSON so they can be reloaded without
# recomputing them from the dataset files
with open('data/performance_statistics.json', 'w') as fp:
    json.dump(performance_statistics, fp)

In [9]:
# Load performance statistics that were previously computed
# (reads the JSON file written by the storing cell)
with open('data/performance_statistics.json') as fp:
    performance_statistics = json.load(fp)

In [10]:
# Dataset-wide means used as normalisers in compute_rating; AVERAGE_RMK is
# also the fallback multikill score when a player has no rounds on record.
AVERAGE_KPR = performance_statistics["meanKPR"]  # average kills per round
AVERAGE_SPR = performance_statistics["meanSPR"]  # average survived rounds per round
AVERAGE_RMK = performance_statistics["meanMKPR"]  # average per-round multikill score (triple*9 + quadra*16 + penta*25)
AVERAGE_APR = performance_statistics["meanAPR"]   # average assists per round
AVERAGE_MVPPR = performance_statistics["meanMVPPR"]   # average mvps per round

# Featurization

## Helpers

In [11]:
def add_feature(data, function, **kwargs):
    """Compute a team feature for both teams and store it, plus the A-B difference.

    The feature name is ``function.__name__`` without its first
    underscore-separated token (e.g. ``get_mean_elo`` -> ``mean_elo``).
    ``function(data, "teamA" | "teamB", **kwargs)`` is called once per team and
    the results are written to ``<feature>_A`` and ``<feature>_B``; their
    difference is written to ``dif_<feature>``. ``data`` is modified in place.
    """
    _, *name_tokens = function.__name__.split("_")
    feature = "_".join(name_tokens)

    for side in "AB":
        data[f"{feature}_{side}"] = function(data, f"team{side}", **kwargs)

    data[f"dif_{feature}"] = data[f"{feature}_A"] - data[f"{feature}_B"]

In [12]:
def get_team_rounds(score_string):
    """Split a score such as ``"16 / 11"`` on ``/`` and return the parts as ints."""
    return list(map(int, score_string.split("/")))

In [13]:
def get_player_won_the_match(match, player_id):
    """Return 1 if ``player_id``'s team has strictly more rounds, else 0.

    The player is considered to be on team A when their id appears in
    ``match['teamA']``; otherwise they are treated as a team B player.
    A tied score yields 0 for both sides.
    """
    team_a_ids = [member['id'] for member in match['teamA']]
    on_team_a = player_id in team_a_ids
    rounds = get_team_rounds(match["score"])

    if on_team_a:
        won = rounds[0] > rounds[1]
    else:
        won = rounds[1] > rounds[0]
    return int(won)

## Match Features

In [14]:
def get_elo_bin(value, bins=None):
    """Return the index of the bin that ``value`` falls into.

    Parameters
    ----------
    value : number
        Value to classify.
    bins : sequence of numbers, optional
        Ascending bin edges (``len(bins) - 1`` bins). When omitted, the
        module-level ``elo_bins`` is used, as in the original implementation.
        NOTE(review): in the visible notebook ``elo_bins`` is only created as a
        local inside ``add_match_features``, so passing ``bins`` explicitly is
        the reliable way to call this function.

    Returns
    -------
    int
        0 for values below the first edge, the last bin index for values above
        the last edge, otherwise the first bin whose upper edge is >= value.
    """
    if bins is None:
        bins = elo_bins  # backward-compatible fallback to the global edges

    if value < min(bins):
        return 0
    if value > max(bins):
        # len(bins) - 1 bins exist, so the last index is len(bins) - 2
        return len(bins) - 2

    for bin_index, bin_limit in enumerate(bins[1:]):
        if value <= bin_limit:
            return bin_index

In [15]:
def get_is_5v5_mm_queue(entity_names):
    """Return a 0/1 list flagging the entries equal to ``'CS:GO 5v5'``."""
    flags = []
    for name in entity_names:
        flags.append(int(name == 'CS:GO 5v5'))
    return flags

def get_is_5v5_premium_queue(entity_names):
    """Return a 0/1 list flagging the entries equal to ``'CS:GO 5v5 PREMIUM'``."""
    flags = []
    for name in entity_names:
        flags.append(int(name == 'CS:GO 5v5 PREMIUM'))
    return flags

In [16]:
def get_mean_elo(data, team):
    """Return, for each entry of ``data[team]``, the mean ``elo`` of its players."""
    return [mean([member["elo"] for member in roster]) for roster in data[team]]


def get_stddev_elo(data, team):
    """Return, for each entry of ``data[team]``, ``np.std`` of its players' ``elo``."""
    return [np.std([member["elo"] for member in roster]) for roster in data[team]]


def get_num_paid_memberships(data, team):
    """Count, per entry of ``data[team]``, the players whose ``membership`` is not ``'free'``."""
    paid_counts = []
    for roster in data[team]:
        paid_counts.append(sum(1 for member in roster if member['membership'] != 'free'))
    return paid_counts


def get_num_solo_players(data, team):
    """Count, per match, the team's players that queued in a party of size one.

    ``data["parties"]`` entries map a party key to a list of player ids. A
    match with no party information (falsy entry) yields 0.
    """
    counts = []
    for parties, roster in zip(data["parties"], data[team]):
        if parties:
            roster_ids = [member['id'] for member in roster]
            counts.append(sum(
                1 for party in parties.values()
                if len(party) == 1 and party[0] in roster_ids))
        else:
            counts.append(0)
    return counts


def get_num_parties(data, team):
    """Count, per match, the parties whose first listed player belongs to the team.

    ``data["parties"]`` entries map a party key to a list of player ids. A
    match with no party information (falsy entry) yields 1.
    """
    counts = []
    for parties, roster in zip(data["parties"], data[team]):
        if parties:
            roster_ids = [member['id'] for member in roster]
            counts.append(sum(
                1 for party in parties.values() if party[0] in roster_ids))
        else:
            counts.append(1)
    return counts


def get_winner(scores):
    """Map each ``"a / b"`` score to 0 when a > b, and to 1 otherwise (ties included)."""
    def _winner(score):
        rounds = list(map(int, score.split("/")))
        return 1 if rounds[0] <= rounds[1] else 0

    return [_winner(score) for score in scores]

In [67]:
def add_match_features(data):
    """Add the match-level features to ``data`` in place.

    Adds, through ``add_feature``, the per-team and difference columns for mean
    elo, elo standard deviation, paid memberships, solo players and parties;
    then the match mean elo and its 15-bin index, the dummy columns of
    ``entity``, the two queue flags and the ``winner`` label.
    """
    add_feature(data, get_mean_elo)
    add_feature(data, get_stddev_elo)
    add_feature(data, get_num_paid_memberships)
    add_feature(data, get_num_solo_players)
    add_feature(data, get_num_parties)

    data["match_mean_elo"] = (data["mean_elo_A"] + data["mean_elo_B"])/2
    # Bin the frame that was passed in. The previous code binned the global
    # `dataset`, which breaks (or mislabels rows) for any other frame.
    binned_elos, elo_bins = pd.cut(data["match_mean_elo"], bins=15, retbins=True, labels=False)
    data["binned_match_elo"] = binned_elos
    # TODO: persist the bin limits (elo_bins); they are currently discarded

    dummies_entity = pd.get_dummies(data.entity, drop_first=True)
    for col in dummies_entity:
        data[col] = dummies_entity[col]

    data["5v5_free_queue"] = get_is_5v5_mm_queue(data.entityName)
    data["5v5_premium_queue"] = get_is_5v5_premium_queue(data.entityName)
    data["winner"] = get_winner(data["score"])

## Lifetime Features

In [18]:
def get_mean_matches(data, team):
    """Per match, the team's total lifetime ``matches`` divided by a fixed 5.

    A player's total is the sum of ``matches`` over every entry of
    ``player["mapStats"]["mapStats"]``.
    """
    averages = []
    for roster in data[team]:
        played = 0
        for member in roster:
            per_map = member["mapStats"]["mapStats"].values()
            played += sum(entry['matches'] for entry in per_map)
        averages.append(played / 5)
    return averages

def get_mean_winrate(data, team):
    """Per match, the sum of the players' lifetime win rates divided by a fixed 5.

    A player's win rate is total ``wins`` / total ``matches`` across all map
    entries; players with no matches contribute 0.5.
    """
    team_winrates = []
    for roster in data[team]:
        winrate_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            played = sum(entry['matches'] for entry in per_map)
            won = sum(entry['wins'] for entry in per_map)
            if played == 0:
                winrate_sum += 0.5
            else:
                winrate_sum += won / played
        team_winrates.append(winrate_sum / 5)
    return team_winrates

def get_mean_kd(data, team):
    """Per match, the sum of the players' lifetime kill/death ratios divided by a fixed 5.

    A player's ratio is total ``kills`` / total ``deaths`` across all map
    entries; players with zero deaths contribute 1.
    """
    team_ratios = []
    for roster in data[team]:
        ratio_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            kills = sum(entry['kills'] for entry in per_map)
            deaths = sum(entry['deaths'] for entry in per_map)
            if deaths == 0:
                ratio_sum += 1
            else:
                ratio_sum += kills / deaths
        team_ratios.append(ratio_sum / 5)
    return team_ratios

def get_mean_multikills_score(data, team):
    """Per match, the sum of the players' lifetime multikill scores divided by a fixed 5.

    A player's score is (triple*9 + quadra*16 + penta*25) / rounds over all
    map entries; players with zero rounds contribute ``AVERAGE_RMK``.
    """
    fields = ('tripleKills', 'quadraKills', 'pentaKills', 'rounds')
    team_scores = []
    for roster in data[team]:
        score_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            triples, quadras, pentas, rounds = (
                sum(entry[field] for entry in per_map) for field in fields)

            points = triples * 9 + quadras * 16 + pentas * 25
            if rounds != 0:
                score_sum += points / rounds
            else:
                score_sum += AVERAGE_RMK
        team_scores.append(score_sum / 5)
    return team_scores


def compute_rating(kills, deaths, triple_k, quadra_k, penta_k, assists, mvps, rounds):
    """Combine per-round indicators, each normalised by its AVERAGE_* constant, into one rating.

    Weights: kills 1, survival 0.7, multikills 1, assists 0.5, mvps 0.3; the
    weighted sum is divided by 3.5 (the sum of the weights). ``rounds`` must
    be non-zero.
    """
    kills_component = kills / rounds / AVERAGE_KPR
    survival_component = (rounds - deaths) / rounds / AVERAGE_SPR
    multikill_points = triple_k * 9 + quadra_k * 16 + penta_k * 25
    multikill_component = multikill_points / rounds / AVERAGE_RMK
    assists_component = assists / rounds / AVERAGE_APR
    mvps_component = mvps / rounds / AVERAGE_MVPPR

    # Accumulate left to right, in the same order as the weighted formula
    weighted_total = kills_component + 0.7 * survival_component
    weighted_total = weighted_total + multikill_component
    weighted_total = weighted_total + 0.5 * assists_component
    weighted_total = weighted_total + 0.3 * mvps_component
    return weighted_total / 3.5


def get_mean_rating(data, team):
    """Per match, the sum of the players' lifetime ratings divided by a fixed 5.

    Each player's stats are summed over all map entries and passed to
    ``compute_rating``; players with zero rounds contribute a rating of 1.
    """
    # Same order as the positional parameters of compute_rating
    stat_fields = ('kills', 'deaths', 'tripleKills', 'quadraKills',
                   'pentaKills', 'assists', 'mvps', 'rounds')
    team_ratings = []
    for roster in data[team]:
        rating_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            totals = [sum(entry[field] for entry in per_map) for field in stat_fields]

            if totals[-1] == 0:
                # TODO: review better value... impute mean
                rating_sum += 1
            else:
                rating_sum += compute_rating(*totals)

        team_ratings.append(rating_sum / 5)
    return team_ratings

In [19]:
def get_mean_matches_on_map(data, team):
    """Per match, the team's lifetime ``matches`` on the played map divided by a fixed 5.

    Only map entries whose ``name`` equals the row's ``mapPlayed`` are counted.
    """
    averages = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        played = 0
        for member in roster:
            on_map = [entry for entry in member["mapStats"]["mapStats"].values()
                      if entry["name"] == map_played]
            played += sum(entry['matches'] for entry in on_map)
        averages.append(played / 5)
    return averages

def get_mean_winrate_on_map(data, team):
    """Per match, the sum of the players' win rates on the played map divided by a fixed 5.

    Players with no matches on that map contribute 0.5.
    """
    team_winrates = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        winrate_sum = 0
        for member in roster:
            on_map = [entry for entry in member["mapStats"]["mapStats"].values()
                      if entry["name"] == map_played]
            played = sum(entry['matches'] for entry in on_map)
            won = sum(entry['wins'] for entry in on_map)
            if played == 0:
                winrate_sum += 0.5
            else:
                winrate_sum += won / played
        team_winrates.append(winrate_sum / 5)
    return team_winrates

def get_mean_kd_on_map(data, team):
    """Per match, the sum of the players' kill/death ratios on the played map divided by a fixed 5.

    Players with zero deaths on that map contribute 1.
    """
    team_ratios = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        ratio_sum = 0
        for member in roster:
            on_map = [entry for entry in member["mapStats"]["mapStats"].values()
                      if entry["name"] == map_played]
            kills = sum(entry['kills'] for entry in on_map)
            deaths = sum(entry['deaths'] for entry in on_map)
            if deaths == 0:
                ratio_sum += 1
            else:
                ratio_sum += kills / deaths
        team_ratios.append(ratio_sum / 5)
    return team_ratios

def get_mean_multikills_score_on_map(data, team):
    """Per match, the sum of the players' multikill scores on the played map divided by a fixed 5.

    A player's score is (triple*9 + quadra*16 + penta*25) / rounds on that
    map; players with zero rounds on it contribute ``AVERAGE_RMK``.
    """
    fields = ('tripleKills', 'quadraKills', 'pentaKills', 'rounds')
    team_scores = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        score_sum = 0
        for member in roster:
            on_map = [entry for entry in member["mapStats"]["mapStats"].values()
                      if entry["name"] == map_played]
            triples, quadras, pentas, rounds = (
                sum(entry[field] for entry in on_map) for field in fields)

            points = triples * 9 + quadras * 16 + pentas * 25
            if rounds != 0:
                score_sum += points / rounds
            else:
                score_sum += AVERAGE_RMK
        team_scores.append(score_sum / 5)
    return team_scores


def get_mean_rating_on_map(data, team):
    """Per match, the sum of the players' ratings on the played map divided by a fixed 5.

    Stats of the entries whose ``name`` equals ``mapPlayed`` are summed and
    passed to ``compute_rating``; players with zero rounds on that map
    contribute a rating of 1.
    """
    # Same order as the positional parameters of compute_rating
    stat_fields = ('kills', 'deaths', 'tripleKills', 'quadraKills',
                   'pentaKills', 'assists', 'mvps', 'rounds')
    team_ratings = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        rating_sum = 0
        for member in roster:
            on_map = [entry for entry in member["mapStats"]["mapStats"].values()
                      if entry["name"] == map_played]
            totals = [sum(entry[field] for entry in on_map) for field in stat_fields]

            if totals[-1] == 0:
                # TODO: review better value... impute mean
                rating_sum += 1
            else:
                rating_sum += compute_rating(*totals)

        team_ratings.append(rating_sum / 5)
    return team_ratings

In [20]:
def get_mean_matches_map_preference(data, team):
    """Per match, the sum of each player's share of matches on the played map, divided by a fixed 5.

    The share is matches on ``mapPlayed`` / matches on all maps; players with
    no matches at all contribute 0.125.
    """
    team_preferences = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        preference_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            played = sum(entry['matches'] for entry in per_map)
            played_on_map = sum(entry['matches'] for entry in per_map
                                if entry["name"] == map_played)
            if played:
                preference_sum += played_on_map / played
            else:
                preference_sum += 0.125
        team_preferences.append(preference_sum / 5)
    return team_preferences

def get_mean_winrate_map_preference(data, team):
    """Per match, the sum of (win rate on the played map / overall win rate), divided by a fixed 5.

    Either win rate falls back to 0.5 when its match count is zero; a player
    whose overall win rate is 0 contributes 1.
    """
    def _winrate(entries):
        # 0.5 when there is no match to compute a rate from
        played = sum(entry['matches'] for entry in entries)
        won = sum(entry['wins'] for entry in entries)
        return (won / played) if played != 0 else 0.5

    team_preferences = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        preference_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            on_map = [entry for entry in per_map if entry["name"] == map_played]

            overall = _winrate(per_map)
            on_played_map = _winrate(on_map)
            if overall != 0:
                preference_sum += on_played_map / overall
            else:
                preference_sum += 1

        team_preferences.append(preference_sum / 5)
    return team_preferences

def get_mean_kd_map_preference(data, team):
    """Per match, the sum of (K/D on the played map / overall K/D), divided by a fixed 5.

    Either K/D falls back to 1 when its death count is zero; a player whose
    overall K/D is 0 contributes 1.
    """
    def _kd(entries):
        # 1 when there are no deaths to divide by
        kills = sum(entry['kills'] for entry in entries)
        deaths = sum(entry['deaths'] for entry in entries)
        return (kills / deaths) if deaths != 0 else 1

    team_preferences = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        preference_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            on_map = [entry for entry in per_map if entry["name"] == map_played]

            overall = _kd(per_map)
            on_played_map = _kd(on_map)
            if overall != 0:
                preference_sum += on_played_map / overall
            else:
                preference_sum += 1

        team_preferences.append(preference_sum / 5)
    return team_preferences

def get_mean_multikills_score_map_preference(data, team):
    """Per match, the sum of (multikill score on the played map / overall score), divided by a fixed 5.

    A score is (triple*9 + quadra*16 + penta*25) / rounds, falling back to
    ``AVERAGE_RMK`` when rounds is zero; a player whose overall score is 0
    contributes 1.
    """
    fields = ('tripleKills', 'quadraKills', 'pentaKills', 'rounds')

    team_preferences = []
    for roster, map_played in zip(data[team], data["mapPlayed"]):
        preference_sum = 0
        for member in roster:
            per_map = list(member["mapStats"]["mapStats"].values())
            on_map = [entry for entry in per_map if entry["name"] == map_played]

            map_triples, map_quadras, map_pentas, map_rounds = (
                sum(entry[field] for entry in on_map) for field in fields)
            triples, quadras, pentas, rounds = (
                sum(entry[field] for entry in per_map) for field in fields)

            points = triples * 9 + quadras * 16 + pentas * 25
            overall = (points / rounds) if rounds != 0 else AVERAGE_RMK

            map_points = map_triples * 9 + map_quadras * 16 + map_pentas * 25
            on_played_map = (map_points / map_rounds) if map_rounds != 0 else AVERAGE_RMK

            if overall != 0:
                preference_sum += on_played_map / overall
            else:
                preference_sum += 1
        team_preferences.append(preference_sum / 5)
    return team_preferences


def get_mean_rating_map_preference(data, team):
    """Per match, the sum of (rating on the played map / overall rating), divided by a fixed 5.

    Both ratings come from ``compute_rating`` on the summed stats (entries
    named ``mapPlayed`` vs. all entries). A rating whose round count is zero
    is set to the neutral value 1; a player whose overall rating is 0
    contributes 1.

    Parameters
    ----------
    data : mapping of column name -> iterable
        Must provide ``data[team]`` (lists of player dicts exposing
        ``["mapStats"]["mapStats"]``) and ``data["mapPlayed"]``.
    team : str
        Column holding the team rosters (e.g. ``"teamA"``).

    Returns
    -------
    list of float
        One value per match.
    """
    # Same order as the positional parameters of compute_rating
    stat_fields = ('kills', 'deaths', 'tripleKills', 'quadraKills',
                   'pentaKills', 'assists', 'mvps', 'rounds')

    mean_rating = []
    for team_data, map_played in zip(data[team], data["mapPlayed"]):
        rating_preference = 0
        for p in team_data:
            all_maps = list(p["mapStats"]["mapStats"].values())
            played_map = [ms for ms in all_maps if ms["name"] == map_played]

            totals = [sum(ms[field] for ms in all_maps) for field in stat_fields]
            totals_on_map = [sum(ms[field] for ms in played_map) for field in stat_fields]

            # The last element of each totals list is the round count.
            # Assign the neutral rating; the previous `+= 1` acted on an
            # unassigned local (UnboundLocalError for the first player, a
            # stale value from the previous player afterwards).
            # TODO: review better value... impute mean
            if totals[-1] == 0:
                total_rating = 1
            else:
                total_rating = compute_rating(*totals)

            if totals_on_map[-1] == 0:
                total_rating_on_map = 1
            else:
                total_rating_on_map = compute_rating(*totals_on_map)

            rating_preference += (total_rating_on_map / total_rating) if total_rating != 0 else 1
        mean_rating.append(rating_preference/5)
    return mean_rating

In [21]:
def add_lifetime_features(data):
    """Add every lifetime feature (overall, on-map and map-preference) to ``data`` in place.

    Each getter is handed to ``add_feature``, which stores its team A, team B
    and difference columns.
    """
    feature_getters = (
        # overall lifetime statistics
        get_mean_matches,
        get_mean_winrate,
        get_mean_kd,
        get_mean_multikills_score,
        get_mean_rating,
        # statistics restricted to the played map
        get_mean_matches_on_map,
        get_mean_winrate_on_map,
        get_mean_kd_on_map,
        get_mean_multikills_score_on_map,
        get_mean_rating_on_map,
        # played-map statistics relative to the overall ones
        get_mean_matches_map_preference,
        get_mean_winrate_map_preference,
        get_mean_kd_map_preference,
        get_mean_multikills_score_map_preference,
        get_mean_rating_map_preference,
    )
    for getter in feature_getters:
        add_feature(data, getter)

## Date Features

In [22]:
def get_mean_created_at_faceit(data, team):
    """Per match, the mean of ``startTime - activatedAtTimeStamp`` over the team's players."""
    def _mean_age(roster, start_time):
        return mean([start_time - member["activatedAtTimeStamp"] for member in roster])

    return [_mean_age(roster, start_time)
            for roster, start_time in zip(data[team], data["startTime"])]


def get_stddev_created_at_faceit(data, team):
    """Per match, ``np.std`` of ``startTime - activatedAtTimeStamp`` over the team's players."""
    def _stddev_age(roster, start_time):
        return np.std([start_time - member["activatedAtTimeStamp"] for member in roster])

    return [_stddev_age(roster, start_time)
            for roster, start_time in zip(data[team], data["startTime"])]


def get_min_created_at_faceit(data, team):
    """Per match, the smallest ``startTime - activatedAtTimeStamp`` among the team's players."""
    def _min_age(roster, start_time):
        return min([start_time - member["activatedAtTimeStamp"] for member in roster])

    return [_min_age(roster, start_time)
            for roster, start_time in zip(data[team], data["startTime"])]

In [23]:
def convert_activated_date_to_timestamp(data):
    """Add an integer ``activatedAtTimeStamp`` to every player of ``teamA`` and ``teamB``.

    ``player["activatedAt"]`` is parsed with the format ``"%a %b %d %X %Z %Y"``.
    The parsed datetime is then stamped as UTC (its tzinfo is replaced) before
    being converted to a POSIX timestamp. Player dicts are modified in place.
    """
    activated_format = "%a %b %d %X %Z %Y"

    for side in ('teamA', 'teamB'):
        for roster in data[side]:
            for member in roster:
                parsed = datetime.strptime(member["activatedAt"], activated_format)
                # Stamp the parsed value as UTC, then express it in local time
                stamped = parsed.replace(tzinfo=timezone.utc).astimezone(tz=None)
                member["activatedAtTimeStamp"] = int(stamped.timestamp())

In [24]:
def add_date_features(data):
    """Add the account-age features (mean, stddev, min per team) to ``data`` in place.

    The players' activation dates are converted to timestamps first, because
    the three getters read ``activatedAtTimeStamp``.
    """
    convert_activated_date_to_timestamp(data)

    age_getters = (
        get_mean_created_at_faceit,
        get_stddev_created_at_faceit,
        get_min_created_at_faceit,
    )
    for getter in age_getters:
        add_feature(data, getter)

## Previous Matches Features

In [25]:
def get_all_previous_matches(match):
    """Fetch from ``matches_coll`` every match listed in any player's ``previousMatches``.

    Returns a dict mapping each found document's ``_id`` to the document.
    """
    wanted_ids = set()
    for roster in (match["teamA"], match["teamB"]):
        for member in roster:
            wanted_ids.update(member["previousMatches"])

    cursor = matches_coll.find({"_id": {"$in": list(wanted_ids)}})

    found = {}
    for document in cursor:
        found[document["_id"]] = document
    return found

In [26]:
def get_mean_matches_on_map_prev(match, team, **kwargs):
    """Number of the team's previous matches played on the current map, divided by a fixed 5.

    ``kwargs['previous_matches']`` maps match id -> match document. A previous
    match is counted once for every player that lists it.
    """
    previous_matches = kwargs['previous_matches']
    map_played = match['mapPlayed']

    same_map_total = 0
    for member in match[team]:
        own_ids = member["previousMatches"]
        for match_id, previous in previous_matches.items():
            if match_id in own_ids and previous['mapPlayed'] == map_played:
                same_map_total += 1

    return same_map_total / 5

def get_mean_winrate_prev(match, team, **kwargs):
    """Sum of the players' win rates over their previous matches, divided by a fixed 5.

    ``kwargs['previous_matches']`` maps match id -> match document. Players
    with no previous match available contribute 0.5.
    """
    previous_matches = kwargs['previous_matches']

    winrate_sum = 0
    for member in match[team]:
        own_ids = member["previousMatches"]
        own_matches = [previous for match_id, previous in previous_matches.items()
                       if match_id in own_ids]

        if own_matches:
            wins = sum([get_player_won_the_match(previous, member["id"])
                        for previous in own_matches])
            winrate_sum += wins / len(own_matches)
        else:
            winrate_sum += 0.5

    return winrate_sum / 5


def get_mean_kd_prev(match, team, **kwargs):
    """Sum of the players' mean K/D over their previous matches, divided by a fixed 5.

    ``kwargs['previous_matches']`` maps match id -> match document with a
    ``teams`` list of rosters. Per previous match the K/D is kills / deaths
    (just kills when deaths is 0), or 1 when the player's entry has no
    ``playerStats``. Players without previous matches contribute 1.
    """
    previous_matches = kwargs['previous_matches']

    kd_sum = 0
    for member in match[team]:
        own_ids = member["previousMatches"]
        member_id = member["id"]

        ratios = []
        for match_id, previous in previous_matches.items():
            if match_id not in own_ids:
                continue
            # The player's own entry in that previous match
            earlier_self = [p for roster in previous['teams'] for p in roster
                            if p['id'] == member_id][0]
            if 'playerStats' not in earlier_self:
                ratios.append(1)
                continue
            stats = earlier_self['playerStats']
            kills, deaths = stats['kills'], stats['deaths']
            ratios.append((kills / deaths) if deaths != 0 else kills)

        kd_sum += sum(ratios) / len(ratios) if ratios else 1

    return kd_sum / 5


def get_mean_weighted_kd_by_elo_prev(match, team, **kwargs):
    """Sum of (mean previous-match K/D * current elo) per player, divided by a fixed 5.

    Parameters
    ----------
    match : mapping
        Current match; ``match[team]`` is the list of player dicts (``id``,
        ``elo``, ``previousMatches``).
    team : str
        Key of the roster inside ``match`` (e.g. ``"teamA"``).
    previous_matches : dict (keyword, required)
        Match id -> previous match document with a ``teams`` list of rosters.

    Returns
    -------
    float

    Notes
    -----
    Per previous match the K/D is kills / deaths (just kills when deaths is
    0), or 1 when the player's entry has no ``playerStats``. A player without
    any previous match is given the neutral K/D of 1, which is weighted by the
    player's elo like every other value. The earlier fallback added an
    unweighted ``1``, which put those players on a different scale than their
    team-mates.
    """
    kds = 0

    previous_matches = kwargs['previous_matches']

    for player in match[team]:
        player_prev_matches_ids = player["previousMatches"]
        player_prev_matches = [m for match_id, m in previous_matches.items() if match_id in player_prev_matches_ids]

        player_id = player["id"]
        player_elo = player["elo"]
        prev_match_kds = []
        for prev_match in player_prev_matches:
            # The player's own entry in that previous match
            player_prev = [p for roster in prev_match['teams'] for p in roster if p['id'] == player_id][0]
            if 'playerStats' not in player_prev:
                prev_match_kds.append(1)
                continue
            player_stats = player_prev['playerStats']
            kills = player_stats['kills']
            deaths = player_stats['deaths']
            kd_ratio = (kills / deaths) if deaths != 0 else kills
            prev_match_kds.append(kd_ratio)

        if prev_match_kds:
            kds += sum(prev_match_kds) * player_elo / len(prev_match_kds)
        else:
            # Neutral K/D (1) on the same elo-weighted scale
            kds += player_elo

    return kds / 5

def get_multikills_score_prev(match, team, **kwargs):
    """Return the team's mean multi-kill score per round over previous matches.

    A previous match scores ``(3k * 9 + 4k * 16 + 5k * 25) / rounds``; matches
    without ``'playerStats'`` or with zero rounds, and players without any
    previous match, count as ``AVERAGE_RMK``. The per-player means are summed
    and divided by 5.
    """
    history = kwargs['previous_matches']
    team_total = 0

    for member in match[team]:
        wanted_ids = member["previousMatches"]
        past_matches = [past for known_id, past in history.items() if known_id in wanted_ids]

        member_id = member["id"]
        scores = []
        for past in past_matches:
            # The player's own record inside the previous match.
            past_self = [p for side in past['teams'] for p in side if p['id'] == member_id][0]
            if 'playerStats' not in past_self:
                scores.append(AVERAGE_RMK)
                continue

            stats = past_self['playerStats']
            triples = stats['tripleKills']
            quadras = stats['quadraKills']
            pentas = stats['pentaKills']

            rounds_played = sum(get_team_rounds(past['score']))
            if rounds_played:
                scores.append((triples * 9 + quadras * 16 + pentas * 25) / rounds_played)
            else:
                scores.append(AVERAGE_RMK)

        if scores:
            team_total += sum(scores) / len(scores)
        else:
            team_total += AVERAGE_RMK

    return team_total / 5


def get_mean_rating_prev(match, team, **kwargs):
    """Return the team's mean player rating over the players' previous matches.

    The rating of one previous match combines kills, survived rounds,
    multi-kills, assists and MVPs per round, each normalised by its
    ``AVERAGE_*`` constant and weighted 1 / 0.7 / 1 / 0.5 / 0.3 (weights sum
    to 3.5). A neutral rating of 1 is used when the player has no
    ``'playerStats'`` in a match, when the match has zero rounds, or when the
    player has no previous matches at all.

    Args:
        match: Match record; ``match[team]`` is an iterable of player dicts
            holding ``"id"`` and ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'teams'`` and ``'score'``.

    Returns:
        The sum of the per-player mean ratings divided by 5.
    """
    neutral_rating = 1

    previous_matches = kwargs['previous_matches']

    all_ratings = 0
    for player in match[team]:
        player_prev_matches_ids = player["previousMatches"]
        player_prev_matches = [m for match_id, m in previous_matches.items() if match_id in player_prev_matches_ids]

        player_id = player["id"]
        prev_match_ratings = []
        for prev_match in player_prev_matches:
            player_prev = [p for prev_team in prev_match['teams'] for p in prev_team if p['id'] == player_id][0]
            if 'playerStats' not in player_prev:
                prev_match_ratings.append(neutral_rating)
                continue

            rounds = sum(get_team_rounds(prev_match['score']))
            if not rounds:
                # A match without played rounds would divide by zero below;
                # treat it like a match without statistics.
                prev_match_ratings.append(neutral_rating)
                continue

            player_stats = player_prev['playerStats']
            multi_kills_score = (player_stats['tripleKills'] * 9
                                 + player_stats['quadraKills'] * 16
                                 + player_stats['pentaKills'] * 25)

            kill_rating = player_stats['kills'] / rounds / AVERAGE_KPR
            survival_rating = (rounds - player_stats['deaths']) / rounds / AVERAGE_SPR
            multi_kills_rating = multi_kills_score / rounds / AVERAGE_RMK
            assists_rating = player_stats['assists'] / rounds / AVERAGE_APR
            mvps_rating = player_stats['mvps'] / rounds / AVERAGE_MVPPR
            prev_match_ratings.append((kill_rating + 0.7 * survival_rating
                                       + multi_kills_rating + 0.5 * assists_rating
                                       + 0.3 * mvps_rating) / 3.5)

        if prev_match_ratings:
            all_ratings += sum(prev_match_ratings) / len(prev_match_ratings)
        else:
            all_ratings += neutral_rating

    return all_ratings / 5

In [27]:
def get_mean_interval_time_prev(match, team, **kwargs):
    """Return the team's mean time gap between this match and the previous matches.

    For each player the gaps ``match['startTime'] - prev['startTime']`` are
    averaged (0 for a player without previous matches); the per-player means
    are summed and divided by 5.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']

    total_gap = 0
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        gaps = [
            current_start - past['startTime']
            for known_id, past in history.items()
            if known_id in wanted_ids
        ]
        if gaps:
            total_gap += sum(gaps) / len(gaps)

    return total_gap / 5

def get_mean_interval_time_oldest_prev(match, team, **kwargs):
    """Return the team's mean time gap to each player's oldest previous match.

    Per player the largest gap ``match['startTime'] - prev['startTime']`` is
    taken (0 without previous matches); the values are summed and divided by 5.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']

    def oldest_gap(member):
        # Largest distance in time between this match and the member's history.
        wanted_ids = member["previousMatches"]
        gaps = [current_start - past['startTime']
                for known_id, past in history.items() if known_id in wanted_ids]
        return max(gaps) if gaps else 0

    return sum(oldest_gap(member) for member in match[team]) / 5


def get_mean_interval_time_most_recent_prev(match, team, **kwargs):
    """Return the team's mean time gap to each player's most recent previous match.

    Per player the smallest gap ``match['startTime'] - prev['startTime']`` is
    taken (0 without previous matches); the values are summed and divided by 5.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']

    summed_gaps = 0
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        gaps = []
        for known_id, past in history.items():
            if known_id in wanted_ids:
                gaps.append(current_start - past['startTime'])

        summed_gaps += min(gaps) if gaps else 0

    return summed_gaps / 5

def get_max_interval_time_most_recent_prev(match, team, **kwargs):
    """Return the largest "time since most recent previous match" among the team's players.

    A player without previous matches contributes 0; an empty team yields 0.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']

    most_recent_gaps = []
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        gaps = [current_start - past['startTime']
                for known_id, past in history.items() if known_id in wanted_ids]
        most_recent_gaps.append(min(gaps) if gaps else 0)

    if not most_recent_gaps:
        return 0
    return max(most_recent_gaps)

In [28]:
def get_mean_delta_elo_prev(match, team, **kwargs):
    """Return the team's mean elo change relative to the players' previous matches.

    For each player the differences between the current elo and the elo the
    player had in each previous match are averaged (0 without previous
    matches); the per-player means are summed and divided by 5.
    """
    history = kwargs['previous_matches']

    per_player_means = []
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        past_matches = [past for known_id, past in history.items() if known_id in wanted_ids]

        member_id = member["id"]
        current_elo = member["elo"]
        elo_gains = [
            # Elo of the same player as recorded in the previous match.
            current_elo - [p for side in past['teams'] for p in side if p['id'] == member_id][0]["elo"]
            for past in past_matches
        ]
        per_player_means.append(sum(elo_gains) / len(elo_gains) if elo_gains else 0)

    return sum(per_player_means) / 5

def get_mean_dif_rounds_prev(match, team, **kwargs):
    """Return the team's mean round difference achieved in the players' previous matches.

    For each previous match the round difference is taken from the player's
    point of view (own team's rounds minus the opponent's). The differences
    are averaged per player (0 without previous matches), summed over the
    players and divided by 5.

    Args:
        match: Match record; ``match[team]`` is an iterable of player dicts
            holding ``"id"`` and ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'teamA'`` and ``'score'``.

    Returns:
        The sum of the per-player mean round differences divided by 5.
    """
    previous_matches = kwargs['previous_matches']

    dif_rounds = 0
    for player in match[team]:
        player_prev_matches_ids = player["previousMatches"]
        player_prev_matches = [m for match_id, m in previous_matches.items() if match_id in player_prev_matches_ids]

        player_id = player["id"]
        prev_dif_rounds = []
        for prev_match in player_prev_matches:
            # Compare the ids of the teamA members (``p``), not the outer
            # ``player``: the latter is always equal to ``player_id`` and
            # would place every player on team A.
            is_on_team_a = any(p['id'] == player_id for p in prev_match['teamA'])
            team_rounds = get_team_rounds(prev_match['score'])
            dif_team_a = team_rounds[0] - team_rounds[1]
            prev_dif_rounds.append(dif_team_a if is_on_team_a else -dif_team_a)

        dif_rounds += sum(prev_dif_rounds) / len(prev_dif_rounds) if prev_dif_rounds else 0

    return dif_rounds / 5

def get_mean_dif_elo_prev(match, team, **kwargs):
    """Return the team's mean elo advantage over the opponents of previous matches.

    For each previous match the player's elo in that match is compared with
    the mean elo of the opposing team. The differences are averaged per player
    (0 without previous matches), summed over the players and divided by 5.

    Args:
        match: Match record; ``match[team]`` is an iterable of player dicts
            holding ``"id"`` and ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'teamA'``, ``'teamB'`` and
            ``'teams'``.

    Returns:
        The sum of the per-player mean elo differences divided by 5.
    """
    previous_matches = kwargs['previous_matches']

    dif_elo = 0
    for player in match[team]:
        player_prev_matches_ids = player["previousMatches"]
        player_prev_matches = [m for match_id, m in previous_matches.items() if match_id in player_prev_matches_ids]

        player_id = player["id"]
        player_dif_elo = []
        for prev_match in player_prev_matches:
            # Compare the ids of the teamA members (``p``), not the outer
            # ``player``: the latter is always equal to ``player_id`` and
            # would make every player face their own team when on team B.
            is_on_team_a = any(p['id'] == player_id for p in prev_match['teamA'])
            player_elo = [p for prev_team in prev_match['teams']
                          for p in prev_team if p['id'] == player_id][0]['elo']

            opposing_team = prev_match['teamB'] if is_on_team_a else prev_match['teamA']
            elos_opposing_team = [p['elo'] for p in opposing_team]

            mean_elo_opposing_team = sum(elos_opposing_team) / len(elos_opposing_team)
            player_dif_elo.append(player_elo - mean_elo_opposing_team)

        dif_elo += sum(player_dif_elo) / len(player_dif_elo) if player_dif_elo else 0

    return dif_elo / 5

def get_mean_matches_afk(match, team, **kwargs):
    """Return the team's mean share of previous matches without player statistics.

    A previous match counts as "afk" for a player when the player's record in
    that match has no ``'playerStats'`` key. Per player the afk share is
    ``afk matches / previous matches`` (0 without previous matches); the shares
    are summed and divided by 5.
    """
    history = kwargs['previous_matches']

    afk_share_total = 0
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        past_matches = [past for known_id, past in history.items() if known_id in wanted_ids]

        member_id = member["id"]
        afk_count = sum(
            1
            for past in past_matches
            if 'playerStats' not in [p for side in past['teams'] for p in side if p['id'] == member_id][0]
        )

        if past_matches:
            afk_share_total += afk_count / len(past_matches)

    return afk_share_total / 5

def get_num_played_togthr_prev(match, team, **kwargs):
    """Return the share of the team's previous-match appearances played alongside a teammate.

    Every (player, previous match) pair of the team is one appearance. An
    appearance counts as "played together" when at least two players of the
    current team were on the same side (``teamA`` or ``teamB``) of that
    previous match.

    Args:
        match: Match record; ``match[team]`` is an iterable of player dicts
            holding ``"id"`` and ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'teamA'`` and ``'teamB'``.

    Returns:
        ``appearances together / all appearances``, or 0 when no player of the
        team has any previous match.
    """
    previous_matches = kwargs['previous_matches']

    # previous match id -> ids of the current team's players that took part in it
    players_in_match = defaultdict(list)
    for player in match[team]:
        for prev_match_id in player["previousMatches"]:
            players_in_match[prev_match_id].append(player["id"])

    num_matches = sum(len(player_ids) for player_ids in players_in_match.values())
    if not num_matches:
        # No history at all: avoid dividing by zero.
        return 0

    all_played_together = 0
    for match_id, player_ids in players_in_match.items():
        # Only previous matches shared by two or more players can qualify.
        if len(player_ids) < 2:
            continue
        prev_match = previous_matches[match_id]

        for side in ("teamA", "teamB"):
            side_ids = {p['id'] for p in prev_match[side]}
            players_on_side = [p for p in player_ids if p in side_ids]
            if len(players_on_side) > 1:
                all_played_together += len(players_on_side)

    return all_played_together / num_matches

def get_winrate_togthr_prev(match, team, **kwargs):
    """Return the win balance of previous matches in which teammates played on the same side.

    For every previous match where two or more players of the current team
    were on the same side, each of those players adds ``+1`` when
    ``get_player_won_the_match`` returns 1 for that side and ``-1`` otherwise.
    The sum is divided by the number of such appearances, so the result lies
    in ``[-1, 1]``.

    Args:
        match: Match record; ``match[team]`` is an iterable of player dicts
            holding ``"id"`` and ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'teamA'`` and ``'teamB'``.

    Returns:
        The win balance, or 0 when the players never played together.
    """
    previous_matches = kwargs['previous_matches']

    # previous match id -> ids of the current team's players that took part in it
    players_in_match = defaultdict(list)
    for player in match[team]:
        for prev_match_id in player["previousMatches"]:
            players_in_match[prev_match_id].append(player["id"])

    wins_together, num_matches_together = 0, 0
    for match_id, player_ids in players_in_match.items():
        # Only previous matches shared by two or more players can qualify.
        if len(player_ids) < 2:
            continue
        prev_match = previous_matches[match_id]

        for side in ("teamA", "teamB"):
            side_ids = {p['id'] for p in prev_match[side]}
            players_on_side = [p for p in player_ids if p in side_ids]
            if len(players_on_side) < 2:
                continue
            # All players of one side share the result, so checking one is enough.
            won_match = get_player_won_the_match(prev_match, players_on_side[0])
            won_multiplier = 1 if won_match == 1 else -1
            wins_together += won_multiplier * len(players_on_side)
            num_matches_together += len(players_on_side)

    return wins_together / num_matches_together if num_matches_together else 0

In [29]:
# Matches less than 7 hours apart are treated as played on the same day.
# Expressed in seconds (assumes `startTime` is an epoch timestamp in seconds).
on_day_time = 7 * 3600

def get_mean_first_matches_on_day(match, team, **kwargs):
    """Return how many players of the team are playing their first match of the day.

    A player is playing the first match of the day when the most recent
    previous match started more than ``on_day_time`` seconds before this
    match. A player without any previous match is counted as well, since no
    match is known inside the window.

    Note that, despite the name, the result is a count and is not divided by
    the team size.

    Args:
        match: Match record with ``'startTime'``; ``match[team]`` is an
            iterable of player dicts holding ``"previousMatches"``.
        team: Key of the team inside ``match``.
        **kwargs: Must contain ``previous_matches``, a dict mapping a match id
            to a previous match record with ``'startTime'``.

    Returns:
        The number of players playing their first match of the day.
    """
    previous_matches = kwargs['previous_matches']
    start_time = match['startTime']

    first_matches = 0
    for player in match[team]:
        player_prev_matches_ids = player["previousMatches"]
        intervals = [start_time - prev_match['startTime']
                     for match_id, prev_match in previous_matches.items()
                     if match_id in player_prev_matches_ids]

        # ``min`` of an empty list raises ValueError, so check for history first.
        if not intervals or min(intervals) > on_day_time:
            first_matches += 1

    return first_matches


def get_mean_matches_on_day(match, team, **kwargs):
    """Return the team's mean number of previous matches played on the same day.

    A previous match counts when it started less than ``on_day_time`` seconds
    before this match. The per-player counts are summed and divided by 5.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']

    same_day_total = 0
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        gaps = [current_start - past['startTime']
                for known_id, past in history.items() if known_id in wanted_ids]
        same_day_total += sum(1 for gap in gaps if gap < on_day_time)

    return same_day_total / 5


def get_mean_played_map_on_day(match, team, **kwargs):
    """Return the team's mean number of same-day previous matches on this match's map.

    A previous match counts when it was played on ``match['mapPlayed']`` and
    started less than ``on_day_time`` seconds before this match. The per-player
    counts are summed and divided by 5.
    """
    history = kwargs['previous_matches']
    current_start = match['startTime']
    current_map = match['mapPlayed']

    same_day_total = 0
    for member in match[team]:
        wanted_ids = member["previousMatches"]
        past_matches = [past for known_id, past in history.items() if known_id in wanted_ids]

        gaps_on_map = [current_start - past['startTime']
                       for past in past_matches if past['mapPlayed'] == current_map]
        same_day_total += sum(1 for gap in gaps_on_map if gap < on_day_time)

    return same_day_total / 5

In [30]:
def add_previous_matches_features(match):
    """Add every previous-matches feature to ``match`` and return it.

    The previous matches are fetched once with ``get_all_previous_matches``
    and handed to each feature function through ``add_feature``.
    """
    previous_matches = get_all_previous_matches(match)

    # Disabled features: get_mean_delta_rounds_predictor_prev, get_mean_delta_rating_prev
    feature_functions = (
        # performance in previous matches
        get_mean_matches_on_map_prev,
        get_mean_winrate_prev,
        get_mean_kd_prev,
        get_mean_weighted_kd_by_elo_prev,
        get_multikills_score_prev,
        get_mean_rating_prev,
        # time between matches
        get_mean_interval_time_prev,
        get_mean_interval_time_oldest_prev,
        get_mean_interval_time_most_recent_prev,
        get_max_interval_time_most_recent_prev,
        # elo and round differences
        get_mean_delta_elo_prev,
        get_mean_dif_rounds_prev,
        get_mean_dif_elo_prev,
        # matches without statistics
        get_mean_matches_afk,
        # matches played with the current teammates
        get_num_played_togthr_prev,
        get_winrate_togthr_prev,
        # matches played on the same day
        get_mean_first_matches_on_day,
        get_mean_matches_on_day,
        get_mean_played_map_on_day,
    )
    for feature_function in feature_functions:
        add_feature(match, feature_function, previous_matches=previous_matches)

    return match

## All Features

In [73]:
def add_all_features(data_matches):
    """Run all feature steps on ``data_matches`` and return the featurized frame.

    The match, lifetime and date steps are called for their effect on
    ``data_matches`` (their return values are ignored); the previous-matches
    features are then added row by row.
    """
    for add_features in (add_match_features, add_lifetime_features, add_date_features):
        add_features(data_matches)

    return data_matches.apply(add_previous_matches_features, axis=1)

In [69]:
def select_features(data):
    """Return ``data`` restricted to the identifier, target and model feature columns.

    The fixed columns come first, followed by every column whose name starts
    with ``"dif_"`` in the order they appear in ``data``.
    """
    fixed_columns = ["_id", "winner", "match_mean_elo", "binned_match_elo", "5v5_free_queue", "5v5_premium_queue"]
    dif_columns = [name for name in data.columns if name.startswith("dif_")]
    return data[fixed_columns + dif_columns]

In [70]:
def create_features(data_matches):
    """Featurize ``data_matches`` and keep only the selected feature columns."""
    return select_features(add_all_features(data_matches))

In [71]:
# Featurize the `dataset` frame loaded in the "Load Data" section
# (built from `all_files[:1]`, i.e. only the first JSON chunk).
data_featurized = create_features(dataset)

In [58]:
# Featurize every JSON chunk and store each result as its own pickle batch.
# `dataset` and `data_featurized` are rebound on every iteration, so after
# the loop they refer to the last file processed.
# NOTE(review): `all_files` comes from `glob`, which does not guarantee an
# ordering, so the batch number assigned to a file may differ between runs.
# NOTE(review): assumes the `data/dataset_featurized/` directory already
# exists - confirm, `to_pickle` is not expected to create it.
for index, filename in enumerate(all_files):
    dataset = pd.read_json(filename)
    data_featurized = create_features(dataset)
    data_featurized.to_pickle(f'data/dataset_featurized/batch_{index+1}.pkl')

# Rating Predictor

In [None]:
# Fetch at most 10000 match documents, projecting only the team rosters and
# the score (`_id` is excluded), and materialise them as a DataFrame.
matches_sample = matches_coll.find({}, {"_id":0, 'teamA':1, 'teamB':1, 'score':1, 'teams':1}, batch_size=10000).limit(10000)
matches_sample_df = pd.DataFrame(list(matches_sample))

In [None]:
# Accumulates one (rating, dif_elo, dif_rounds) tuple per rated player.
# It is filled as a side effect of `get_ratings_elos_rounds`.
ratings_data = []


def _append_team_ratings(match, team, opposing_team, dif_rounds):
    """Append a (rating, dif_elo, dif_rounds) tuple for each rated player of ``match[team]``."""
    for player in match[team]:
        # Players without statistics cannot be rated; ``.get`` also covers
        # records where the 'playerStats' key is missing altogether.
        if player.get('playerStats') is None:
            continue
        player_elo = player['elo']
        opposing_team_mean_elo = mean(opponent['elo'] for opponent in match[opposing_team])
        rating = compute_rating(match, player['id'])
        dif_elo = player_elo - opposing_team_mean_elo
        ratings_data.append((rating, dif_elo, dif_rounds))


def get_ratings_elos_rounds(match):
    """Collect the rating samples of one match into the global ``ratings_data``.

    For every player with statistics a tuple ``(rating, dif_elo, dif_rounds)``
    is appended, where ``dif_elo`` is the player's elo minus the mean elo of
    the opposing team and ``dif_rounds`` is the round difference from the
    point of view of the player's team.

    Args:
        match: Match record with ``'score'``, ``'teamA'`` and ``'teamB'``.

    Returns:
        None. The samples are appended to ``ratings_data``.
    """
    team_rounds = get_team_rounds(match['score'])
    dif_rounds = team_rounds[0] - team_rounds[1]

    _append_team_ratings(match, 'teamA', 'teamB', dif_rounds)
    # Team B sees the same round difference with the opposite sign.
    _append_team_ratings(match, 'teamB', 'teamA', -dif_rounds)

In [None]:
# `apply` is used only for its side effect: every row appends its samples to
# the global `ratings_data` (the trailing `;` hides the returned Series).
# Re-running this cell without re-running the cell that resets
# `ratings_data = []` appends the same samples again.
matches_sample_df.apply(get_ratings_elos_rounds, axis=1);

In [None]:
# One row per rated player: the rating plus the two candidate predictors.
rating_df = pd.DataFrame(ratings_data, columns=["rating", "dif_elo", "dif_rounds"])
rating_df.head()

In [None]:
# Hold out 30% of the samples for evaluation; `rating` is the target and the
# remaining columns (`dif_elo`, `dif_rounds`) are the predictors.
# The fixed `random_state` keeps the split reproducible.
# NOTE(review): `train_test_split` is not imported in the visible import cell -
# confirm it is imported elsewhere in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    rating_df.drop(columns=["rating"]),
    rating_df["rating"],
    test_size=0.3,
    random_state=42)

In [None]:
# Fit a linear model predicting a player's rating from `dif_elo` and `dif_rounds`.
# NOTE(review): presumably scikit-learn's `LinearRegression`, whose `score` is
# the R^2 on the given data - confirm the import, it is not visible here.
rating_predictor = LinearRegression()
rating_predictor.fit(X_train,y_train)

print('Test score', rating_predictor.score(X_test, y_test))

### Store model

In [None]:
# Build a file-name-safe timestamp: spaces, colons, dashes and dots of the
# default datetime representation are all turned into underscores.
date = str(datetime.now()).translate(str.maketrans(" :-.", "____"))
model_name = "_".join(("model", "rating_predictor", date))
# Persist the fitted model next to the notebook as `<model_name>.pkl`.
joblib.dump(rating_predictor, model_name + ".pkl")

### Load stored model

In [None]:
# Load a previously stored rating predictor. The file name is hardcoded to one
# specific dump; update it to the name produced by the "Store model" cell.
# NOTE(review): joblib files are pickle-based - only load files from a trusted source.
rating_predictor = joblib.load("model_rating_predictor_2020_03_05_07_23_44_731552.pkl")