<div style="text-align:center; display:flex; justify-content:center; margin:16px 0px">
    <span style="color:#ff5500; font-family:Play; font-size:3em; margin:auto 32px">Part I<br />Build Dataset</span>
</div>

---

This document is a part of the FACEIT Predictor Data Science Workflow.

In this notebook, the collected data (stored in the local MongoDB database) is preprocessed, and the new fields/collections are stored directly in the same database. These steps therefore only need to be executed once, which makes the feature engineering phase much faster.

# Imports

In [None]:
import copy
import json
import os
from collections import defaultdict

from pymongo import MongoClient, DESCENDING
from pymongo.errors import PyMongoError
from tqdm import tqdm

# local modules
from config import read_config

# Database Connection

In [None]:
# Connection settings read from the "local.ingestorDB" config entry;
# they are unpacked as keyword arguments into MongoClient in the next cell.
db_cfg = read_config("local.ingestorDB")

In [None]:
# Open the MongoDB client and select the 'faceit_imported' database
client = MongoClient(**db_cfg)
db = client['faceit_imported']

In [None]:
# Connect to the collections inside the ingestor database
players_coll = db['player']
matches_coll = db['match']
lifetime_stats_coll = db['player_lifetime_stats']

# Create Lifetime Stats

In [None]:
def subtract_scoreboard_stats(player_lifetime, player_scoreboard):
    """Subtract one match scoreboard from a lifetime stats dict, in place.

    Every key of ``player_scoreboard`` must already exist in
    ``player_lifetime``; a missing key raises ``KeyError``. Keys that only
    exist in ``player_lifetime`` are left untouched.
    """
    for stat_name, stat_value in player_scoreboard.items():
        player_lifetime[stat_name] -= stat_value


def get_won_the_match(player_id, match, team_rounds):
    """Return 1 if the player's team won more rounds than the other team, else 0.

    ``team_rounds`` holds the rounds of team A at index 0 and of the other
    team at index 1. A player that is not listed in ``match['teamA']`` is
    treated as belonging to the other team. A draw counts as not won.
    """
    team_a_ids = [member['id'] for member in match['teamA']]
    own_side = 0 if player_id in team_a_ids else 1
    own_rounds = team_rounds[own_side]
    rival_rounds = team_rounds[1 - own_side]
    return 1 if own_rounds > rival_rounds else 0


def update_stats(next_lifetime_stats, player, match):
    """Roll a player's lifetime map stats back to their state before ``match``.

    Works on a deep copy of ``next_lifetime_stats`` (the stats as they were
    right after ``match``), so the input and its nested per-map dicts are
    never modified. On the copy, the entry of the played map gets one match,
    the match's total rounds, the win (if any) and the player's scoreboard
    stats subtracted.

    Args:
        next_lifetime_stats: dict of per-map stats, keyed by map name; must
            contain ``match['mapPlayed']``.
        player: player document; only ``_id`` is read.
        match: match document; ``score`` is an "<int>/<int>" string whose
            first number is treated as team A's rounds.

    Returns:
        The rolled-back stats dict, or ``None`` when the player's
        ``playerStats`` for this match are missing or empty.
    """
    player_id = player["_id"]
    score = match["score"]
    map_played = match['mapPlayed']

    # A real deep copy: a shallow copy would share the nested per-map dicts
    # with the caller, and the in-place subtractions below would corrupt them.
    new_lifetime_stats = copy.deepcopy(next_lifetime_stats)
    map_stats = new_lifetime_stats[map_played]
    map_stats["matches"] -= 1

    team_rounds = [int(r) for r in score.split("/")]
    map_stats['rounds'] -= sum(team_rounds)
    map_stats['wins'] -= get_won_the_match(player_id, match, team_rounds)

    # Scoreboard of this player in the match (IndexError if the player is absent)
    player_stats_on_match = [
        member for team in match['teams'] for member in team
        if member["id"] == player_id][0]["playerStats"]

    if not player_stats_on_match:
        return None

    subtract_scoreboard_stats(map_stats, player_stats_on_match)

    return new_lifetime_stats


def get_next_lifetime_stats(player, match_id):
    """Return the map stats the player had right after ``match_id``.

    The player's match history is ordered newest first, so the match played
    right after ``match_id`` sits one position before it. When ``match_id``
    is the newest entry (or is not in the history at all) the player's
    current ``mapStats`` are returned; otherwise the stats stored for the
    following match are loaded from the lifetime stats collection.

    Raises:
        PyMongoError: if no lifetime stats are stored for the following match.
    """
    player_id = player["_id"]
    history = player["matchHistory"]

    # Position of the match in the history; 0 also stands in for "not found"
    position = next(
        (i for i, entry in enumerate(history) if entry["id"] == match_id), 0)

    if position == 0:
        # Most recent match: the current lifetime stats are the "next" state
        return player["mapStats"]

    following_match = history[position - 1]
    stored_stats = lifetime_stats_coll.find_one(
        {"playerId": player_id,
         "matchId": following_match["id"]},
        {"_id": 0, "mapStats": 1})

    if not stored_stats:
        raise PyMongoError("No previous lifetime stats")

    return stored_stats["mapStats"]


def compute_new_lifetime_stats(player, match):
    """Build the lifetime stats document of ``player`` as of just before ``match``.

    Args:
        player: player document (``_id``, ``matchHistory``,
            ``updatedAtIngestor`` and, for the newest match, ``mapStats``).
        match: match document (``_id``, ``startTime``, ``mapPlayed``, ...).

    Returns:
        A dict with ``matchId``, ``playerId``, ``startTime`` and ``mapStats``,
        or ``None`` when the stats cannot be computed: the match started after
        the player was last updated by the ingestor, the match is not in the
        player's history, the stats of the following match are unavailable,
        the played map has no stats entry, or the player's scoreboard for the
        match is missing.
    """
    player_id = player["_id"]
    match_id = match["_id"]
    match_start_time = match["startTime"]

    # return if match was played after player processing time
    if match_start_time > player["updatedAtIngestor"]:
        return None
    if not any(entry["id"] == match_id for entry in player["matchHistory"]):
        return None

    try:
        next_lifetime_stats = get_next_lifetime_stats(player, match_id)
    except Exception:
        # Best effort: skip players whose following stats cannot be loaded.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit through, so the long-running loop can still be stopped.
        return None

    if match["mapPlayed"] not in next_lifetime_stats:
        return None

    map_stats = update_stats(next_lifetime_stats, player, match)
    if not map_stats:
        return None

    return {
        "matchId": match_id,
        "playerId": player_id,
        "startTime": match_start_time,
        "mapStats": map_stats,
    }


def create_all_lifetime_stats(match):
    """Compute and store the lifetime stats of every player of ``match``.

    Players that already have a lifetime stats document for this match are
    skipped. Players whose stats cannot be computed are silently left out.
    Nothing is inserted when there is nothing new to store.
    """
    match_player_ids = {member['id'] for team in match['teams'] for member in team}

    # Players whose lifetime stats were already stored for this match
    already_stored = lifetime_stats_coll.distinct("playerId", {
        "matchId": match['_id'],
        "playerId": {"$in": list(match_player_ids)}})

    pending_ids = match_player_ids.difference(already_stored)
    if not pending_ids:
        return

    new_documents = []
    for player in players_coll.find({"_id": {"$in": list(pending_ids)}}):
        stats = compute_new_lifetime_stats(player, match)
        # Drop players for which no stats could be computed
        if stats:
            new_documents.append(stats)

    if new_documents:
        lifetime_stats_coll.insert_many(new_documents)


In [None]:
# Newest matches first: the lifetime stats of a match are derived from the
# stats already stored for the player's next (more recent) match.
matches_cursor = matches_coll.find({}).sort("startTime", DESCENDING)

In [None]:
# Walk through every match and store the lifetime stats of its players.
# The cursor is consumed by this loop.
total_matches = matches_coll.estimated_document_count()
for match_doc in tqdm(matches_cursor, total=total_matches):
    create_all_lifetime_stats(match_doc)

# Get Processable Matches

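Check which matches have all the required information available. Two counters are kept per match, each counting the players that satisfy a criterion:
* `match_history`: the match id is in the player's match history, and that player's previous 10 matches are in the DB
* `lifetime_stats`: the player's lifetime stats for the match are present in the DB

A match is processable when both counters reach 10, i.e. when the criteria hold for all of its players.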

In [None]:
# Per-match counters, created on first access: number of players that
# satisfy each readiness criterion for the match.
matches_ready = defaultdict(lambda: {"match_history":0, "lifetime_stats":0})

In [None]:
# Only "matchHistory" is read from each player by the readiness check,
# so project it instead of transferring the full player documents.
all_players = players_coll.find({}, {"_id": 0, "matchHistory": 1})

In [None]:
for player_doc in tqdm(all_players, total=players_coll.estimated_document_count()):
    # History in chronological order (oldest match first)
    history = sorted(player_doc["matchHistory"], key=lambda entry: entry["startTime"])
    history_ids = [entry["id"] for entry in history]

    # Which of the player's matches are actually stored in the DB
    stored_ids = set(matches_coll.distinct("_id", {"_id": {"$in": history_ids}}))

    # Number of upcoming matches that still have a missing match among
    # their 10 predecessors
    cooldown = 0
    for position, current_id in enumerate(history_ids):
        if current_id not in stored_ids:
            cooldown = 10
        elif cooldown > 0:
            cooldown -= 1
        elif position > 9:
            # At least 10 earlier matches exist and all of them are stored
            matches_ready[current_id]["match_history"] += 1

In [None]:
# Only "matchId" is read from each document by the counting loop,
# so project it instead of transferring the full per-map stats.
all_lifetime_stats = lifetime_stats_coll.find({}, {"_id": 0, "matchId": 1})

In [None]:
# Count, per match, how many lifetime stats documents are stored for it
stats_total = lifetime_stats_coll.estimated_document_count()
for stats_doc in tqdm(all_lifetime_stats, total=stats_total):
    matches_ready[stats_doc["matchId"]]["lifetime_stats"] += 1

In [None]:
# Keep the matches for which both counters reached 10
# (presumably one per player of a 10-player match)
processable_match_ids = []
for m_id, readiness in matches_ready.items():
    if readiness["match_history"] == 10 and readiness["lifetime_stats"] == 10:
        processable_match_ids.append(m_id)

# Build Dataset

In [None]:
# BATCH processing
# for each match
    # for each player
        # get lifetime stats and player stats
        # concat lifetime, player and match stats
        # store the data in batches

In [None]:
# Fields copied from each player's document onto the player's entry in the match
players_data_fields = ['activatedAt', 'steamCreatedAt', 'updatedAt', 'csgoId','verified']

In [None]:
# Fetch only the matches that passed both readiness checks
matches_to_process =  matches_coll.find({"_id":{"$in":processable_match_ids}})

In [None]:
matches_processed = []  # matches of the batch currently being filled
players_data = {}       # cache of player documents, keyed by player id
batch_size = 3000       # number of matches per output file
batch_number = 0        # number of the last batch file written
dataset_dir = 'data/dataset'

# Make sure the output folder exists before the first batch is written
os.makedirs(dataset_dir, exist_ok=True)


def _write_batch(matches, number):
    """Dump one batch of processed matches to ``batch_<number>.json``."""
    with open(f'{dataset_dir}/batch_{number}.json', 'w') as fp:
        # default=str stringifies values json cannot encode natively
        json.dump(matches, fp, default=str)


for match in tqdm(matches_to_process, total=len(processable_match_ids)):
    # Get all ids of the players in the match
    players_ids = {player['id'] for team in match['teams'] for player in team}

    # Lifetime stats of every player for this match, keyed by player id
    lifetime_stats = lifetime_stats_coll.find({
        "matchId": match['_id'],
        "playerId": {"$in": list(players_ids)}})
    lifetime_data = {lt["playerId"]: lt for lt in lifetime_stats}

    for team in ["teamA", "teamB"]:
        for player in match[team]:
            player_id = player["id"]

            # check if player already in data, if not retrieve from DB and store
            player_data = players_data.get(player_id, None)
            if not player_data:
                player_data = players_coll.find_one({"_id": player_id})
                players_data[player_data["_id"]] = player_data

            for player_field in players_data_fields:
                player[player_field] = player_data.get(player_field, None)

            # NOTE(review): this stores the whole lifetime stats document, not
            # only its "mapStats" field - confirm downstream code expects that.
            player["mapStats"] = lifetime_data[player_id]

            # The slice after the current match holds the 10 matches that
            # follow it in the stored history (assumed to be ordered newest
            # first, i.e. the 10 previously played matches).
            match_history_ids = [m['id'] for m in player_data["matchHistory"]]
            match_index = match_history_ids.index(match['_id'])
            player["previousMatches"] = match_history_ids[match_index + 1:match_index + 1 + 10]

    match.pop("teams")
    matches_processed.append(match)

    # Flush as soon as the batch is full, so every batch holds exactly
    # `batch_size` matches (except possibly the last one).
    if len(matches_processed) >= batch_size:
        batch_number += 1
        _write_batch(matches_processed, batch_number)
        matches_processed.clear()

# Write the remaining matches, if any; nothing is written for an empty remainder
if matches_processed:
    batch_number += 1
    _write_batch(matches_processed, batch_number)

# Dataset metadata