In [88]:
import os
import re
from datetime import datetime, timedelta
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU 
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop, Adagrad
from scikeras.wrappers import KerasClassifier
from keras.callbacks import CSVLogger, EarlyStopping, ModelCheckpoint

# sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# other
# import tqdm notebook
from tqdm.notebook import tqdm

import multiprocessing as mp

#### Matplotlib settings
%matplotlib inline
import matplotlib as mpl

# Default rcParams so that font size, weight and style do not have to be set
# on every figure.
# Figure and axes titles: bold, size 20
mpl.rcParams['figure.titleweight'] = 'bold'
mpl.rcParams['figure.titlesize'] = 20
mpl.rcParams['axes.titleweight'] = 'bold'
mpl.rcParams['axes.titlesize'] = 20
# Axis labels (x/y label text): bold, size 16
mpl.rcParams['axes.labelweight'] = 'bold'
mpl.rcParams['axes.labelsize'] = 16
# Default figure size (width, height) passed to matplotlib
mpl.rcParams['figure.figsize'] = (18, 8)
# Draw a grid on every axes by default
mpl.rcParams['axes.grid'] = True
# Dashed grid lines
mpl.rcParams['grid.linestyle'] = '--'
# Tick labels on both axes: size 14
mpl.rcParams['xtick.labelsize'] = 14
mpl.rcParams['ytick.labelsize'] = 14

# Race names and the matplotlib colour name associated with each race
RACE_LIST = [
    'Protoss',
    'Terran',
    'Zerg'
    ]
COLOR_DICT = {
    'Protoss': 'goldenrod',
    'Terran': 'firebrick',
    'Zerg': 'darkviolet'
    }

#### Pandas options
# Raise the display limits to 200 rows / 200 columns so that the wide
# dataframes in this notebook can be inspected without truncation.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

#IPython
from IPython.display import display
%load_ext autoreload


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
class BatchGenerator(keras.utils.Sequence):
    """
    Keras ``Sequence`` that yields batches of fixed-length windows cut from
    per-replay ``.npy`` files.

    Each sample is rows ``[window_start, window_start + window_size)`` of
    ``<data_dir>/<filehash>.npy`` paired with ``labels[filehash]``.
    """

    def __init__(
        self, 
        list_filehashes, 
        labels,
        data_dir,
        window_size,
        n_channels,
        window_start=0,
        batch_size=32,  
        n_classes=2, 
        shuffle=True
    ):
        """
        Store the configuration and build the initial sample order.

        Args:
            list_filehashes (list): the filehashes (file names without the
                '.npy' extension) this generator draws from
            labels (dict): maps each filehash to its integer label
            data_dir (str): the directory containing the '<filehash>.npy' files
            window_size (int): the number of rows (timesteps) in each window
            n_channels (int): the number of feature columns in each file
            window_start (int): the row at which the window starts. Defaults to 0.
            batch_size (int): the number of samples per batch. Defaults to 32.
            n_classes (int): the number of target classes. Stored, but not
                used by this class. Defaults to 2.
            shuffle (bool): whether to reshuffle the sample order at the start
                and after every epoch. Defaults to True.
        """
        # Initialise the Keras base class; newer Keras versions warn when a
        # Sequence subclass skips this call.
        super().__init__()

        self.data_dir = data_dir
        self.labels = labels
        self.batch_size = batch_size
        self.window_size = window_size
        self.window_start = window_start
        self.n_channels = n_channels
        self.list_IDs = list_filehashes
        self.n_classes = n_classes
        self.shuffle = shuffle
        # builds self.indexes (and shuffles it when requested)
        self.on_epoch_end()

    def __len__(self):
        'Number of full batches per epoch; a trailing partial batch is dropped'
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        'Generate the batch at position `index` as a tuple (X, y)'
        # positions (into list_IDs) of the samples that make up this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # translate positions into filehashes
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        return self.__data_generation(list_IDs_temp)

    def on_epoch_end(self):
        'Reset the sample order, reshuffling it when shuffle is enabled'
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """
        Load the window and label for every filehash in `list_IDs_temp`.

        Returns:
            tuple: X with shape (n_samples, window_size, n_channels) and
                y with shape (n_samples,), where n_samples is
                len(list_IDs_temp)

        Raises:
            ValueError: if a file does not provide a full
                (window_size, n_channels) window at window_start
        """
        # size the buffers from the IDs actually received so that no
        # uninitialised rows can leak out if fewer than batch_size are passed
        n_samples = len(list_IDs_temp)
        window_end = self.window_start + self.window_size
        expected_shape = (self.window_size, self.n_channels)

        X = np.empty((n_samples, self.window_size, self.n_channels))
        y = np.empty((n_samples), dtype=int)

        for i, ID in enumerate(list_IDs_temp):
            # load the whole replay array, then cut it down to the window
            X_tmp = np.load(os.path.join(self.data_dir, ID + '.npy'))
            window = X_tmp[self.window_start:window_end, :]

            # fail with an explicit message instead of an opaque broadcast
            # error (or a silent broadcast of a single row/column)
            if window.shape != expected_shape:
                raise ValueError(
                    f'{ID}: expected a window of shape {expected_shape} '
                    f'starting at row {self.window_start}, got {window.shape} '
                    f'from an array of shape {X_tmp.shape}'
                )

            X[i] = window

            # Store class
            y[i] = self.labels[ID]

        return X, y

In [37]:
def setup_data_for_modeling(
    metadata_path='data/spawningtool_replays.csv',
    master_columns_path='info/clean_master_columns_list.csv',
    non_feature_columns=['winner', 'filehash', 'frame'],
    partition_index=None,
    window_size=60,
    window_start=0,
    verbose=True,
    model_data_dir='data/model_data'
):
    """
    Setup the data for modeling.
    
    Args:
        metadata_path (str): the path to the metadata csv. Defaults to 'data/spawningtool_replays.csv'.
        master_columns_path (str): the path to the master_columns_list csv. Defaults to 'info/clean_master_columns_list.csv'.
        non_feature_columns (list): the columns of the master list that are
            not features. Defaults to ['winner', 'filehash', 'frame'].
            The list is only read, never modified.
        partition_index (pandas.DataFrame): optional frame indexed by
            filehash with a 'shape' column (row count of the file) and a
            'partition' column (partition name). When given, it is used
            instead of scanning the files on disk. Defaults to None.
        window_size (int): the window size to use. Defaults to 60.
        window_start (int): the row at which the window starts. Defaults to 0.
        verbose (bool): whether to print a per-folder summary. Only the
            directory scan (partition_index=None) prints. Defaults to True.
        model_data_dir (str): the directory holding the 'train', 'val' and
            'test' folders; only used when partition_index is None.
            Defaults to 'data/model_data'.

    Returns tuple:
        pandas.DataFrame: the dataframe containing the metadata
        list: the list of all columns
        list: the list of feature columns
        dict: partition name -> list of filehashes with more than
            window_size + window_start rows
        dict: filehash -> label (0 when game_winner == 2, 1 for any other
            non-zero game_winner; rows with game_winner == 0 are left out)
    """
    # get spawningtool_df
    spawningtool_df = pd.read_csv(metadata_path)

    # get master_columns_list: the csv is read as a table and the first
    # column of every row is the column name
    master_columns_list = [
        row[0] for row in pd.read_csv(master_columns_path).values.tolist()
    ]

    # honour the caller's non_feature_columns (previously this argument was
    # overwritten with the default list and therefore ignored)
    if non_feature_columns is None:
        non_feature_columns = ['winner', 'filehash', 'frame']
    excluded_columns = set(non_feature_columns)
    feature_columns = [
        col for col in master_columns_list if col not in excluded_columns
    ]

    # a file is kept only when it has strictly more rows than this, so a
    # file with exactly window_size + window_start rows is excluded
    min_rows = window_size + window_start

    partition = {}

    if partition_index is None:
        # scan the partition folders and keep the hashes of the files that
        # are long enough for the requested window
        for folder in ['train', 'val', 'test']:
            folder_path = os.path.join(model_data_dir, folder)

            # only .npy files are data files; sorted for a reproducible order
            filehashes = sorted(
                os.path.splitext(f)[0]
                for f in os.listdir(folder_path)
                if f.endswith('.npy')
            )

            partition_list = []
            for filehash in filehashes:
                # memory-map the file: only the row count is needed, so the
                # array contents are never read into memory
                arr = np.load(
                    os.path.join(folder_path, filehash + '.npy'),
                    mmap_mode='r'
                )
                if arr.shape[0] > min_rows:
                    partition_list.append(filehash)

            # add the filehashes to the partition
            partition[folder] = partition_list

            if verbose:
                removed = len(filehashes) - len(partition_list)
                # guard against an empty folder
                pct_removed = (
                    round(100*removed/len(filehashes), 2) if filehashes else 0.0
                )
                print(
                    f'\t{folder} has {len(partition_list)} valid files, '
                    f'removed {removed} of {len(filehashes)} files '
                    f'({pct_removed}%)'
                )

    else:
        for folder in partition_index['partition'].unique():
            # same length filter as above, applied to the precomputed shapes
            mask = (partition_index['partition'] == folder) \
                & (partition_index['shape'] > min_rows)

            # get the filehashes
            partition[folder] = partition_index.loc[mask].index.values.tolist()

    # construct the labels dictionary: filehash -> game_winner
    winners = spawningtool_df.set_index('filehash')['game_winner'].to_dict()
    # drop entries whose game_winner is 0; map game_winner 2 to label 0 and
    # every other remaining value to label 1
    labels = {
        filehash: 0 if winner == 2 else 1
        for filehash, winner in winners.items()
        if winner != 0
    }

    return spawningtool_df, master_columns_list, feature_columns, partition, labels

In [34]:
def _next_available_model_file(base_path, ext):
    """Return base_path, or base_path + '_v<N>' for the smallest N >= 1 that is not taken on disk."""
    if not os.path.exists(base_path + ext):
        return base_path

    version = 1
    while os.path.exists(f'{base_path}_v{version}{ext}'):
        version += 1
    return f'{base_path}_v{version}'


# function to create, train, evaluate, and save a model for a given window size
def train_evaluate_model(
    window_size,
    partition_index,
    window_start=0,  
    save_models=True,
    verbose=1
):
    """
    train_evaluate_model
    Train, evaluate, and save a model for a given window size.

    Args:
        window_size (int): the window size to use
        partition_index (pandas.DataFrame): forwarded to
            setup_data_for_modeling to select the filehashes of each
            partition (None makes it scan the data folders instead)
        window_start (int): the window start to use. Defaults to 0.
        save_models (bool): whether to write the training log, the
            checkpoints and the final model to disk. Defaults to True.
        verbose (int): the verbosity level for Keras. Defaults to 1.

    Returns:
        tuple: model, history, model_file, score, prediction_results
            - model: the trained Keras model
            - history: the object returned by model.fit
            - model_file (str): the model path without its extension
            - score (float): val_accuracy of the last epoch that ran
            - prediction_results (pandas.DataFrame): indexed by filehash,
              with one 'probability' column holding the model output for
              every file in the 'val' partition
    """

    print('\tSetting up model data...')
    # set up the data for modeling
    _, _, feature_columns, partition, labels = setup_data_for_modeling(
        window_size=window_size, 
        window_start=window_start,
        partition_index=partition_index,
    )

    # create a dictionary of parameters to pass to both generators
    params = {
        'window_size': window_size, 
        'window_start': window_start,
        'n_channels': len(feature_columns),
        'batch_size': 32, 
        'shuffle': True
    }

    # setup generators for training and validation
    training_generator = BatchGenerator(
        partition['train'],
        labels,
        data_dir='data/model_data/train',
        **params
    )

    validation_generator = BatchGenerator(
        partition['val'],
        labels,
        data_dir='data/model_data/val',
        **params
    )

    print('\tCreating model architecture...')
    # instantiate RNN model
    model = Sequential()

    # add LSTM layer; only the final state is passed on
    model.add(LSTM(
        units=128,
        input_shape=(window_size, len(feature_columns)),
        return_sequences=False
    ))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # add dense layer: a single sigmoid unit for the binary target
    model.add(Dense(
        1, 
        activation='sigmoid', 
        kernel_regularizer=keras.regularizers.l1(0.01)
    ))

    # compile model using binary crossentropy loss and adam optimizer
    model.compile(
        loss='binary_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    # name the model after its window; if that name is already taken on
    # disk, pick the next free '_v<N>' suffix (the suffix is replaced on
    # every attempt rather than accumulated as '_v1_v2_...')
    model_file_ext = '.h5'
    model_file = _next_available_model_file(
        f'models/lstm_size.{window_size}_from.{window_start}',
        model_file_ext
    )

    # create a callback to log history
    csv_logger = CSVLogger(model_file+'.csv')

    # create a callback to save a checkpoint after every epoch
    # (save_best_only=False)
    checkpoint = ModelCheckpoint(
        filepath='models/checkpoints/'+os.path.basename(model_file)+'.hdf5',
        monitor='val_loss',
        verbose=0,
        save_best_only=False
    )

    # create a callback which stops training when no improvement is being made
    earlystop = EarlyStopping(
        monitor='val_loss', # monitor validation loss to prevent overfitting
        patience=2, # stop once val_loss has not improved for 2 epochs
        verbose=1,  # print a message when the callback is triggered
        mode='auto' # keras infers if the monitored variable should be increasing or decreasing
    )

    # set up callbacks to be used for training
    if save_models:
        # make sure the output folders exist before anything is written
        os.makedirs('models/checkpoints', exist_ok=True)
        callbacks = [earlystop, csv_logger, checkpoint]
    else:
        callbacks = [earlystop]

    print('\tTraining model...')
    # fit the model and save to history
    history = model.fit(
        training_generator,
        validation_data=validation_generator,
        epochs=50,
        callbacks=callbacks,
        verbose=verbose
    )

    if save_models:   
        # save the trained model to file
        model.save(model_file+model_file_ext)

    # get the last value for val_accuracy
    score = history.history['val_accuracy'][-1]

    print('\tGenerating predictions...')
    # create a generator for predictions: batch_size=1 and shuffle=False so
    # that every validation file is predicted exactly once, in the order of
    # partition['val']
    prediction_generator = BatchGenerator(
        partition['val'],
        labels,
        data_dir='data/model_data/val',
        window_size=window_size,
        window_start=window_start, 
        n_channels=len(feature_columns),
        batch_size=1, 
        shuffle=False
    )

    # make predictions
    predictions = model.predict(prediction_generator)

    # create the prediction_results dataframe
    prediction_results = pd.DataFrame({
        'filehash': partition['val'],
        'probability': predictions.reshape(-1),
    })

    # set the filehash as the index
    prediction_results.set_index('filehash', inplace=True)

    # return the model and the score
    return model, history, model_file, score, prediction_results

In [92]:
def create_partitions(folder_list=None):
    """
    Index the .npy files of each partition folder by filehash.

    Args:
        folder_list (list): the folders to scan; the last path component of
            each folder is used as its partition name. Defaults to the
            'train', 'val' and 'test' folders under 'data/model_data'.

    Returns:
        pandas.DataFrame: indexed by 'filehash' (file name without
            extension) with columns 'shape' (number of rows of the array,
            i.e. shape[0]) and 'partition' (partition name)
    """
    if folder_list is None:
        folder_list = [
            'data/model_data/train',
            'data/model_data/val',
            'data/model_data/test'
        ]

    # collect plain records and build the frame once; growing a DataFrame
    # row by row copies it on every iteration, and DataFrame.append is no
    # longer available in pandas 2.x
    records = []

    for folder in folder_list:
        partition_name = os.path.basename(os.path.normpath(folder))

        # sorted so that the index order does not depend on the filesystem
        for file in sorted(os.listdir(folder)):
            filehash, ext = os.path.splitext(file)
            # skip anything that is not a .npy data file
            if ext != '.npy':
                continue

            # memory-map the file: only the row count is read
            n_rows = int(
                np.load(os.path.join(folder, file), mmap_mode='r').shape[0]
            )
            records.append({
                'filehash': filehash,
                'shape': n_rows,
                'partition': partition_name
            })

    # explicit columns keep the frame well-formed even when no file is found
    partition_index = pd.DataFrame(
        records,
        columns=['filehash', 'shape', 'partition']
    ).set_index('filehash')

    print('Partitioning complete.')

    return partition_index

In [95]:
# Train one model per (window_size, window_start) pair and collect the
# validation probabilities of every model side by side.
print('Partitioning data...')
partition_index = create_partitions()
print('-'*80)

# every (window_size, window_start) pair whose window ends at or before
# row 120: sizes 48..120 in steps of 6, starts 0..(120 - size) in steps of 2
model_params = [
    (window_size, window_start)
    for window_size in np.arange(48, 121, 6)
    for window_start in np.arange(0, 121 - window_size, step=2)
]

# trained models keyed by '<window_start>_<window_size>'; note that every
# model stays referenced (and therefore in memory) for the whole run
model_dict = {}
# one single-column frame per model, joined once after the loop instead of
# re-copying a growing frame on every iteration
prediction_frames = []

for window_size, window_start in tqdm(
    model_params,
    desc='Training models',
    unit='model'
):
    # set up, train and evaluate the model for this window
    model, history, model_file, score, new_pred_results = train_evaluate_model(
        window_size=window_size,
        window_start=window_start,
        save_models=False,
        partition_index=partition_index,
        verbose=0
    )

    model_index = str(window_start)+'_'+str(window_size)

    # name the probability column after the model that produced it
    prediction_frames.append(
        new_pred_results.rename(columns={'probability': model_index})
    )

    model_dict[model_index] = model
    print('Done!')
    print('-'*80)

# align all models on filehash; files too short for a given window get NaN
# in that model's column
prediction_results = pd.concat(prediction_frames, axis=1)

Partitioning data...
Partitioning complete.
--------------------------------------------------------------------------------


Training models:   0%|          | 0/247 [00:00<?, ?model/s]

	Setting up model data...
	Creating model architecture...
	Training model...
Epoch 00009: early stopping
	Generating predictions...
Done!
--------------------------------------------------------------------------------
	Setting up model data...
	Creating model architecture...
	Training model...
Epoch 00007: early stopping
	Generating predictions...
Done!
--------------------------------------------------------------------------------
	Setting up model data...
	Creating model architecture...
	Training model...
Epoch 00008: early stopping
	Generating predictions...
Done!
--------------------------------------------------------------------------------
	Setting up model data...
	Creating model architecture...
	Training model...
Epoch 00009: early stopping
	Generating predictions...
Done!
--------------------------------------------------------------------------------
	Setting up model data...
	Creating model architecture...
	Training model...
Epoch 00008: early stopping
	Generating predict

In [51]:
# peek at the per-model probabilities: one row per validation filehash and
# one column per model, named '<window_start>_<window_size>'
prediction_results.head()

Unnamed: 0_level_0,0_12,10_12,20_12,30_12,40_12,50_12,60_12,70_12,80_12,90_12,100_12,0_24,10_24,20_24,30_24,40_24,50_24,60_24,70_24,80_24,90_24,0_36,10_36,20_36,30_36,40_36,50_36,60_36,70_36,80_36,0_48,10_48,20_48,30_48,40_48,50_48,60_48,70_48,0_60,10_60,20_60,30_60,40_60,50_60,60_60,0_72,10_72,20_72,30_72,40_72,0_84,10_84,20_84,30_84,0_96,10_96,20_96,0_108,10_108,0_120
filehash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1
0016ddd5b36473f259ba8630c6c0047540a6858071737e8f890358d3f7ad355d,0.47854,0.59806,0.087926,0.199271,0.124007,0.135398,0.039492,0.065046,0.489937,0.084131,0.104387,0.066167,0.06784,0.309149,0.095017,0.213251,0.220668,0.177882,0.495852,0.344256,0.098148,0.161989,0.092753,0.12371,0.028176,0.020097,0.387979,0.84924,0.537468,0.795417,0.019592,0.04648,0.034467,0.026782,0.212772,0.808688,0.414511,0.232392,0.131178,0.026408,0.010254,0.072036,0.086759,0.710828,0.079258,0.085336,0.01562,0.624395,0.051603,0.167276,0.01473,0.28341,0.465344,0.327579,0.787177,0.606639,0.486119,0.21293,0.072556,0.324719
00283dcee1b0bec45ff654a0df06248d9f69a5c3a66e7827e10660ad88bf5bee,0.5266,0.614885,0.258713,0.389767,0.789966,0.724467,0.57154,0.967763,0.291989,0.021289,0.516867,0.296109,0.244836,0.8605,0.71567,0.924575,0.96849,0.837172,0.925201,0.760122,0.143616,0.32415,0.353952,0.732399,0.88502,0.956954,0.801989,0.299489,0.740694,0.036893,0.262166,0.711719,0.830663,0.362114,0.188171,0.626452,0.235078,0.29781,0.167221,0.863784,0.848653,0.303732,0.33428,0.126523,0.101359,0.934342,0.355163,0.614399,0.224636,0.394577,0.353212,0.681236,0.811928,0.052835,0.556868,0.720422,0.661394,0.278303,0.497171,0.457617
0033334e86cb352b0131027e58effa23adb23cf1cdd90d5760cc62254611c99a,0.56381,0.688602,0.52077,0.486146,0.877253,0.941794,0.877516,0.786566,0.953953,0.637254,0.724162,0.437219,0.511848,0.919635,0.956855,0.937908,0.929437,0.95666,0.902231,0.780797,0.661751,0.659244,0.602856,0.955341,0.96776,0.863017,0.404402,0.863395,0.908409,0.950806,0.554702,0.874749,0.905928,0.887714,0.65053,0.737742,0.844959,0.749974,0.946754,0.871522,0.950233,0.811946,0.728694,0.591851,0.949278,0.954007,0.8839,0.921593,0.789151,0.50418,0.977398,0.904713,0.906649,0.68341,0.599961,0.896923,0.427648,0.809621,0.677286,0.933748
0055f4541fe30a72e964fc168db7abef35822d27e4cdce8ef0adb8b3cba51b7d,0.654678,0.66647,0.549546,0.752763,0.882442,0.946826,0.874497,0.994796,0.953706,,,0.551776,0.796762,0.97122,0.978145,0.943856,0.980214,0.984904,,,,0.727291,0.788836,0.980208,0.947675,0.978365,0.977469,,,,0.75623,0.977553,0.957861,0.986316,0.99782,,,,0.982317,0.947037,0.984399,0.983914,,,,0.845061,0.979166,0.969678,,,0.977412,,,,,,,,,
005ac860a39ec7ae855a22b68f13ea17c1d7006bafe381e46cd9f2cbf3fa3a99,0.410807,0.697154,0.425629,0.818765,0.884527,0.958854,0.884232,0.9054,0.998261,0.838431,0.978269,0.627104,0.574905,0.970679,0.956108,0.988558,0.992553,0.952348,0.977642,0.973903,0.959005,0.544934,0.708364,0.982398,0.954283,0.988978,0.98364,0.979953,0.921636,0.992276,0.841827,0.974754,0.937327,0.705655,0.937284,0.987146,0.986139,0.971984,0.945515,0.977011,0.989564,0.925274,0.945632,0.973543,0.995174,0.952927,0.951038,0.999048,0.948825,0.985612,0.987062,0.976674,0.93241,0.958789,0.979521,0.924713,0.97709,0.91639,0.958075,0.997279


In [None]:
# Group the model columns ('<window_start>_<window_size>') by the row at
# which each model's window ends (window_start + window_size).
prob_index_dict = {}
for col in prediction_results.columns:
    parts = col.split('_')
    end_frame = int(parts[0]) + int(parts[1])
    prob_index_dict.setdefault(end_frame, []).append(col)

# One row per filehash and one column per end frame, holding the mean
# probability of the models whose window ends at that frame. Computed
# column-wise in a single pass: appending one row per (filehash, end_frame)
# pair is quadratic, leaves every row with a single filled column, and
# DataFrame.append is no longer available in pandas 2.x.
agg_predictions = pd.DataFrame({
    end_frame: prediction_results[cols].mean(axis=1)
    for end_frame, cols in prob_index_dict.items()
}).rename_axis('filehash')

agg_predictions.head()


In [79]:
# load the replay metadata at notebook level; train_evaluate_model discards
# the copy that setup_data_for_modeling reads internally
spawningtool_df = pd.read_csv('data/spawningtool_replays.csv')

In [83]:
# Group the model columns ('<window_start>_<window_size>') by the row at
# which each model's window ends (window_start + window_size).
prob_index_dict = {}
for col in prediction_results.columns:
    parts = col.split('_')
    end_frame = int(parts[0]) + int(parts[1])
    prob_index_dict.setdefault(end_frame, []).append(col)

# One column per end frame, in ascending order, holding the row-wise mean of
# the models whose window ends at that frame. Building the frame in one go
# avoids inserting columns one at a time and reordering them afterwards.
agg_predictions = pd.DataFrame({
    end_frame: prediction_results[prob_index_dict[end_frame]].mean(axis=1)
    for end_frame in sorted(prob_index_dict)
}).rename_axis(prediction_results.index.name)

# The labels are rebuilt here instead of relying on a leftover global:
# no visible cell defines `labels` at notebook level.
*_, labels = setup_data_for_modeling(
    partition_index=partition_index,
    verbose=False
)

# append the label to the dataframe, aligned on filehash
agg_predictions['winner'] = pd.Series(labels)

# append game length to the dataframe, aligned on filehash.
# NOTE(review): game_length / 5 appears to put the length on the same scale
# as the end-frame columns (in the saved output, predictions stop at the
# last end frame below `length`) - confirm what the factor 5 represents.
agg_predictions['length'] = spawningtool_df.set_index('filehash')['game_length']/5

agg_predictions.head(10)


Unnamed: 0_level_0,12,22,24,32,34,36,42,44,46,48,52,54,56,58,60,62,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110,112,114,116,118,120,winner,length
filehash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
0016ddd5b36473f259ba8630c6c0047540a6858071737e8f890358d3f7ad355d,0.47854,0.59806,0.066167,0.087926,0.06784,0.161989,0.199271,0.309149,0.092753,0.019592,0.124007,0.095017,0.12371,0.04648,0.131178,0.135398,0.213251,0.028176,0.034467,0.026408,0.062414,0.220668,0.020097,0.026782,0.010254,0.040333,0.096306,0.387979,0.212772,0.072036,0.557166,0.389631,0.818208,0.808688,0.086759,0.067867,0.4048,0.572054,0.313721,0.710828,0.135832,0.212864,0.640768,0.152474,0.201989,0,161.2
00283dcee1b0bec45ff654a0df06248d9f69a5c3a66e7827e10660ad88bf5bee,0.5266,0.614885,0.296109,0.258713,0.244836,0.32415,0.389767,0.8605,0.353952,0.262166,0.789966,0.71567,0.732399,0.711719,0.167221,0.724467,0.924575,0.88502,0.830663,0.863784,0.752941,0.96849,0.956954,0.362114,0.848653,0.661463,0.595192,0.801989,0.188171,0.303732,0.453194,0.803219,0.428179,0.626452,0.33428,0.122963,0.786025,0.730558,0.25669,0.126523,0.455722,0.098226,0.349144,0.397491,0.279488,0,178.2
0033334e86cb352b0131027e58effa23adb23cf1cdd90d5760cc62254611c99a,0.56381,0.688602,0.437219,0.52077,0.511848,0.659244,0.486146,0.919635,0.602856,0.554702,0.877253,0.956855,0.955341,0.874749,0.946754,0.941794,0.937908,0.96776,0.905928,0.871522,0.915761,0.929437,0.863017,0.887714,0.950233,0.835233,0.967029,0.404402,0.65053,0.811946,0.937773,0.903472,0.731678,0.737742,0.728694,0.713203,0.843723,0.902666,0.82729,0.591851,0.614171,0.67258,0.689227,0.71363,0.941513,1,214.6
0055f4541fe30a72e964fc168db7abef35822d27e4cdce8ef0adb8b3cba51b7d,0.654678,0.66647,0.551776,0.549546,0.796762,0.727291,0.752763,0.97122,0.788836,0.75623,0.882442,0.978145,0.980208,0.977553,0.982317,0.946826,0.943856,0.947675,0.957861,0.947037,0.859779,0.980214,0.978365,0.986316,0.984399,0.986981,0.981158,0.977469,0.99782,0.983914,0.961692,,,,,,,,,,,,,,,1,94.2
005ac860a39ec7ae855a22b68f13ea17c1d7006bafe381e46cd9f2cbf3fa3a99,0.410807,0.697154,0.627104,0.425629,0.574905,0.544934,0.818765,0.970679,0.708364,0.841827,0.884527,0.956108,0.982398,0.974754,0.945515,0.958854,0.988558,0.954283,0.937327,0.977011,0.918579,0.992553,0.988978,0.705655,0.989564,0.928219,0.969705,0.98364,0.937284,0.925274,0.998654,0.977158,0.979737,0.987146,0.945632,0.893628,0.953157,0.923174,0.951265,0.973543,0.98194,0.958897,0.984683,0.96503,0.996227,1,126.4
005e3a2f3a37e7afcc8050a73dfba1ef61394aa96202bbae4e521d69cdabdea5,0.405662,0.552058,0.256317,0.550325,0.355961,0.707999,0.559632,0.751667,0.318067,0.184898,0.236122,0.2815,0.652278,0.447956,0.156842,0.614004,0.763232,0.949061,0.899879,0.820812,0.694565,0.339294,0.821932,0.950007,0.612075,0.877753,0.944757,0.782537,0.889469,0.916187,0.851685,0.91068,0.923992,0.986,0.370997,0.375315,0.757616,0.797698,0.619422,0.560731,0.305792,0.444103,0.57837,0.730623,0.517234,1,329.0
005ee057d917de0d9fea9964630d19df43e08f4377f67c14e85168f4b3df34a4,0.48609,0.584725,0.793851,0.596131,0.863285,0.900613,0.707706,0.985649,0.891718,0.533259,0.894479,0.342814,0.945802,0.640318,0.913347,0.959335,0.992768,0.939981,0.951848,0.937967,0.989068,0.67148,0.874622,0.884257,0.989481,0.977103,0.965211,0.830006,0.995495,0.856457,0.731042,0.710099,0.401789,0.92607,0.94848,0.804498,0.886618,0.878309,0.910309,0.970322,0.789138,0.96312,0.870599,0.944876,0.990914,1,143.4
00605ffabc4ff701bc81f82e2e7a446ca0f3dc97b08cd5b267140cfaff4d83ae,0.413913,0.61789,0.309687,0.529493,0.284613,0.659095,0.728172,0.939295,0.448593,0.31767,0.31391,0.246137,0.844136,0.940742,0.81323,0.487679,0.596709,0.474391,0.270629,0.662611,0.743733,0.930633,0.933388,0.614147,0.952221,0.944211,0.956925,0.956453,0.948823,0.988689,0.984003,0.992484,0.983547,0.991279,,,,,,,,,,,,1,99.4
0067505eb790ff1074f3594e9d21f159e42815eb1c2f673cd19e44ae48623e8f,0.411835,0.688843,0.488133,0.430451,0.607323,0.955554,0.558092,0.98631,0.895077,0.504149,0.779935,0.41297,0.849054,0.860607,0.939733,0.866047,0.14044,0.111801,0.201712,,,,,,,,,,,,,,,,,,,,,,,,,,,0,69.2
006ac63024244a6477a939e96b51051e43d8b98296146a54d6d92e4288687c80,0.569528,0.781655,0.74316,0.455927,0.747769,0.899116,0.713085,0.871311,0.693991,0.366794,0.885958,0.779268,0.850693,0.946365,0.940444,0.964224,0.92862,0.869872,0.860076,0.966617,0.949815,0.959236,0.98974,0.985181,0.994591,0.831012,0.990811,0.973,0.990271,0.938358,0.924808,0.987734,0.975576,0.981846,0.984233,0.949742,0.906456,0.971672,0.978805,0.823007,0.868244,0.9118,0.896956,0.942664,0.913571,1,219.8


In [84]:
# write the aggregated predictions (with their index) to csv
agg_predictions.to_csv('data/agg_predictions.csv')



In [82]:
# ad-hoc look at the Series that is assigned to agg_predictions['length']:
# game_length divided by 5, indexed by filehash
spawningtool_df.set_index('filehash')['game_length']/5

filehash
9384fda8c370ea7d130ac20244f7d0fda9a9b8340044452d2692b739ac9cc1e1     80.4
f9864054498acf297aacf2be80896ba131716a341983de6714499ac77a0695ec     74.2
178a82fa5045e82a3920688274d7a524595f0ef6dc91ae42a4cb9c9711fc5998     74.2
99ba80721116d18f965c141581baa878abf02526f48bc6e937c2e3f1c89054be    360.6
b9e96a1dabcc0de0cfd89e8f0d02b5b8e064ec3f3c07c1272022d6651cd24dcb    100.4
                                                                    ...  
86edadabea6cede88a70ed9102e0d45cdea7850d23ade8d6883f03e0fd7439e2    160.8
42930fdc21561395bcefc960b2d5525f7b6adf11dc0eab635a034ae569a85167     76.2
8d1c2a7a406cdedfdd8db8ae24968b16edfde4c1a38230c32c3ecfff4b2b9b20    254.4
d6430bacdb3bc5a22e09d445655486cb06cf7e9d23b4799de8001f88b9cec84b    191.4
b64731a1f706d4fa6b79778884209d77dc3cfe40308f6df7668060508a6ce697    122.6
Name: game_length, Length: 36812, dtype: float64

In [85]:
# list, for every end frame, the model columns that are averaged together
for key, value in prob_index_dict.items():
    print(key, value)

12 ['0_12']
22 ['10_12']
32 ['20_12']
42 ['30_12']
52 ['40_12']
62 ['50_12']
72 ['60_12', '0_72']
82 ['70_12', '10_72']
92 ['80_12', '20_72']
102 ['90_12', '30_72']
112 ['100_12', '40_72']
24 ['0_24']
34 ['10_24']
44 ['20_24']
54 ['30_24']
64 ['40_24']
74 ['50_24']
84 ['60_24', '0_84']
94 ['70_24', '10_84']
104 ['80_24', '20_84']
114 ['90_24', '30_84']
36 ['0_36']
46 ['10_36']
56 ['20_36']
66 ['30_36']
76 ['40_36']
86 ['50_36']
96 ['60_36', '0_96']
106 ['70_36', '10_96']
116 ['80_36', '20_96']
48 ['0_48']
58 ['10_48']
68 ['20_48']
78 ['30_48']
88 ['40_48']
98 ['50_48']
108 ['60_48', '0_108']
118 ['70_48', '10_108']
60 ['0_60']
70 ['10_60']
80 ['20_60']
90 ['30_60']
100 ['40_60']
110 ['50_60']
120 ['60_60', '0_120']
