# Create CV dataset

This notebook creates a dataset that enables 3-fold cross-validation for the NNet bot. 

In [11]:
import os
import pickle

import draftsimtools as ds

### Initial dataset
Load data from the original M19 Draftsim data files and process it. Some of this processing is redundant because it has already been performed, but it is repeated here to be safe.

In [7]:
# Locations of the M19 rating tables and the raw training drafts
dataset_path = "../../data/"
rating_path1 = dataset_path + "m19_rating.tsv"
rating_path2 = dataset_path + "m19_land_rating.tsv"
drafts_path = dataset_path + "full_dataset/train.csv"

# Build the card set from both rating files and read the raw drafts
cur_set = ds.create_set(rating_path1, rating_path2)
raw_drafts = ds.load_drafts(drafts_path)

# NOTE(review): fix_commas presumably reconciles card names containing commas
# between the set and the drafts -- confirm against draftsimtools
cur_set, raw_drafts = ds.fix_commas(cur_set, raw_drafts)

# Encoder built from the card names in the set
le = ds.create_le(cur_set["Name"].values)

# Process the raw drafts and keep only complete ones (exactly 45 entries)
COMPLETE_DRAFT_LENGTH = 45
drafts = [
    draft
    for draft in ds.process_drafts(raw_drafts)
    if len(draft) == COMPLETE_DRAFT_LENGTH
]

Processing draft: 0.
Processing draft: 10000.
Processing draft: 20000.
Processing draft: 30000.
Processing draft: 40000.
Processing draft: 50000.
Processing draft: 60000.
Processing draft: 70000.
Processing draft: 80000.


### Split data

Split the data into three folds for 3-fold cross-validation and save each train/validation split to file.

In [8]:
# Cut the drafts into three contiguous folds. The fold size uses floor
# division, so the last fold also takes any leftover drafts.
third = len(drafts) // 3
fold1 = drafts[:third]
fold2 = drafts[third:2 * third]
fold3 = drafts[2 * third:]

# Build the three train/validation tensor pairs: each fold is the validation
# set exactly once, with the other two folds concatenated for training.
split1_train, split1_val = (
    ds.drafts_to_tensor(fold1 + fold2, le),
    ds.drafts_to_tensor(fold3, le),
)
split2_train, split2_val = (
    ds.drafts_to_tensor(fold1 + fold3, le),
    ds.drafts_to_tensor(fold2, le),
)
split3_train, split3_val = (
    ds.drafts_to_tensor(fold2 + fold3, le),
    ds.drafts_to_tensor(fold1, le),
)


In [12]:
# Helper function for pickling data
def serialize_data(obj, path):
    """
    Serialize an object as a python pickle file.

    The directory that will contain the file is created first if it does not
    exist yet, so writing into a fresh output folder does not fail.

    Parameters
    ----------
    obj : any picklable object
        The object to write.
    path : str or os.PathLike
        Destination file path. An existing file at this path is overwritten.

    Raises
    ------
    pickle.PicklingError
        If `obj` cannot be pickled.
    OSError
        If the directory cannot be created or the file cannot be written.
    """
    # A bare filename has an empty dirname; os.makedirs("") would raise, so
    # only create the parent when there actually is one.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "wb") as f:
        pickle.dump(obj, f)

# Persist every train/validation tensor as its own pickle in the output folder
output_folder = "bots_data/nnet_train/"
splits_by_filename = [
    ("split1_train.pkl", split1_train),
    ("split1_val.pkl", split1_val),
    ("split2_train.pkl", split2_train),
    ("split2_val.pkl", split2_val),
    ("split3_train.pkl", split3_train),
    ("split3_val.pkl", split3_val),
]
for split_filename, split_tensor in splits_by_filename:
    serialize_data(split_tensor, output_folder + split_filename)