In [None]:
from multiprocessing import Pool

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.colors import ListedColormap
from sklearn.model_selection import GroupKFold
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=pd.core.common.SettingWithCopyWarning)
    
# Notebook-wide plotting defaults: apply the 'bmh' style sheet first, then
# override the default figure size on top of it.
plt.style.use('bmh')
plt.rcParams.update({'figure.figsize': [14, 8]})  # [width, height]

In [None]:
# this is code slightly modified from the sklearn docs here:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_cv_indices.html#sphx-glr-auto-examples-model-selection-plot-cv-indices-py
def plot_cv_indices(cv, X, y, group, ax, n_splits, lw=10, group_label='day'):
    """Draw the train/test membership of every sample for each CV split.

    One horizontal row is drawn per split yielded by ``cv.split``: samples in
    the test part get the value 1, samples in the train part get 0 and samples
    in neither stay NaN. Two more rows follow, one coloured by ``y`` and one
    coloured by ``group``.

    Parameters
    ----------
    cv : object
        Cross-validator exposing ``split(X=..., y=..., groups=...)`` that
        yields ``(train_indices, test_indices)`` pairs.
    X : sized
        Samples. Only ``len(X)`` is used here; the object itself is forwarded
        to ``cv.split``.
    y : array-like
        Per-sample values used to colour the 'target' row.
    group : array-like
        Per-sample group values, forwarded to ``cv.split`` and used to colour
        the group row.
    ax : matplotlib Axes
        Axes to draw on.
    n_splits : int
        Number of splits; used to build the y ticks, tick labels and y limits.
    lw : int, default 10
        Line width of the ``'_'`` markers.
    group_label : str, default 'day'
        Tick label shown next to the group row.

    Returns
    -------
    The ``ax`` object that was passed in.
    """
    n_samples = len(X)
    sample_positions = range(n_samples)
    cmap_cv = plt.cm.coolwarm

    # ``matplotlib.cm.get_cmap`` was deprecated in Matplotlib 3.7 and removed in
    # 3.9; ``pyplot.get_cmap`` takes the same (name, lut) arguments.
    jet = plt.get_cmap('jet', 256)
    # Shuffle the colour positions in place (global NumPy RNG), presumably so
    # that neighbouring group values do not end up with near-identical colours.
    seq = np.linspace(0, 1, 256)
    np.random.shuffle(seq)
    cmap_data = ListedColormap(jet(seq))

    # Generate the training/testing visualizations for each CV split.
    # ``rows_drawn`` is tracked explicitly so the summary rows below do not rely
    # on a loop variable that is undefined when the splitter yields nothing.
    rows_drawn = 0
    for tr, tt in cv.split(X=X, y=y, groups=group):
        # Fill in indices with the training/test groups
        indices = np.full(n_samples, np.nan)
        indices[tt] = 1
        indices[tr] = 0

        # Visualize the results
        ax.scatter(sample_positions, [rows_drawn + .5] * n_samples,
                   c=indices, marker='_', lw=lw, cmap=cmap_cv,
                   vmin=-.2, vmax=1.2)
        rows_drawn += 1

    # Plot the data classes and groups at the end
    ax.scatter(sample_positions, [rows_drawn + .5] * n_samples,
               c=y, marker='_', lw=lw, cmap=plt.cm.Set3)

    ax.scatter(sample_positions, [rows_drawn + 1.5] * n_samples,
               c=group, marker='_', lw=lw, cmap=cmap_data)

    # Formatting (the y axis is inverted so split 0 is at the top)
    yticklabels = list(range(n_splits)) + ['target', group_label]
    ax.set(yticks=np.arange(n_splits+2) + .5, yticklabels=yticklabels,
           xlabel='Sample index', ylabel="CV iteration",
           ylim=[n_splits+2.2, -.2], xlim=[0, n_samples])
    ax.set_title('{}'.format(type(cv).__name__), fontsize=15)
    return ax

In [None]:
# Store the target and the feature vector of every row of a dataframe chunk in
# several TFRecord files (a new file is started every `records_per_file` rows).
def save_tfrecords(input, records_per_file=30000):
    """Serialize one DataFrame chunk into TFRecord files.

    Every row becomes one ``tf.train.Example`` with two float features: one
    keyed by the name of the target column and one keyed ``'features'``.

    Parameters
    ----------
    input : tuple
        ``(df, idx, folder)``: the chunk to write, the fold number and the
        sub-folder name. Files are written to
        ``data/tfrecords/{folder}/fold{idx}/``. The three values travel in a
        single tuple so the function can be handed to ``Pool.map``.
    records_per_file : int, default 30000
        Number of rows written to a file before the next file is started.

    Raises
    ------
    ValueError
        If ``records_per_file`` is not positive.
    """
    import os  # local import keeps this function self-contained

    if records_per_file <= 0:
        raise ValueError('records_per_file must be a positive integer')

    df, idx, folder = input

    def _float_feature(list_of_floats):
        return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_floats))

    # Positions inside the tuples produced by ``df.itertuples()``: position 0 is
    # the row's index label, the dataframe columns start at position 1.
    target_index = 4
    features_start_index, features_postfinal_index = 5, 305

    out_dir = f'data/tfrecords/{folder}/fold{idx}'
    out_file = None
    try:
        for i, row in tqdm(enumerate(df.itertuples()), total=len(df)):
            if i % records_per_file == 0:
                if out_file is not None:
                    out_file.close()
                    out_file = None
                # The writer does not create missing directories itself.
                os.makedirs(out_dir, exist_ok=True)
                # Each file is named after the index label of its first row.
                filename = f'{out_dir}/{row[0]}.tfrec'
                out_file = tf.io.TFRecordWriter(filename)
            feature_dict = {
                row._fields[target_index]: _float_feature([row[target_index]]),
                'features': _float_feature(row[features_start_index:features_postfinal_index]),
            }

            example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
            out_file.write(example.SerializeToString())
    finally:
        # Runs for an empty chunk (nothing was opened) and when a row fails
        # mid-way (the open writer must still be closed).
        if out_file is not None:
            out_file.close()

In [None]:
# Read data and choose cv strategy
# N_SPLITS is the number of folds handed to GroupKFold below.
N_SPLITS = 5
df = pd.read_parquet('data/train_low_mem.parquet')
# Only the splitter is configured here; the groups are supplied later, when
# cv.split is called with groups=df['investment_id'].
cv = GroupKFold(n_splits=N_SPLITS)
# Optional visual check of the folds (disabled).
# NOTE(review): `y` is not defined in any visible cell; it has to be set to the
# target values before the two lines below can be re-enabled.
# fig, ax = plt.subplots()
# plot_cv_indices(cv, df, y, df['investment_id'], ax, N_SPLITS, lw=20)

In [None]:
# Run the splitter once and keep every fold: entry k of each list holds the row
# positions of fold k's training / validation part.
fold_splits = list(cv.split(df, groups=df['investment_id']))
train_indices = [train_part for train_part, _ in fold_splits]
valid_indices = [valid_part for _, valid_part in fold_splits]
del cv, fold_splits

In [None]:
# Fold exported by this run: used below to pick the entries of train_indices /
# valid_indices and to build the output paths (fold{idx}).
idx = 4

In [None]:
# Pull the chosen fold's training and validation rows out of the full frame,
# then drop the reference to the full frame.
train_df = df.iloc[train_indices[idx]]
valid_df = df.iloc[valid_indices[idx]]
del df

In [None]:
# Cut train_df into consecutive row chunks of at most `size` rows. Each chunk is
# bundled with the fold number and the 'train' sub-folder, which is the argument
# tuple save_tfrecords unpacks, so the chunks can be written in parallel.
size = 800000
list_of_train_dfs = [
    (train_df.iloc[start:start + size], idx, 'train')
    for start in range(0, len(train_df), size)
]
number_dfs = len(list_of_train_dfs)

In [None]:
print("Train data recording.")
# One worker process per chunk; every worker runs save_tfrecords on its chunk.
with Pool(processes=number_dfs) as pool:
    pool.map(save_tfrecords, list_of_train_dfs)
# The training frame and its chunks are not used after this point.
del train_df, list_of_train_dfs

In [None]:
# Cut valid_df into consecutive row chunks of at most `size` rows, each bundled
# with the fold number and the 'validation' sub-folder for save_tfrecords.
size = 210000
list_of_valid_dfs = [
    (valid_df.iloc[start:start + size], idx, 'validation')
    for start in range(0, len(valid_df), size)
]
number_dfs = len(list_of_valid_dfs)

In [None]:
print("Validation data recording.")
# One worker process per chunk; every worker runs save_tfrecords on its chunk.
with Pool(processes=number_dfs) as pool:
    pool.map(save_tfrecords, list_of_valid_dfs)

print("Validation data pickling.")
# Also keep the whole validation frame as a pickle, in the same folder the
# validation TFRecord files were written to.
valid_df.to_pickle(f"data/tfrecords/validation/fold{idx}/validation.pkl")
del valid_df, list_of_valid_dfs