# Split Data into Train/Validation/Test

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import random
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# Notebook configuration: every value a reader may want to tune lives here.
data_path = 'data/submissions.csv'
# Seed for NumPy's global RNG, applied before the row indices are shuffled.
seed = 42
# Size limit handed to the Keras Tokenizer.
max_features = 40000
# Sequence length handed to pad_sequences.
maxlen = 20

# Fractions of the rows assigned to train / validation / test, in that order.
splits = [.8, .1, .1]
# The split step unpacks exactly three partitions.
assert len(splits) == 3, f'expected 3 split fractions, got {len(splits)}'
# Compare with a tolerance: a sum of binary floats (e.g. .7 + .2 + .1) can miss
# 1 by a rounding error, which would make an exact `== 1` check fail spuriously.
assert np.isclose(sum(splits), 1), f'splits must sum to 1, got {sum(splits)}'

In [3]:
# Load the submissions table from disk, decoding the file as UTF-8.
df = pd.read_csv(filepath_or_buffer=data_path, encoding='utf-8')

In [4]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0,id,title,score,hour,minute,dayofweek,dayofyear,is_top_submission
0,cnhcty,People who downloaded their Google data and we...,95489,22,45,3,219,1
1,chm4um,Have you ever felt you don't know/have forgott...,90634,3,53,4,206,1
2,co16i3,Redditors who have tried to hide on your cruis...,87538,4,39,5,221,1
3,cj95jb,What knowledge might save your life one day?,87503,2,57,1,210,1
4,cuctl4,What if you suddenly feel a touch on the shoul...,86162,4,53,5,235,1
5,cbm4z1,If HBO's Chernobyl was a series with a new dis...,85131,13,40,3,191,1
6,ctysw7,How do we save this fucking planet?,82249,8,14,4,234,1
7,cpsi27,What is the scariest story you know that is 10...,80859,5,33,2,225,1
8,ci3nf2,"For people who like to listen to full albums, ...",79470,7,37,5,207,1
9,bvr285,What is that one fact you know that always mak...,77646,16,29,6,152,1


In [5]:
# Average of the `score` column across all loaded rows.
df['score'].mean()

17.90230041718448

In [6]:
# Pull each column used downstream out of the frame as its own NumPy array.
titles_raw = np.array(df['title'])

# These four columns are copied out as integer arrays without further changes.
hours, minutes, weekdays, is_top_submission = (
    np.array(df[column], dtype=int)
    for column in ('hour', 'minute', 'dayofweek', 'is_top_submission')
)

# Shift day-of-year down by one so that January 1 maps to 0.
dates = np.array(df['dayofyear'], dtype=int) - 1

In [7]:
# Fit a Keras tokenizer on every raw title, capped by `max_features`.
# NOTE(review): fitting happens before the train/val/test split, so validation
# and test titles also contribute to the vocabulary — confirm this is intended.
word_tokenizer = Tokenizer(num_words=max_features)
word_tokenizer.fit_on_texts(titles_raw)

In [8]:
# Map each raw title to its sequence of token ids, then hand the sequences to
# pad_sequences with `maxlen` as the target length.
titles = sequence.pad_sequences(
    word_tokenizer.texts_to_sequences(titles_raw),
    maxlen=maxlen,
)

In [9]:
# Shuffle the row indices reproducibly, then cut them into train/val/test.
n = len(titles)
np.random.seed(seed)
# np.arange yields an integer index array even when n == 0.
idx = np.arange(n)
np.random.shuffle(idx)

# Cut points are the cumulative train and train+val fractions scaled to n.
# A small tolerance is added before flooring: a product that should be a whole
# number can land a hair below it in floating point (e.g. (.1 + .7) * 10 gives
# 7.999999999999999), and plain truncation would then place the cut one row
# too early.
idx_splits = np.floor(np.cumsum(splits[:2]) * n + 1e-8).astype('int')
idx_train, idx_val, idx_test = np.split(idx, idx_splits)

In [10]:
# Arrays to slice per split, keyed by the prefix used for their payload entries.
d = {
    'titles_raw': titles_raw,
    'titles': titles,
    'hours': hours,
    'minutes': minutes,
    'weekdays': weekdays,
    'dates': dates,
    'is_top_submission': is_top_submission
}

# Key suffix paired with the shuffled row indices belonging to that split.
split_rows = (
    ('_train', idx_train),
    ('_val', idx_val),
    ('_test', idx_test),
)

# One entry per (array, split) pair, e.g. 'hours_train', 'hours_val', 'hours_test'.
payload = {
    name + suffix: values[rows]
    for name, values in d.items()
    for suffix, rows in split_rows
}

# Also keep the unsplit padded titles and the fitted tokenizer.
payload['all_titles'] = titles
payload['tokenizer'] = word_tokenizer

In [11]:
# Serialize the assembled payload to disk with the highest pickle protocol.
with open('loaded_data.pickle', mode='wb') as out_file:
    pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)