In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Load the combined training/validation feature table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file if
# it was produced by a trusted step of this project.
q1features_trainvalid = pd.read_pickle("q1features_trainvalid.pickle")

In [None]:
# Sort by participant sequence number (SEQNO), lowest first.
# TimeSeriesSplit always trains on the leading rows and validates on the rows
# that follow them, so the frame has to be in ascending order for every
# validation fold to come after its training fold. A descending sort would
# have each model trained on later records and validated on earlier ones.
# NOTE(review): assumes SEQNO increases with survey date — confirm.
q1features_trainvalid = q1features_trainvalid.sort_values(by=["SEQNO"], ascending=True)

Overfitting occurs when a machine learning model fits its training data well but performs poorly on data that it was not trained on. Cross-validation can help detect, and therefore prevent, overfitting.

Not all BRFSS surveys were conducted on the same date, so there is a time series aspect to the dataset. Sliding window validation was therefore conducted instead of the standard k-fold cross-validation methodology. This validation method ensures that the validation dataset always occurs chronologically after the training dataset; the model cannot 'see the future' (which would artificially improve model performance).

In [None]:
# Splitter for time-ordered cross-validation: five folds, each validated on the
# block of rows immediately after its training rows.
# With the default max_train_size=None every training set starts at the first
# row and grows from fold to fold (an expanding window); pass max_train_size
# to get a fixed-width window instead.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Materialise the splitter's (train, validation) position arrays as two
# parallel lists: entry i of each list holds the row positions for fold i + 1.
# zip(*...) transposes the stream of (train, valid) pairs into two sequences.
q1_train_index, q1_valid_index = map(
    list, zip(*tscv.split(q1features_trainvalid))
)

In [None]:
# Slice the feature table by row position into one (train, validation) pair
# per fold; fold k takes the k-th entry of q1_train_index / q1_valid_index.
(
    (q1features_train_1, q1features_valid_1),
    (q1features_train_2, q1features_valid_2),
    (q1features_train_3, q1features_valid_3),
    (q1features_train_4, q1features_valid_4),
    (q1features_train_5, q1features_valid_5),
) = [
    (
        q1features_trainvalid.iloc[train_rows, :],
        q1features_trainvalid.iloc[valid_rows, :],
    )
    for train_rows, valid_rows in zip(q1_train_index[:5], q1_valid_index[:5])
]

In [None]:
# Split the target labels with the same positional indices used for the
# features, giving one (train, validation) pair of labels per fold.
# NOTE(review): q1target_trainvalid is not defined in any cell shown here, and
# this split is purely positional — confirm it is loaded beforehand and is in
# the same row order as the sorted q1features_trainvalid.
(
    (q1target_train_1, q1target_valid_1),
    (q1target_train_2, q1target_valid_2),
    (q1target_train_3, q1target_valid_3),
    (q1target_train_4, q1target_valid_4),
    (q1target_train_5, q1target_valid_5),
) = [
    (
        q1target_trainvalid.iloc[train_rows],
        q1target_trainvalid.iloc[valid_rows],
    )
    for train_rows, valid_rows in zip(q1_train_index[:5], q1_valid_index[:5])
]

In [None]:
# Write each fold's training features to its own pickle file.
for pickle_path, fold_frame in (
    ('q1features_train_1.pickle', q1features_train_1),
    ('q1features_train_2.pickle', q1features_train_2),
    ('q1features_train_3.pickle', q1features_train_3),
    ('q1features_train_4.pickle', q1features_train_4),
    ('q1features_train_5.pickle', q1features_train_5),
):
    fold_frame.to_pickle(pickle_path)

In [None]:
# Write each fold's validation features to its own pickle file.
for pickle_path, fold_frame in (
    ('q1features_valid_1.pickle', q1features_valid_1),
    ('q1features_valid_2.pickle', q1features_valid_2),
    ('q1features_valid_3.pickle', q1features_valid_3),
    ('q1features_valid_4.pickle', q1features_valid_4),
    ('q1features_valid_5.pickle', q1features_valid_5),
):
    fold_frame.to_pickle(pickle_path)

In [None]:
# Write each fold's training labels to its own pickle file.
for pickle_path, fold_target in (
    ('q1target_train_1.pickle', q1target_train_1),
    ('q1target_train_2.pickle', q1target_train_2),
    ('q1target_train_3.pickle', q1target_train_3),
    ('q1target_train_4.pickle', q1target_train_4),
    ('q1target_train_5.pickle', q1target_train_5),
):
    fold_target.to_pickle(pickle_path)

In [None]:
# Write each fold's validation labels to its own pickle file.
for pickle_path, fold_target in (
    ('q1target_valid_1.pickle', q1target_valid_1),
    ('q1target_valid_2.pickle', q1target_valid_2),
    ('q1target_valid_3.pickle', q1target_valid_3),
    ('q1target_valid_4.pickle', q1target_valid_4),
    ('q1target_valid_5.pickle', q1target_valid_5),
):
    fold_target.to_pickle(pickle_path)