In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit

In [None]:
# Read the combined training/validation feature table from its pickle file.
# pd.read_pickle unpickles the file, so only point it at files you trust.
features_pickle_path = "q3features_trainvalid.pickle"
q3features_trainvalid = pd.read_pickle(features_pickle_path)

In [None]:
#Sorting data by participant sequence number (by date), earliest first.
# TimeSeriesSplit trains on the leading rows of the frame and validates on the
# rows that follow them, so the rows must be in ascending chronological order
# for every validation fold to come after its training fold. Sorting in
# descending order would train on later records and validate on earlier ones.
# NOTE(review): relies on SEQNO increasing with survey date - confirm.
q3features_trainvalid = q3features_trainvalid.sort_values(by=["SEQNO"], ascending=True)

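Overfitting occurs when a machine learning model fits its training data too closely, so that it performs well on the training data but poorly on data that it was not trained on. Cross-validation can help detect, and therefore prevent, overfitting by evaluating the model on data that was held out from training.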

Not all BRFSS surveys were conducted on the same date, so there is a time series aspect to the dataset. Sliding window validation (scikit-learn's `TimeSeriesSplit`, in which the training window grows with each successive split) was therefore conducted instead of the standard k-fold cross-validation methodology. This validation method ensures that the validation dataset always occurs chronologically after the training dataset: the model cannot 'see the future' (which would artificially improve model performance).

In [None]:
#Splitting data for sliding window cross-validation.
# TimeSeriesSplit produces (train, validation) pairs of positional row indices
# in which the validation positions always follow the training positions.
# The cells below index the resulting folds 0-4, so exactly five splits are
# expected here.
tscv = TimeSeriesSplit(n_splits=5)

In [None]:
# Materialise every (train, validation) pair of positional indices produced by
# the splitter, then separate them into one list per role. Entry k of each
# list belongs to fold k + 1.
fold_indices = list(tscv.split(q3features_trainvalid))

q3_train_index = [train_rows for train_rows, _ in fold_indices]
q3_valid_index = [valid_rows for _, valid_rows in fold_indices]

In [None]:
# Build the five training and five validation feature tables from the
# positional indices collected from the splitter (one pair per fold).
def _feature_rows(positions):
    """Return the rows of q3features_trainvalid at the given integer positions."""
    return q3features_trainvalid.iloc[positions, :]


# Fold 1
q3features_train_1 = _feature_rows(q3_train_index[0])
q3features_valid_1 = _feature_rows(q3_valid_index[0])

# Fold 2
q3features_train_2 = _feature_rows(q3_train_index[1])
q3features_valid_2 = _feature_rows(q3_valid_index[1])

# Fold 3
q3features_train_3 = _feature_rows(q3_train_index[2])
q3features_valid_3 = _feature_rows(q3_valid_index[2])

# Fold 4
q3features_train_4 = _feature_rows(q3_train_index[3])
q3features_valid_4 = _feature_rows(q3_valid_index[3])

# Fold 5
q3features_train_5 = _feature_rows(q3_train_index[4])
q3features_valid_5 = _feature_rows(q3_valid_index[4])

In [None]:
#Also splitting the labels for the target variable.
# NOTE(review): q3target_trainvalid is not loaded anywhere else in this
# notebook, so a fresh kernel would fail here with a NameError. The file name
# below is assumed from the naming convention of the features pickle - confirm.
q3target_trainvalid = pd.read_pickle("q3target_trainvalid.pickle")

# The fold indices are row positions in the SEQNO-sorted feature table. The
# labels are therefore put into the same row order (matched on index label)
# before they are sliced by position; otherwise features and labels from
# different participants would be paired up.
# NOTE(review): assumes features and target share the same unique index
# labels - the assertion below fails if they do not.
q3target_aligned = q3target_trainvalid.loc[q3features_trainvalid.index]
assert q3target_aligned.index.equals(q3features_trainvalid.index), (
    "target labels could not be aligned one-to-one with the feature rows"
)

q3target_train_1 = q3target_aligned.iloc[q3_train_index[0]]
q3target_valid_1 = q3target_aligned.iloc[q3_valid_index[0]]

q3target_train_2 = q3target_aligned.iloc[q3_train_index[1]]
q3target_valid_2 = q3target_aligned.iloc[q3_valid_index[1]]

q3target_train_3 = q3target_aligned.iloc[q3_train_index[2]]
q3target_valid_3 = q3target_aligned.iloc[q3_valid_index[2]]

q3target_train_4 = q3target_aligned.iloc[q3_train_index[3]]
q3target_valid_4 = q3target_aligned.iloc[q3_valid_index[3]]

q3target_train_5 = q3target_aligned.iloc[q3_train_index[4]]
q3target_valid_5 = q3target_aligned.iloc[q3_valid_index[4]]

In [None]:
# Write each training feature fold to its own pickle file.
train_feature_outputs = [
    (q3features_train_1, 'q3features_train_1.pickle'),
    (q3features_train_2, 'q3features_train_2.pickle'),
    (q3features_train_3, 'q3features_train_3.pickle'),
    (q3features_train_4, 'q3features_train_4.pickle'),
    (q3features_train_5, 'q3features_train_5.pickle'),
]
for fold_frame, out_path in train_feature_outputs:
    fold_frame.to_pickle(out_path)

In [None]:
# Write each validation feature fold to its own pickle file.
valid_feature_outputs = [
    (q3features_valid_1, 'q3features_valid_1.pickle'),
    (q3features_valid_2, 'q3features_valid_2.pickle'),
    (q3features_valid_3, 'q3features_valid_3.pickle'),
    (q3features_valid_4, 'q3features_valid_4.pickle'),
    (q3features_valid_5, 'q3features_valid_5.pickle'),
]
for fold_frame, out_path in valid_feature_outputs:
    fold_frame.to_pickle(out_path)

In [None]:
# Write each training label fold to its own pickle file.
train_target_outputs = [
    (q3target_train_1, 'q3target_train_1.pickle'),
    (q3target_train_2, 'q3target_train_2.pickle'),
    (q3target_train_3, 'q3target_train_3.pickle'),
    (q3target_train_4, 'q3target_train_4.pickle'),
    (q3target_train_5, 'q3target_train_5.pickle'),
]
for fold_labels, out_path in train_target_outputs:
    fold_labels.to_pickle(out_path)

In [None]:
# Write each validation label fold to its own pickle file.
valid_target_outputs = [
    (q3target_valid_1, 'q3target_valid_1.pickle'),
    (q3target_valid_2, 'q3target_valid_2.pickle'),
    (q3target_valid_3, 'q3target_valid_3.pickle'),
    (q3target_valid_4, 'q3target_valid_4.pickle'),
    (q3target_valid_5, 'q3target_valid_5.pickle'),
]
for fold_labels, out_path in valid_target_outputs:
    fold_labels.to_pickle(out_path)