# Split data into k-folds

- This notebook consists of two parts in preparing for k-fold cross validation
- First, we will split the data into k folds and extract the training and test sets for each fold
- Then, we will conduct undersampling of the '0' class on the training sets
- The training and test sets for each fold will be saved in an excel file.

In [1]:
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# import data and set the range to be number of rows
# ***************** # IMPORT DATASET
df_2009 = pd.read_excel(r'.\train and test processed.xlsx')
df_2009.head()

## Splitting data into k folds

In [3]:
# to demonstrate how the data are split, we will create 3 and 5 folds
# KFold function has to be applied on the data and it returns a location
kf5 = KFold(n_splits=5, shuffle =False)
kf3 = KFold(n_splits=3, shuffle =False)

In [4]:
def kfoldize(kf, df):
    train = pd.DataFrame()
    test = pd.DataFrame()
    i=1
    for train_index, test_index in kf.split(range(len(df))):
        train_df = df.iloc[train_index]
        train_df["val"]=i
        train = train.append(train_df)
        
        test_df = df.iloc[test_index]
        test_df["val"]=i
        test = test.append(test_df)
        i+= 1
    return train, test

In [5]:
train5, test5= kfoldize(kf5, df_2009)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
train5

In [None]:
test5

## Undersample test datasets

- In this study, we will only sample the training set for each fold

In [8]:
# Define the training dataframes

train_fold_1 = train5[train5['val']==1]
train_fold_2 = train5[train5['val']==2]
train_fold_3 = train5[train5['val']==3]
train_fold_4 = train5[train5['val']==4]
train_fold_5 = train5[train5['val']==5]

In [9]:
# Define the testing dataframes

test_fold_1 = test5[test5['val']==1]
test_fold_2 = test5[test5['val']==2]
test_fold_3 = test5[test5['val']==3]
test_fold_4 = test5[test5['val']==4]
test_fold_5 = test5[test5['val']==5]

In [10]:
# Randomly sample the negative examples

def undersample(df):
    positive = df[df['label']==1]
    negative = df[df['label']==0]
    negative_us = negative.sample(n=len(positive), replace=False, random_state=1)
    us = positive.append(negative_us)
    return us

In [11]:
# Apply undersampling function on all 5 training sets

train_fold_1_us = undersample(train_fold_1)
train_fold_2_us = undersample(train_fold_2)
train_fold_3_us = undersample(train_fold_3)
train_fold_4_us = undersample(train_fold_4)
train_fold_5_us = undersample(train_fold_5)

In [None]:
train_fold_1_us

In [None]:
train_fold_1[train_fold_1['label']==1]

In [14]:
train_fold_5['label'].describe()

count    4071.000000
mean        0.132891
std         0.339499
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64

In [15]:
train_fold_1_us['label'].describe()

count    1086.00000
mean        0.50000
std         0.50023
min         0.00000
25%         0.00000
50%         0.50000
75%         1.00000
max         1.00000
Name: label, dtype: float64

## Save to excel

In [16]:
writer = pd.ExcelWriter('train_k_fold.xlsx')

train_fold_1_us.to_excel(writer, sheet_name = 'train_1',index=False)
test_fold_1.to_excel(writer, sheet_name = 'test_1',index=False)
train_fold_2_us.to_excel(writer, sheet_name = 'train_2',index=False)
test_fold_2.to_excel(writer, sheet_name = 'test_2',index=False)
train_fold_3_us.to_excel(writer, sheet_name = 'train_3',index=False)
test_fold_3.to_excel(writer, sheet_name = 'test_3',index=False)
train_fold_4_us.to_excel(writer, sheet_name = 'train_4',index=False)
test_fold_4.to_excel(writer, sheet_name = 'test_4',index=False)
train_fold_5_us.to_excel(writer, sheet_name = 'train_5',index=False)
test_fold_5.to_excel(writer, sheet_name = 'test_5',index=False)

writer.save()