## Creating k-folds for the 30 Days of ML Challenge, by Juan Torres
#### Based on Abhishek Thakur's tutorials and notebooks:
https://www.youtube.com/watch?v=t5fhRP62YdE

https://www.kaggle.com/abhishek/30-days-create-folds

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import model_selection

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/30-days-of-ml/sample_submission.csv
/kaggle/input/30-days-of-ml/train.csv
/kaggle/input/30-days-of-ml/test.csv


In [2]:
# Read the competition's training set from the Kaggle input directory.
train_csv_path = "../input/30-days-of-ml/train.csv"
df_train = pd.read_csv(train_csv_path)

# Show the (rows, columns) dimensions as the cell output.
df_train.shape

(300000, 26)

In [3]:
# Add a "kfold" column that will record which fold each row belongs to.
# Every row starts at -1, a placeholder meaning "not assigned to a fold yet".
df_train = df_train.assign(kfold=-1)

# Preview the first rows of the training data, including the new column.
df_train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,-1
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,-1
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,-1
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,-1
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,-1


In [4]:
# Numbers of folds to generate; one CSV file is written per entry.
fold_nums = [3,5,6,10,12,15,20,25,30,40,50,60]

for num in fold_nums:
    # K-Folds cross-validator. Because shuffle=True, the rows are shuffled
    # (reproducibly, thanks to the fixed random_state) before being divided
    # into `num` folds, so the folds are not consecutive slices of the data.
    kf = model_selection.KFold(n_splits=num, shuffle=True, random_state=1)

    # KFold.split yields *positional* row indices. Collect the fold labels in
    # a positional array instead of writing through label-based .loc, so the
    # assignment stays correct even if df_train's index is not 0..n-1.
    fold_of_row = np.full(len(df_train), -1, dtype=np.int64)
    for fold, (_, valid_indices) in enumerate(kf.split(X=df_train)):
        fold_of_row[valid_indices] = fold

    # Each row must land in exactly one validation fold; -1 would mean a gap.
    assert (fold_of_row >= 0).all(), "every row must be assigned to a fold"
    df_train["kfold"] = fold_of_row

    # Build the output name once and reuse it for the log line and the file.
    out_name = f"train_folds_{num}_folds.csv"
    print(out_name)
    df_train.to_csv(out_name, index=False)

train_folds_3_folds.csv
train_folds_5_folds.csv
train_folds_6_folds.csv
train_folds_10_folds.csv
train_folds_12_folds.csv
train_folds_15_folds.csv
train_folds_20_folds.csv
train_folds_25_folds.csv
train_folds_30_folds.csv
train_folds_40_folds.csv
train_folds_50_folds.csv
train_folds_60_folds.csv
