In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GroupShuffleSplit

from tqdm import tqdm_notebook

import warnings
warnings.simplefilter('ignore')

In [2]:
def get_data():
    train = pd.read_csv("../data/train.tsv", sep="\t")
    test = pd.read_csv("../data/test.csv", sep="\t")
    
    print("Train Shape : {}\nTest Shape :  {}".format(train.shape, test.shape))
    
    return train, test

In [3]:
train, test = get_data()
train.head()

Train Shape : (1200000, 3)
Test Shape :  (92, 3)


Unnamed: 0,title,description,category
0,ZicZac // Black + Red (Euro: 44),Clothing & related products (B2C) - Shoes and ...,R
1,9X9 RESISTA/484938,Publishing/Printing - Printing Services,S
2,Halle Pant - Short Inseam 013049561D0010001_ 02,Clothing & related products (B2C) - General,R
3,Harry Houser Travel Expenses - Meals,Security - personnel,S
4,Tee Time: 740078609 : Greens Fee - Composite,Admissions - Green Fees for Privately Owned Go...,R


#### Dropping the 2 nulls in titles because as per the prrblem statement there is no possibility of null Title's.

In [4]:
train.dropna(inplace=True)

### Stratifying 10-Fold splitting train dataset as it is very large (1.2M)

In [5]:
skf = StratifiedKFold(n_splits=10, random_state=13)

train['fold_id'] = np.nan

for i, (trn_idx, val_idx) in enumerate(skf.split(train, train['category']), 1):
    print("Split : {}".format(i))
    train['fold_id'].iloc[val_idx] = i
    print("--"*30)
train['fold_id'] = train['fold_id'].astype(np.int)

Split : 1
------------------------------------------------------------
Split : 2
------------------------------------------------------------
Split : 3
------------------------------------------------------------
Split : 4
------------------------------------------------------------
Split : 5
------------------------------------------------------------
Split : 6
------------------------------------------------------------
Split : 7
------------------------------------------------------------
Split : 8
------------------------------------------------------------
Split : 9
------------------------------------------------------------
Split : 10
------------------------------------------------------------


In [6]:
train['fold_id'].value_counts()

2     120001
1     120001
6     120000
5     120000
4     120000
3     120000
10    119999
9     119999
8     119999
7     119999
Name: fold_id, dtype: int64

## Creating the Validation Strategy + Splits

As we have very less unique descriptions let's see there is a many-to-one relationship between Description and Target

In [7]:
train['description'].nunique() - train.groupby(['description'])['category'].value_counts().shape[0]

0

It seems we do have a fixed relationship between category and description and if we intend to include description (WE DO) we would have to make a group based validation strategy (group == description) so that we don't end up overfitting on descriptions and not genaralize.

In [8]:
from sklearn.model_selection import GroupShuffleSplit

In [9]:
train['source'] = np.nan

new_df = pd.DataFrame()

for i in tqdm_notebook(train['fold_id'].unique()):
#     print(i)
    gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=13)
    train_subsample = train[train['fold_id'] == i].copy()
    train_subsample.reset_index(drop=True, inplace=True)
    
    for idx, (trn_idx, val_idx) in enumerate(gss.split(
        X=train_subsample, y=train_subsample['category'], groups=train_subsample['description'])):
        
        train_set = train_subsample.iloc[trn_idx]
        valid_set = train_subsample.iloc[val_idx]

        train_subsample['source'].iloc[trn_idx] = "train"
        train_subsample['source'].iloc[val_idx] = "valid"
        
        print(len(set.intersection(set(train_set['description'].values), set(valid_set['description'].values))))
    new_df = pd.concat([new_df, train_subsample], axis=0)
    new_df.reset_index(drop=True, inplace=True)
#     print(new_df.shape)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

0
0
0
0
0
0
0
0
0
0



In [10]:
train = new_df.copy()
del new_df
gc.collect()

train.isnull().sum()

title          0
description    0
category       0
fold_id        0
source         0
dtype: int64

### Subsampling the train to train multiple hypothesis faster.

In [11]:
train_subsample = train[train['fold_id'] == np.random.choice(train['fold_id'].unique())]
train_subsample.reset_index(drop=True, inplace=True)
train_subsample[['title', 'description', 'category', 'source']].to_csv("../data/v2/train_subsample.csv", index=False)
train_subsample.head()

Unnamed: 0,title,description,category,fold_id,source
0,"03/22/18 Stationary Guard Service - Preciado, ...",Security - personnel,S,3,train
1,2c92a09a6ff0d9eb0170001fa1386b8d,Data - processing - electronic output,S,3,train
2,WRLS_C35519-882-17,Internet Access,S,3,train
3,2-4X4/DF AGENT NAME COVER UP AND POSTS REPAINT,Publishing/Printing - Printing Services,S,3,train
4,type=lppremium uid=85987272 dbid=11601222 dbid...,"ASP - hosted software, server not in state",S,3,valid


In [12]:
train.to_csv("../data/v2/train_10_skf.csv", index=False)

In [13]:
del train
del test
del train_subsample

gc.collect()

40