# Create cross validation set
This notebook generates the cross-validation datasets used by all algorithms, making sure the sets are well balanced in terms of events and labels.

History:
 - V1 Based on https://www.kaggle.com/c/talkingdata-mobile-user-demographics/forums/t/22410/best-single-model?page=2

In [1]:
import os

import pandas as pd
import numpy as np

from sklearn.cross_validation import train_test_split

In [2]:
# Seed handed to every train_test_split call below so the splits are repeatable
rs = 1974

# Source and destination folders (currently the same directory)
dir_in = dir_out = 'data_ori'

# Fraction of rows that train_test_split places in the held-out part
test_size = 0.1

### Load full training data set

In [3]:
# device_id is read as a string in every file. The built-in `str` is used
# instead of the `np.str` alias: the alias was deprecated in NumPy 1.20 and
# removed in 1.24, and it always referred to the built-in type.
train = pd.read_csv(os.path.join(dir_in, 'gender_age_train.csv'), dtype={'device_id': str})
# Remember each row's position in the original file
train['sample_nr'] = train.index

# NOTE(review): the frame loaded here is not used before `test` is reassigned
# by the CV split cell further down - confirm whether this load is still needed.
test = pd.read_csv(os.path.join(dir_in, 'gender_age_test.csv'), dtype={'device_id': str})
test['sample_nr'] = test.index

events = pd.read_csv(os.path.join(dir_in, 'events.csv'), dtype={'device_id': str})
# Distinct device ids that occur in the events table
unique_device_id = events.device_id.unique()

### Create CV sets for all devices
Create train and test sets that are balanced in both the share of devices with events and the distribution of groups.

In [4]:
# NOTE: this cell rebinds `train` and `test`. Re-running it without first
# re-running the load cell would split the already-split `train` again.

# Boolean masks: devices with (1) and without (2) at least one event.
index1 = train.device_id.isin(unique_device_id)
# Vectorised negation of the mask; selects the same rows as negating each
# element in a Python loop, without iterating row by row.
index2 = ~index1

# Sort by device_id before splitting - presumably so the seeded split does not
# depend on the row order of the csv file (TODO confirm).
train1 = train[index1].sort_values(by='device_id').reset_index(drop=True)
train2 = train[index2].sort_values(by='device_id').reset_index(drop=True)

# Split each subset separately, stratified on `group`, so both the
# with-events/without-events ratio and the group distribution carry over
# to the training and test parts.
train1, test1 = train_test_split(train1,
                                 test_size=test_size,
                                 random_state=rs,
                                 stratify=train1['group'].values)
# NOTE(review): the subset without events uses a slightly smaller fraction
# (test_size - 0.001); the reason is not documented here - confirm.
train2, test2 = train_test_split(train2,
                                 test_size=test_size - 0.001,
                                 random_state=rs,
                                 stratify=train2['group'].values)

# Recombine devices with and without events into the final CV sets
train = pd.concat([train1, train2], ignore_index=True) \
          .sort_values(by='device_id').reset_index(drop=True)
test = pd.concat([test1, test2], ignore_index=True) \
         .sort_values(by='device_id').reset_index(drop=True)

In [9]:
# Report the size of each split. Written as single-argument print calls so the
# cell runs, and prints the same text, on both Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print('train: %s' % (train.shape,))
print('test: %s' % (test.shape,))

train: (67229, 5)
test: (7416, 5)


In [10]:
# Write both splits to disk so every other script works from the same partition
for split_frame, csv_name in ((train, 'gender_age_train_cv.csv'),
                              (test, 'gender_age_test_cv.csv')):
    split_frame.to_csv(os.path.join(dir_out, csv_name), index=False)

### Create CV sets for events only
Create train and test sets based only on devices with events.

In [11]:
# Keep only the training rows whose device appears in the events table.
# `isin` can only ever match ids that are present in `train`, so intersecting
# with train's own ids first was redundant; reusing `unique_device_id` also
# avoids recomputing the unique event ids.
train_w = train.loc[train['device_id'].isin(unique_device_id)]

In [12]:
# Random split of the events-only rows, stratified on the `group` label.
# `train_w` is rebound to the training part of the split.
group_labels = train_w['group'].values
train_w, test_w = train_test_split(
    train_w,
    test_size=test_size,
    random_state=rs,
    stratify=group_labels
)

In [13]:
# Write the events-only splits to disk for use by the other scripts
for split_frame, csv_name in ((train_w, 'gender_age_train_cv_w.csv'),
                              (test_w, 'gender_age_test_cv_w.csv')):
    split_frame.to_csv(os.path.join(dir_out, csv_name), index=False)

In [4]:
# Rows of the CV test set whose device appears in the events table.
# `isin` can only match ids present in `test`, so the former intersection with
# test's own ids was redundant; `unique_device_id` is reused instead of
# recomputing the unique event ids.
val_w = test.loc[test['device_id'].isin(unique_device_id)]
val_w.to_csv(os.path.join(dir_out, 'gender_age_val_cv_w.csv'), index=False)

In [5]:
# Shape of the array of distinct device_id values found in the events table
pd.unique(events['device_id']).shape

(60865,)