This notebook reads the training data, keeps the `click` label and the low-cardinality categorical columns, and transforms them into a sparse matrix of one-hot encoded features.

In [1]:
import pandas as pd
import numpy as np
import scipy
import pickle
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the whole training set from the gzip-compressed CSV into memory.
train_df = pd.read_csv('train.gz')

In [3]:
# Preview the first rows of the raw frame to check the columns and their values.
train_df.head()

Unnamed: 0,id,click,hour,C1,banner_pos,site_id,site_domain,site_category,app_id,app_domain,...,device_type,device_conn_type,C14,C15,C16,C17,C18,C19,C20,C21
0,1.000009e+18,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,2,15706,320,50,1722,0,35,-1,79
1,1.000017e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
2,1.000037e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15704,320,50,1722,0,35,100084,79
3,1.000064e+19,0,14102100,1005,0,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,...,1,0,15706,320,50,1722,0,35,100084,79
4,1.000068e+19,0,14102100,1005,1,fe8cc448,9166c161,0569f928,ecad2386,7801e8d9,...,1,0,18993,320,50,2161,0,35,-1,157


In [4]:
# Count the distinct values in every column; the counts are used below to
# decide which columns are narrow enough to one-hot encode.
print(train_df.nunique())

id                  40428967
click                      2
hour                     240
C1                         7
banner_pos                 7
site_id                 4737
site_domain             7745
site_category             26
app_id                  8552
app_domain               559
app_category              36
device_id            2686408
device_ip            6729486
device_model            8251
device_type                5
device_conn_type           4
C14                     2626
C15                        8
C16                        9
C17                      435
C18                        4
C19                       68
C20                      172
C21                       60
dtype: int64


In [5]:
# Keep the `click` label plus only the columns with few distinct values
# (at most 36 each); the high-cardinality columns are left out so the
# one-hot encoded matrix stays narrow.
train_df = train_df.loc[:, [
    'click', 'C1', 'banner_pos', 'site_category', 'app_category',
    'device_type', 'device_conn_type', 'C15', 'C16', 'C18',
]]

In [6]:
# Preview the reduced frame to confirm that only the selected columns remain.
train_df.head()

Unnamed: 0,click,C1,banner_pos,site_category,app_category,device_type,device_conn_type,C15,C16,C18
0,0,1005,0,28905ebd,07d7df22,1,2,320,50,0
1,0,1005,0,28905ebd,07d7df22,1,0,320,50,0
2,0,1005,0,28905ebd,07d7df22,1,0,320,50,0
3,0,1005,0,28905ebd,07d7df22,1,0,320,50,0
4,0,1005,1,0569f928,07d7df22,1,0,320,50,0


In [7]:
#using one-hot encoding
# Every column of train_df is treated as categorical; drop='first' removes the
# first category of each column so the resulting indicator columns are not
# perfectly collinear.
#
# The `sparse=True` argument is intentionally omitted: sparse output is the
# default, and the keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4, so leaving it out works on both old and new releases.
#
# NOTE(review): train_df still contains the `click` label, so it is encoded as
# a column of data_sparse as well -- confirm the consumer of the saved matrix
# separates it from the features.
encoder = OneHotEncoder(drop='first')
data_sparse = encoder.fit_transform(train_df)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; prefer the new method and fall back to the
# old one only on releases that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    feature_names = encoder.get_feature_names_out(train_df.columns)
else:
    feature_names = encoder.get_feature_names(train_df.columns)

In [8]:
# Persist the one-hot encoded matrix in SciPy's .npz sparse format so it can
# be reloaded later with scipy.sparse.load_npz.
scipy.sparse.save_npz(file='data_sparse.npz', matrix=data_sparse)

In [9]:
# Persist the encoded column names next to the matrix.
# NOTE: despite the ".txt" suffix this file is a binary pickle, not plain
# text; read it back with pickle.load on a handle opened in "rb" mode.
with open("feature_names.txt", mode="wb") as names_file:
    pickle.dump(feature_names, names_file)