# Feat0 
Basic features

- try to store encode useful features, and doing basic preprocessing
- ignore text data

In [1]:
import pandas as pd 
import numpy as np 
import gc

In [2]:
%%time 
df_train = pd.read_pickle('../input/train.pkl')

Wall time: 5.04 s


In [3]:
%%time
df_test = pd.read_pickle('../input/test.pkl')

Wall time: 1.72 s


In [4]:
df_test.shape

(508438, 17)

In [5]:
df_train.shape

(1503424, 18)

preprocessing
1. `uidx`, `iidx` : re-encode user_id, item_seq_number (replace item_seq_number, user_id to these features) -- some useful info
2. `iid` :　index of item_id (pkey)  --- memory downcast
3. `region_city_label` : encode region_city combination into numerics (encode cnt >50) 
4. `titile length` / `desc length` : length of title, desc (and dont use text data)
5. `weekday`, `day`,`month` : extract from `activation_date`
6. `param_1` --> to category
7. `ads_cnt_by_uid`,`ads_cnt_by_iid`

In [6]:
df_train.dtypes

item_id                         object
user_id                         object
region                        category
city                          category
parent_category_name          category
category_name                 category
param_1                       category
param_2                       category
param_3                       category
title                           object
description                     object
price                          float32
item_seq_number                 uint16
activation_date         datetime64[ns]
user_type                     category
image                           object
image_top_1                    float32
deal_probability               float64
dtype: object

In [7]:
len_trn = df_train.shape[0]
print('length of train',len_trn)

length of train 1503424


In [8]:
df_train = df_train.append(df_test)
print('length of agg train+test',df_train.shape[0])

length of agg train+test 2011862


downcast 

In [9]:
df_train['param_1'] = df_train.param_1.astype('category')
df_train['param_2'] = df_train.param_2.astype('category')
df_train['param_3'] = df_train.param_3.astype('category')

In [10]:
df_train.dtypes

activation_date         datetime64[ns]
category_name                 category
city                            object
deal_probability               float64
description                     object
image                           object
image_top_1                    float32
item_id                         object
item_seq_number                 uint16
param_1                       category
param_2                       category
param_3                       category
parent_category_name          category
price                          float32
region                        category
title                           object
user_id                         object
user_type                     category
dtype: object

###### uidx, iidx, iid

In [11]:
def get_df_matrix_mappings(df, row_name, col_name):
    # Create mappings
    rid_to_idx = {}
    idx_to_rid = {}
    for (idx, rid) in enumerate(df[row_name].unique().tolist()):
        rid_to_idx[rid] = idx
        idx_to_rid[idx] = rid


    cid_to_idx = {}
    idx_to_cid = {}
    for (idx, cid) in enumerate(df[col_name].unique().tolist()):
        cid_to_idx[cid] = idx
        idx_to_cid[idx] = cid


    return rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid

In [12]:
rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid = get_df_matrix_mappings(df_train,'user_id','item_seq_number')
df_trn_uidx = pd.DataFrame()
df_trn_uidx['uidx']= df_train.user_id.map(rid_to_idx)
df_trn_uidx['iidx']= df_train.item_seq_number.map(cid_to_idx)
df_trn_uidx['uid'] = df_train.user_id
df_trn_uidx['iid'] = df_train.item_seq_number

df_train['iidx'] = df_train.item_seq_number.map(cid_to_idx).astype('uint16')
df_train['uidx'] = df_train.user_id.map(rid_to_idx).astype('uint32')

In [13]:
df_train['iid'] = range(df_train.shape[0])

###### region_city_label

In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
def create_label_encoding_with_min_count(df, column, min_count=50):
    column_counts = df.groupby([column])[column].transform("count").astype(int)
    column_values = np.where(column_counts >= min_count, df[column], "")
    df[column+"_label"] = LabelEncoder().fit_transform(column_values)
    
    return df[column+"_label"]

In [18]:
df_train['region_city'] = df_train.groupby(['region','city'])['region'].transform(lambda x:np.random.random()) ## faster and encode it correctly!!
df_train['region_city_label'] = create_label_encoding_with_min_count(df_train, 'region_city', min_count=1).astype('uint16')


###### title length, description length

In [19]:
df_train['title'] = df_train.title.fillna(" ")
df_train['tit_len'] = df_train.title.apply(lambda x:len(x.split())).astype('uint8')

df_train['description'] = df_train.description.fillna(" ")
df_train['desc_len'] = df_train.description.apply(lambda x:len(x.split())).astype('uint16')

##### activation date 
- weekday, 
- day
- month

In [20]:
df_train['weekday'] = df_train.activation_date.dt.weekday.astype('uint8')
df_train['day'] = df_train.activation_date.dt.day.astype('uint8')
df_train['month'] = df_train.activation_date.dt.month.astype('uint8')

#####  ads_cnt_by_uid

In [21]:
print('doing add cnt by user_id...')
tmp = df_train.groupby('uidx').size().to_frame().reset_index().rename(columns={0:'ads_cnt_by_uid'})
tmp['ads_cnt_by_uid'] = tmp.ads_cnt_by_uid.astype('uint32')
df_train = df_train.merge(tmp,how='left' ,on='uidx')

print('doing add cnt by iidx(item_seq_number)...')
tmp = df_train.groupby('iidx').size().to_frame().reset_index().rename(columns={0:'ads_cnt_by_iid'})
tmp['ads_cnt_by_iid'] = tmp.ads_cnt_by_iid.astype('uint32')
df_train =  df_train.merge(tmp,how='left' ,on='iidx')
print('done')
del tmp; gc.collect()

doing add cnt by user_id...
doing add cnt by iidx(item_seq_number)...
done


56

In [22]:
df_train.shape

(2011862, 30)

###### save to pickle

In [23]:
df_test = df_train.iloc[len_trn:,].copy()

In [24]:
df_test.shape

(508438, 30)

In [25]:
df_train = df_train.iloc[:len_trn,]

In [26]:
df_train.shape

(1503424, 30)

In [27]:
df_train.columns

Index(['activation_date', 'category_name', 'city', 'deal_probability',
       'description', 'image', 'image_top_1', 'item_id', 'item_seq_number',
       'param_1', 'param_2', 'param_3', 'parent_category_name', 'price',
       'region', 'title', 'user_id', 'user_type', 'iidx', 'uidx', 'iid',
       'region_city', 'region_city_label', 'tit_len', 'desc_len', 'weekday',
       'day', 'month', 'ads_cnt_by_uid', 'ads_cnt_by_iid'],
      dtype='object')

In [28]:
df_train.memory_usage(deep=True) /(1024*1024)

Index                    11.470215
activation_date          11.470215
category_name             1.438957
city                    168.230429
deal_probability         11.470215
description             630.896501
image                   163.930866
image_top_1               5.735107
item_id                  98.930603
item_seq_number           2.867554
param_1                   2.914475
param_2                   2.901638
param_3                   2.988298
parent_category_name      1.434679
price                     5.735107
region                    1.438120
title                   174.760880
user_id                  98.930603
user_type                 1.433957
iidx                      2.867554
uidx                      5.735107
iid                       5.735107
region_city              11.470215
region_city_label         2.867554
tit_len                   1.433777
desc_len                  2.867554
weekday                   1.433777
day                       1.433777
month               

In [29]:
df_train.dtypes

activation_date         datetime64[ns]
category_name                 category
city                            object
deal_probability               float64
description                     object
image                           object
image_top_1                    float32
item_id                         object
item_seq_number                 uint16
param_1                       category
param_2                       category
param_3                       category
parent_category_name          category
price                          float32
region                        category
title                           object
user_id                         object
user_type                     category
iidx                            uint16
uidx                            uint32
iid                              int32
region_city                    float64
region_city_label               uint16
tit_len                          uint8
desc_len                        uint16
weekday                  

In [30]:
selcols = [    
    'uidx','iidx','iid','region_city_label', ## added encoded col --
    'tit_len','desc_len', ## len of desc, title
    'activation_date', ## act time
    'month','day','weekday', ## date 
    'param_1','param_2','param_3', ## model_params
    'user_type', # user
    'parent_category_name','price','category_name','image_top_1', ## items info
    'ads_cnt_by_uid', 'ads_cnt_by_iid' ## group by count
#     'deal_probability'  # y_target
] 
# df_train[selcols].head().T

In [34]:
y_cols = ['deal_probability']
# df_train[selcols + y_cols].head().T
df_trn_feat0 = df_train[selcols+y_cols].copy()
df_test_feat0 = df_test[selcols].copy()

In [41]:
print('trn  shape',df_trn_feat0.shape)
print('test shape',df_test_feat0.shape)

trn  shape (1503424, 21)
test shape (508438, 20)


In [40]:
df_trn_feat0.to_pickle('../input/feats/df_trn_feat0.pkl')
df_test_feat0.to_pickle('../input/feats/df_test_feat0.pkl')