Build the LightGBM model using the following features

1. lag/mean encoding (numerical) features 
2. text <-- What we are going to build in this notebook
    - tfidf + truncated svd
    - hashing  + truncated svd 

In [1]:
# Core data libraries used throughout the notebook.
import pandas as pd 
import numpy as np 
# NOTE(review): lightgbm and mean_squared_error are not referenced by the cells
# shown here; presumably kept for a later modelling step — confirm before removing.
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
# gc is used for explicit collections after large intermediates are deleted.
import gc
# Automatic collection is already on by default; this call only matters if it
# had been disabled earlier in the session.
gc.enable()

In [2]:
# Load the precomputed numerical feature splits and their targets.
# The store is opened read-only: the default mode ('a') opens the file for
# writing and creates a new empty file when the path does not exist, neither of
# which is wanted for a cell that only reads.
with pd.HDFStore('../data/feat/data.h5', mode='r') as store:
    print(store.keys())
    X_train = store['X_train']
    X_cv = store['X_cv']
    y_train = store['y_train']
    y_cv = store['y_cv']
    X_test = store['X_test']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [3]:
# Cap both targets to the closed interval [0, 20].
# NOTE(review): presumably this mirrors the range used at evaluation time — confirm.
y_train = y_train.clip(lower=0, upper=20)
y_cv = y_cv.clip(lower=0, upper=20)

## Text data processing
Process the raw text with TF-IDF (plain and binarized) and hashing (plain and binarized) vectorizers, each followed by truncated SVD.

In [5]:
# Load the raw text columns for the test, cv and train splits.
# Read-only mode: the default ('a') would open the file for writing and create
# an empty store if the path were wrong.
with pd.HDFStore('../data/feat/text_feats.h5', mode='r') as text_io:
    print(text_io.keys())
    X_text_feats_test = text_io['X_text_feats_test']
    X_text_feats_cv = text_io['X_text_feats_cv']
    X_text_feats_train = text_io['X_text_feats_train']

['/X_text_feats', '/X_text_feats_cv', '/X_text_feats_test', '/X_text_feats_train']


In [6]:
# Display the (rows, columns) shape of the training text frame as a quick sanity check.
X_text_feats_train.shape

(10675632, 4)

In [13]:
def _join_text_columns(frame):
    """Return, per row, item name, category name and shop name joined by single spaces."""
    return frame['item_name'] + ' ' + frame['item_category_name'] + ' ' + frame['shop_name']


# Train rows come first, then cv rows; the test frame is used as-is.
train_text_df = pd.concat([X_text_feats_train, X_text_feats_cv])
test_text_df = X_text_feats_test

# One free-text string per row for each of the two groups.
train_texts = _join_text_columns(train_text_df)
test_texts = _join_text_columns(test_text_df)

In [18]:
# Stack train+cv texts followed by test texts into a single Series; wrapping the
# concatenated array in a new Series gives it a fresh default index.
all_texts = pd.Series(
    np.concatenate([train_texts, test_texts], axis=0)
)

# The per-group frames and Series are not needed once everything is stacked.
del train_text_df, test_text_df, train_texts, test_texts
gc.collect()
all_texts.shape

(11128004,)

In [19]:
# Drop the raw text frames too; the cell output is the number of unreachable
# objects the collector found.
del X_text_feats_train, X_text_feats_cv, X_text_feats_test
gc.collect()

29

### 1. Tfidf + truncated svd

In [20]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

## tfidf: unigrams and bigrams, original casing kept
tv = TfidfVectorizer(ngram_range=(1, 2), lowercase=False)
tv_features = tv.fit_transform(all_texts)
print('shape of tfidf Vectorizer:{}'.format(tv_features.shape))

## svd: project the sparse TF-IDF matrix onto 20 dense components
svd = TruncatedSVD(random_state=0, n_components=20)
tv_svd_features = svd.fit_transform(tv_features)

## memory used by the dense projection, in GiB
print('memory used (tv_svd_features):{}'.format(sys.getsizeof(tv_svd_features) / 1024 ** 3))

shape of tfidf Vectorizer:(11128004, 77634)
memory used (tv_svd_features):1.6582019180059433


In [21]:
# Release only the fitted SVD. tv_features is deliberately kept alive: the
# binarized TF-IDF features computed next are derived from it.
# (Writing `del svd; tv_features` does not delete tv_features — the `;` ends the
# del statement and the bare name is a no-op — so it is left out to avoid
# suggesting otherwise.)
del svd
gc.collect()

897

### 2. Binarized Tfidf + truncated svd

In [22]:
## binary tfidf
tvb_features = tv_features.astype(bool).astype(float)

## svd
svd = TruncatedSVD(n_components=20, random_state=0)
tvb_svd_features = svd.fit_transform(tvb_features)
print('shape of tvb features:{}'.format(tvb_svd_features.shape))
print('memory used (tvb_svd_features):{}'.format(sys.getsizeof(tvb_svd_features)/(1024*1024*1024)))

shape of tvb features:(11128004, 20)
memory used (tvb_svd_features):1.6582019180059433


In [23]:
# Release the fitted SVD and both sparse TF-IDF matrices. tv_features was last
# needed to derive tvb_features, and only the dense *_svd_features arrays are
# used from here on, so holding it would just waste memory.
del svd, tvb_features, tv_features
gc.collect();

### 3. Hashing + truncated svd

In [27]:
from sklearn.feature_extraction.text import HashingVectorizer

# Hash unigrams and bigrams (original casing kept) into a sparse CSR matrix.
hv = HashingVectorizer(lowercase=False, ngram_range=(1, 2))
hv_features = hv.fit_transform(all_texts).tocsr()
print('shape of hv features:{}'.format(hv_features.shape))

# Project the hashed matrix onto 20 dense components.
svd = TruncatedSVD(random_state=0, n_components=20)
hv_svd_features = svd.fit_transform(hv_features)

# Size of the dense projection in GiB.
print('memory used (hv_svd_features):{:.3f}'.format(sys.getsizeof(hv_svd_features) / 1024 ** 3))

shape of hv features:(11128004, 1048576)
memory used (hv_svd_features):1.658


### 4. Binary Hashing + Truncated svd

In [28]:
# Binarize the hashed matrix: stored non-zero entries become 1.0.
hvb_features = hv_features.astype(bool).astype(float)

# Project onto 20 dense components, as for the other three text representations.
svd = TruncatedSVD(n_components=20, random_state=0)
hvb_svd_features = svd.fit_transform(hvb_features)

# Size of the dense projection in GiB.
print(sys.getsizeof(hvb_svd_features)/(1024*1024*1024))

# This is the last use of either sparse hashed matrix and of the fitted SVD, so
# release all three before the dense arrays are stacked into one large array.
del hvb_features, hv_features, svd
gc.collect()

1.6582019180059433


65

### 5. Stack 1-4 and save 

In [29]:
# Place the four 20-component projections side by side (column-wise).
text_features = np.concatenate(
    [tv_svd_features, tvb_svd_features, hv_svd_features, hvb_svd_features],
    axis=1,
)

# Report the size in GiB with three decimals. The spec must be `.3f` (precision);
# `3f` would only set a minimum field width of 3 and print six decimals.
print('memory used for all text_features:{:.3f}'.format(sys.getsizeof(text_features)/(1024*1024*1024)))

memory used for all text_features:6.632807


In [30]:
# The four partial projections now live inside text_features; drop the originals.
del tv_svd_features, tvb_svd_features, hv_svd_features, hvb_svd_features
gc.collect()

3543

In [31]:
# Wrap the stacked array in a DataFrame with columns text_f_0 .. text_f_{k-1}.
# The column count is taken from the array itself, so changing n_components or
# the number of stacked representations cannot desynchronise the labels.
text_features_df = pd.DataFrame(
    data=text_features,
    columns=['text_f_'+str(i) for i in range(text_features.shape[1])],
)

In [32]:
# Preview the first rows of the text feature table.
text_features_df.head()

Unnamed: 0,text_f_0,text_f_1,text_f_2,text_f_3,text_f_4,text_f_5,text_f_6,text_f_7,text_f_8,text_f_9,...,text_f_70,text_f_71,text_f_72,text_f_73,text_f_74,text_f_75,text_f_76,text_f_77,text_f_78,text_f_79
0,0.277484,-0.173652,-0.06324,-0.090207,-0.023441,-0.013428,-0.019452,0.007843,-0.021224,0.023921,...,-0.03381,-0.234008,-0.110437,-0.110263,-0.022642,-0.003122,0.056965,0.105615,0.012552,0.030759
1,0.039884,0.02749,0.048064,-0.010053,-0.005663,-0.008881,-0.013801,0.008955,-0.008782,0.018617,...,0.050828,-0.241337,-0.108003,-0.252701,-0.059449,0.121994,-0.172504,0.083715,-0.342954,0.62724
2,0.029936,0.017483,0.032786,-0.004073,-0.004317,-0.006618,-0.011949,0.008032,-0.008361,0.019091,...,0.053172,-0.237839,-0.102671,-0.253724,-0.060433,0.113566,-0.16443,0.098105,-0.327809,0.576223
3,0.065214,0.058522,0.142886,-0.053096,-0.019396,-0.012969,-0.012155,0.005954,-0.010409,0.008706,...,0.017964,-0.241615,-0.115704,-0.12515,-0.025767,0.021801,-0.000432,0.083451,-0.112115,0.189098
4,0.043049,0.019077,0.038894,0.023213,-0.013758,-0.010637,-0.013122,0.005971,-0.012691,0.01775,...,0.027809,-0.240677,-0.094627,-0.177729,-0.043124,0.047342,-0.066383,0.108523,-0.167366,0.356736


In [33]:
# Persist the text feature table under the key 'text_feats_df'.
# The context manager closes the store even if the write raises, matching how
# the stores are opened when loading data earlier in the notebook.
with pd.HDFStore('../data/feat/text_feat_df_all.h5') as text_io:
    text_io['text_feats_df'] = text_features_df

#### merge all features in a single table

In [36]:
# The text table must have exactly one row per train, cv and test row combined.
len(text_features_df) == sum(len(part) for part in (X_train, X_cv, X_test))

True

In [40]:
# Stack the numeric feature splits row-wise in train, cv, test order — the same
# order in which the texts were stacked. Each split keeps its own index labels.
merge = pd.concat(objs=[X_train, X_cv, X_test], axis=0)


In [44]:
# Append the text features as extra columns (axis=1).
# Row i of text_features_df belongs to row i of the stacked numeric frame, so
# the join has to be positional. pd.concat(axis=1) aligns on index labels, which
# is only equivalent when both frames carry the same labels in the same order.
if len(merge) != len(text_features_df):
    raise ValueError(
        'row count mismatch: {} numeric rows vs {} text rows'.format(
            len(merge), len(text_features_df)
        )
    )

if merge.index.equals(text_features_df.index):
    # Labels already agree: plain label alignment is positional alignment.
    merge = pd.concat([merge, text_features_df], axis=1)
else:
    # Labels differ (e.g. duplicated or offset labels from the stacked splits):
    # align strictly by position instead of by label.
    merge = pd.concat(
        [merge.reset_index(drop=True), text_features_df.reset_index(drop=True)],
        axis=1,
    )

In [46]:
# Preview the first rows, transposed so each feature is shown on its own line.
merge.head().T

Unnamed: 0,0,1,2,3,4
shop_id,59.000000,59.000000,59.000000,59.000000,59.000000
item_id,22154.000000,2552.000000,2554.000000,2555.000000,2564.000000
date_block_num,0.000000,0.000000,0.000000,0.000000,0.000000
item_category_id,37.000000,58.000000,58.000000,56.000000,59.000000
item_id_avg_item_price_lag_1,,,,,
item_id_sum_item_cnt_day_lag_1,,,,,
item_id_avg_item_cnt_day_lag_1,,,,,
shop_id_avg_item_price_lag_1,,,,,
shop_id_sum_item_cnt_day_lag_1,,,,,
shop_id_avg_item_cnt_day_lag_1,,,,,


#### Save the merged table as `all_feats_df_all`

In [47]:
# Persist the merged numeric + text feature table under the key 'all_feats_df_all'.
# The context manager closes the store even if the write raises, matching how
# the stores are opened when loading data earlier in the notebook.
with pd.HDFStore('../data/feat/all_feat_df_all.h5') as text_io:
    text_io['all_feats_df_all'] = merge

_____