most of code from 
[here](https://www.kaggle.com/demery/lightgbm-with-ridge-feature/code)

In [2]:
import pandas as pd 
import numpy as np 
import re


In [2]:
%%time
df_train = pd.read_pickle('../input/train.pkl')
df_test = pd.read_pickle('../input/test.pkl')

Wall time: 9.27 s


In [3]:
df_train.columns

Index(['item_id', 'user_id', 'region', 'city', 'parent_category_name',
       'category_name', 'param_1', 'param_2', 'param_3', 'title',
       'description', 'price', 'item_seq_number', 'activation_date',
       'user_type', 'image', 'image_top_1', 'deal_probability'],
      dtype='object')

In [4]:
## copy from https://www.kaggle.com/demery/lightgbm-with-ridge-feature/code#L72-L81
def cleanName(text):
    try:
        textProc = text.lower()
        textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        regex = re.compile(u'[^[:alpha:]]')
        textProc = regex.sub(" ", textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"

In [5]:
df_train['title'] = df_train.title.apply(lambda x: cleanName(x))
df_test['title']  = df_test.title.apply(lambda x: cleanName(x))

df_train['description'] = df_train.description.apply(lambda x:cleanName(x))
df_test['description'] = df_test.description.apply(lambda x:cleanName(x))

In [7]:
title_text_raw = df_train.title.append(df_test.title)
title_text_raw.reset_index(drop=True,inplace=True)

In [11]:
df = pd.concat([df_train,df_test],axis=0)
df['text_feat'] = df.apply(lambda row: ' '.join([
    str(row['param_1']), 
    str(row['param_2']), 
    str(row['param_3'])]),axis=1) # Group Param Features

df.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

# Meta Text Features
textfeats = ["description","text_feat", "title"]

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words


In [6]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords 
from sklearn.decomposition import TruncatedSVD

In [12]:
def get_col(col_name): return lambda x: x[col_name]

## copy from https://www.kaggle.com/demery/lightgbm-with-ridge-feature

russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}


vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=50000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('text_feat',CountVectorizer(
            ngram_range=(1, 2),
            #max_features=7000,
            preprocessor=get_col('text_feat'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,            
            preprocessor=get_col('title')))
    ])

# vectorizer.fit(df.to_dict('records'))
df_transform = vectorizer.fit_transform(df.to_dict('records'))

In [17]:
tfvocab = vectorizer.get_feature_names()

In [23]:
from scipy.sparse import csr_matrix,hstack

In [21]:
traindex = df_train.index
testdex = df_test.index

In [39]:
df.drop(textfeats,axis=1,inplace=True)

In [44]:
df.drop(['activation_date','image'],axis=1,inplace=True)

In [48]:
csr_trn_text_feats = df_transform[:df_train.shape[0],:]
csr_test_text_feats = df_transform[df_train.shape[0]:,:]

In [49]:
csr_test_text_feats

<508438x1407920 sparse matrix of type '<class 'numpy.float64'>'
	with 19344390 stored elements in Compressed Sparse Row format>

In [50]:
csr_trn_text_feats

<1503424x1407920 sparse matrix of type '<class 'numpy.float64'>'
	with 55073494 stored elements in Compressed Sparse Row format>

TSVD 
n_ = 5

In [51]:
import scipy.sparse as sp

In [56]:
tsvd_text = TruncatedSVD(n_components=5,random_state=0)

In [57]:
df_text_all_svd = tsvd_text.fit_transform(df_transform)

In [62]:
np.cumsum(tsvd_text.explained_variance_ratio_)

array([ 0.134552  ,  0.20039762,  0.25646434,  0.2811317 ,  0.30250088])

In [71]:
df_text_feats = pd.DataFrame(df_text_all_svd)
df_text_feats.columns = ['text_feat_{}'.format(e) for e in range(5)]
df_trn_text_feats = df_text_feats.iloc[:df_train.shape[0],:]
df_test_text_feats = df_text_feats.iloc[df_train.shape[0]:,:]

In [75]:
print('trn text feat shape:',df_trn_text_feats.shape)
print('test text feat shape:',df_test_text_feats.shape)

trn text feat shape: (1503424, 5)
test text feat shape: (508438, 5)


save

In [77]:
store = pd.HDFStore('../input/feats/txt_svd_feat2.h5')
store['df_trn_text_feats'] = df_trn_text_feats
store['df_test_text_feats'] = df_test_text_feats
store.close()

load

In [3]:
%%time 
with pd.HDFStore('../input/feats/txt_svd_feat2.h5') as store:
    print(store.keys())
    df_trn_text_feat  = store['df_trn_text_feats']
    df_test_text_feat = store['df_test_text_feats']

['/df_test_text_feats', '/df_trn_text_feats']
Wall time: 179 ms


In [6]:
df_trn_text_feat = df_trn_text_feat.astype('float16')
df_test_text_feat = df_test_text_feat.astype('float16')

In [40]:
pipe = Pipeline([
    ('txt_feats', vectorizer),
    ('tsvd',TruncatedSVD(random_state=0))
])

In [41]:
tmp = pipe.fit_transform(df_train.head().to_dict('rec'))

In [49]:
tsvd = pipe.get_params()['tsvd']

In [50]:
tsvd.explained_variance_ratio_

array([ 0.16893574,  0.2       ])

### save hdf5

In [24]:
all_text_feats =  np.concatenate([tv_svd_feats,hv_svd_features,tv_svd_feats1,hv_svd_features1],axis=1)

In [25]:
df_text_feats = pd.DataFrame(all_text_feats)
colnames = ['title_tfidf_{}'.format(e) for e in range(5)] + ['title_hash_{}'.format(i) for i in range(5)] + \
            ['desc_tfidf_{}'.format(e) for e in range(5)] + ['desc_hash_{}'.format(i) for i in range(5)] 
df_text_feats.columns = colnames

In [26]:
df_text_feats.head().T

Unnamed: 0,0,1,2,3,4
title_tfidf_0,3.155938e-06,2.005842e-05,9.995545e-06,5.558904e-05,-3.539002e-06
title_tfidf_1,8.473042e-06,0.0001133695,0.0001629503,0.0003003903,8.301491e-05
title_tfidf_2,1.255558e-05,7.503001e-05,1.882395e-05,2.437479e-05,8.447903e-05
title_tfidf_3,1.057398e-05,4.090454e-05,7.641594e-05,0.0001652462,0.0002656529
title_tfidf_4,5.858753e-06,1.590257e-05,7.149039e-05,-4.580733e-05,8.570438e-05
title_hash_0,0.0004894437,0.0006017932,1.424015e-05,2.587222e-05,2.56736e-06
title_hash_1,0.02333327,0.02780037,4.522878e-05,0.0001614659,-7.717952e-06
title_hash_2,0.03659733,0.04402271,0.0001751792,0.000262371,7.541814e-05
title_hash_3,0.2834564,0.3382862,0.0004820311,0.001199343,0.000140532
title_hash_4,-0.1910312,-0.2278477,0.0001803965,0.001410686,0.0001864875


In [27]:
def downcast(df):
    for idx, col in enumerate(df):
        if df.iloc[:,idx].dtypes == 'float64':
            df[col] = df[col].astype('float32')
        else:
            pass    

In [28]:
downcast(df_text_feats)

In [29]:
df_text_feats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2011862 entries, 0 to 2011861
Data columns (total 20 columns):
title_tfidf_0    float32
title_tfidf_1    float32
title_tfidf_2    float32
title_tfidf_3    float32
title_tfidf_4    float32
title_hash_0     float32
title_hash_1     float32
title_hash_2     float32
title_hash_3     float32
title_hash_4     float32
desc_tfidf_0     float32
desc_tfidf_1     float32
desc_tfidf_2     float32
desc_tfidf_3     float32
desc_tfidf_4     float32
desc_hash_0      float32
desc_hash_1      float32
desc_hash_2      float32
desc_hash_3      float32
desc_hash_4      float32
dtypes: float32(20)
memory usage: 153.5 MB


In [30]:
df_text_feats.shape

(2011862, 20)

In [31]:
len_trn = df_train.shape[0]

In [32]:
store = pd.HDFStore('../input/feats/txt_td_tfidf_hash5.h5')
store['df_trn_text_feat'] = df_text_feats.iloc[:len_trn,:]
store['df_test_text_feat'] = df_text_feats.iloc[len_trn:,:]
store.close()