most of code from 
[here](https://www.kaggle.com/demery/lightgbm-with-ridge-feature/code)

In [5]:
import pandas as pd 
import numpy as np 
import re


In [6]:
%%time
df_train = pd.read_pickle('../input/train.pkl')
df_test = pd.read_pickle('../input/test.pkl')

Wall time: 6.64 s


In [7]:
## copy from https://www.kaggle.com/demery/lightgbm-with-ridge-feature/code#L72-L81
def cleanName(text):
    try:
        textProc = text.lower()
        textProc = " ".join(map(str.strip, re.split('(\d+)',textProc)))
        regex = re.compile(u'[^[:alpha:]]')
        textProc = regex.sub(" ", textProc)
        textProc = " ".join(textProc.split())
        return textProc
    except: 
        return "name error"

In [8]:
df_train['title'] = df_train.title.apply(lambda x: cleanName(x))
df_test['title']  = df_test.title.apply(lambda x: cleanName(x))

df_train['description'] = df_train.description.apply(lambda x:cleanName(x))
df_test['description'] = df_test.description.apply(lambda x:cleanName(x))

In [9]:
title_text_raw = df_train.title.append(df_test.title)
title_text_raw.reset_index(drop=True,inplace=True)

In [10]:
df = pd.concat([df_train,df_test],axis=0)
df['text_feat'] = df.apply(lambda row: ' '.join([
    str(row['param_1']), 
    str(row['param_2']), 
    str(row['param_3'])]),axis=1) # Group Param Features

df.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

# Meta Text Features
textfeats = ["description","text_feat", "title"]

for cols in textfeats:
    df[cols] = df[cols].astype(str) 
    df[cols] = df[cols].astype(str).fillna('missing') # FILL NA
    df[cols] = df[cols].str.lower() # Lowercase all text, so that capitalized words dont get treated differently
    df[cols + '_num_chars'] = df[cols].apply(len) # Count number of Characters
    df[cols + '_num_words'] = df[cols].apply(lambda comment: len(comment.split())) # Count number of Words
    df[cols + '_num_unique_words'] = df[cols].apply(lambda comment: len(set(w for w in comment.split())))
    df[cols + '_words_vs_unique'] = df[cols+'_num_unique_words'] / df[cols+'_num_words'] * 100 # Count Unique Words


In [11]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords 
from sklearn.decomposition import TruncatedSVD

In [12]:
def get_col(col_name): return lambda x: x[col_name]

## copy from https://www.kaggle.com/demery/lightgbm-with-ridge-feature

russian_stop = set(stopwords.words('russian'))

tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}


vectorizer = FeatureUnion([
        ('description',TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=50000,
            **tfidf_para,
            preprocessor=get_col('description'))),
        ('text_feat',CountVectorizer(
            ngram_range=(1, 2),
            #max_features=7000,
            preprocessor=get_col('text_feat'))),
        ('title',TfidfVectorizer(
            ngram_range=(1, 2),
            **tfidf_para,            
            preprocessor=get_col('title')))
    ])

# vectorizer.fit(df.to_dict('records'))
sp_txt_transform = vectorizer.fit_transform(df.to_dict('records')).astype('float16')

tfvocab = vectorizer.get_feature_names()

In [47]:
np.finfo(np.float16).max

65504.0

In [63]:
def downcast(df):
    for col in df:
        if df[col].dtypes in ['float32','float64'] :
            if df[col].max() <  np.finfo(np.float16).max:
                df[col] = df[col].astype('float16')
            else:
                df[col] = df[col].astype('float32')
        elif df[col].dtypes=='int64':            
            if df[col].max() < np.iinfo(np.int16).max:
                df[col] = df[col].astype('int16')
            if df[col].max() > np.iinfo(np.int16).max:
                df[col] = df[col].astype('int32')

In [78]:
from scipy.sparse import csr_matrix,hstack,coo_matrix

In [117]:
sp_txt_transform.data.nbytes /(2**20)

141.94084930419922

save 
- sparse bag of words features , 
- text df features
    - desc/title/text_feat num of words, num of unique words, num of chars

In [105]:
textfeats = ["description","text_feat", "title"]
cols = []
for col in textfeats:
    cols.append(col + '_num_chars')
    cols.append(col + '_num_words')
    cols.append(col +'_num_unique_words') 
    cols.append(col +'_words_vs_unique')

df_text_feat =df[cols]

In [108]:
ntrain = df_train.shape[0]
ntest  = df_test.shape[0]

In [111]:
df_text_trn_feat2 = df_text_feat.iloc[:ntrain,]
df_text_test_feat2 = df_text_feat.iloc[ntrain:,]

In [115]:
store = pd.HDFStore('../input/feats/txt_wc_feat2.h5')
store['df_txt_trn_wc_feat']  = df_text_trn_feat2
store['df_txt_test_wc_feat'] = df_text_test_feat2
store.close()

In [119]:
import scipy.sparse as sp

In [120]:
sp.save_npz('../input/feats/sp_bow_txt.npz',sp_txt_transform)

In [21]:
traindex = df_train.index
testdex = df_test.index

In [39]:
df.drop(textfeats,axis=1,inplace=True)

In [44]:
df.drop(['activation_date','image'],axis=1,inplace=True)

In [48]:
csr_trn_text_feats = df_transform[:df_train.shape[0],:]
csr_test_text_feats = df_transform[df_train.shape[0]:,:]

In [49]:
csr_test_text_feats

<508438x1407920 sparse matrix of type '<class 'numpy.float64'>'
	with 19344390 stored elements in Compressed Sparse Row format>

In [50]:
csr_trn_text_feats

<1503424x1407920 sparse matrix of type '<class 'numpy.float64'>'
	with 55073494 stored elements in Compressed Sparse Row format>

TSVD 
n_ = 5

In [51]:
import scipy.sparse as sp

In [56]:
tsvd_text = TruncatedSVD(n_components=5,random_state=0)

In [57]:
df_text_all_svd = tsvd_text.fit_transform(df_transform)

In [62]:
np.cumsum(tsvd_text.explained_variance_ratio_)

array([ 0.134552  ,  0.20039762,  0.25646434,  0.2811317 ,  0.30250088])

In [71]:
df_text_feats = pd.DataFrame(df_text_all_svd)
df_text_feats.columns = ['text_feat_{}'.format(e) for e in range(5)]
df_trn_text_feats = df_text_feats.iloc[:df_train.shape[0],:]
df_test_text_feats = df_text_feats.iloc[df_train.shape[0]:,:]

In [75]:
print('trn text feat shape:',df_trn_text_feats.shape)
print('test text feat shape:',df_test_text_feats.shape)

trn text feat shape: (1503424, 5)
test text feat shape: (508438, 5)


save

In [77]:
store = pd.HDFStore('../input/feats/txt_svd_feat2.h5')
store['df_trn_text_feats'] = df_trn_text_feats
store['df_test_text_feats'] = df_test_text_feats
store.close()

load

In [2]:
%%time 
with pd.HDFStore('../input/feats/txt_svd_feat2.h5') as store:
    print(store.keys())
    df_trn_text_feat  = store['df_trn_text_feats']
    df_test_text_feat = store['df_test_text_feats']

['/df_test_text_feats', '/df_trn_text_feats']
Wall time: 1.25 s


In [6]:
df_trn_text_feat = df_trn_text_feat.astype('float16')
df_test_text_feat = df_test_text_feat.astype('float16')

In [40]:
pipe = Pipeline([
    ('txt_feats', vectorizer),
    ('tsvd',TruncatedSVD(random_state=0))
])

In [41]:
tmp = pipe.fit_transform(df_train.head().to_dict('rec'))

In [49]:
tsvd = pipe.get_params()['tsvd']

In [50]:
tsvd.explained_variance_ratio_

array([ 0.16893574,  0.2       ])