In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import datetime, time
import sklearn, sklearn.metrics, sklearn.preprocessing, sklearn.linear_model, sklearn.ensemble, sklearn.model_selection
import nltk, nltk.stem

import collections

import modutils

# Directory holding the Mercari price data, and the training file read below.
data_dir = '../DataSets/MercariPrice/'
src_file = ''.join((data_dir, 'train_text.csv'))

In [2]:
%%time
# Load the training table from `src_file`; later cells read the `name` column and the
# `price` / `fcst_*` columns from this frame.
src = pd.read_csv(src_file)

Wall time: 10.5 s


In [3]:
%%time
stemmer = nltk.stem.SnowballStemmer('english')

# The same tokens occur in many titles, so each distinct lower-cased token is stemmed
# once and the result is reused instead of re-running the stemmer per occurrence.
_stem_cache = {}


def _stem_token(token):
    """Return the Snowball stem of ``token.lower()``, memoised in ``_stem_cache``."""
    key = token.lower()
    stem = _stem_cache.get(key)
    if stem is None:
        stem = stemmer.stem(key)
        _stem_cache[key] = stem
    return stem


# name_proc: one list of stemmed tokens per row, obtained by whitespace-splitting the title.
src['name_proc'] = src['name'].map(lambda title: [_stem_token(tok) for tok in title.split()])

Wall time: 1min 19s


In [4]:
%%time
# Frequency of every stemmed token over all rows of name_proc.
name_dict = collections.Counter(token for tokens in src['name_proc'] for token in tokens)

Wall time: 995 ms


In [6]:
# (token, count) pairs of the 1000 most frequent tokens, highest count first.
# The sort is stable, so tokens with equal counts keep the Counter's iteration order.
topNwords = sorted(name_dict.items(), key=lambda item: item[1], reverse=True)[:1000]

In [11]:
# Independent working copy holding the title and the fcst_diff_simple column;
# the per-word hit flags are added to this frame in the next cell.
src_hit = src.loc[:, ['name', 'fcst_diff_simple']].copy()

In [12]:
%%time
# One boolean column per top word: True where the row's stemmed title contains the word.
# The column is named after the word itself.
sample = [word for (word, _count) in topNwords[:1000]]
i = 0  # progress counter; stays 0 when `sample` is empty
for i, word in enumerate(sample, start=1):
    src_hit[word] = src.name_proc.map(lambda tokens: word in tokens)
    # '\r' keeps the progress message on a single, overwritten line.
    print('Done {0} out of {1}'.format(i, len(sample)), end='\r')

Wall time: 5min 57s00


In [17]:
%%time
# For every pair of top words (x, y) with y at or after x in topNwords (a word is also
# paired with itself), record ((x, y), count, mean) where count / mean are taken over
# fcst_diff_simple on the rows flagged for both words.
sortedNpairs = []
n_words = len(topNwords)
for i, (x, xn) in enumerate(topNwords):
    # Rows flagged for x; the inner loop only filters inside this subset.
    tmp = src_hit[src_hit[x]]
    for j in range(i, n_words):  # j >= i: each unordered pair is visited once
        y = topNwords[j][0]
        res = tmp.fcst_diff_simple[tmp[y]].agg(['count', 'mean'])
        # Read the aggregates by label: integer keys on a label-indexed Series are a
        # deprecated positional fallback in pandas.
        pair_count = res['count']
        # With no matching rows the mean is NaN; store 0 so that ranking by
        # count * mean later is not polluted by NaN.
        pair_mean = res['mean'] if pair_count != 0 else 0.0
        sortedNpairs.append(((x, y), pair_count, pair_mean))
    print('Done {0} out of {1}'.format(i+1, len(topNwords)), end='\r')

Wall time: 9h 40min 3s


In [22]:
# Ten pair records with the largest |mean * count|, i.e. the largest absolute summed effect.
sorted(sortedNpairs, key=lambda rec: abs(rec[2] * rec[1]), reverse=True)[:10]

[(('bundl', 'bundl'), 29178.0, 0.33101167723292341),
 (('lularo', 'lularo'), 25009.0, 0.23650043712832872),
 (('for', 'for'), 23357.0, 0.21425363778375922),
 (('bundl', 'for'), 7224.0, 0.47665894150339605),
 (('set', 'set'), 15027.0, 0.15841957756653116),
 (('&', '&'), 14659.0, 0.13665723823000916),
 (('nwt', 'nwt'), 14452.0, 0.12394013422537664),
 (('lot', 'lot'), 6826.0, 0.25254058778794886),
 (('size', 'size'), 28757.0, -0.058799418625461168),
 (('and', 'and'), 18268.0, 0.091980782779687187)]

In [None]:
# NOTE(review): `sorted500words`, `top500words` and `sorted500pairs` are not defined in any
# cell visible in this notebook (the cells above build `topNwords` and `sortedNpairs`), and
# this cell has no execution count -- confirm where these names come from before running.
# Builds a table with a 'name' column (field 0 of each `sorted500words` entry) and one
# column per word in `top500words`.
tmp_words = pd.DataFrame({'name':[x[0] for x in sorted500words]})
for (x,xn,_,_) in top500words:
    # Column `x` = field 1 of every pair record whose first word equals `x`
    # (presumably the pair count -- TODO confirm against how sorted500pairs is built).
    tmp_words[x] = [z[1] for z in sorted500pairs if z[0][0]==x]

In [None]:
# Write the word-pair table to the current working directory, without the index column.
tmp_words.to_csv('train_title_num_matrix.csv', index=False)

In [24]:
# First-level features: the top words themselves, with their counts dropped.
features_1lev = [word for (word, _count) in topNwords]
# Independent modelling frame; the f_<word> indicator columns are added to it below.
src_train = src.loc[:, ['name', 'price', 'fcst_log_simple', 'fcst_diff_simple']].copy()

In [25]:
# Copy each boolean hit flag into src_train as a numeric column named f_<word>
# (multiplying by 1 turns True/False into 1/0).
for word in features_1lev:
    column = 'f_{0}'.format(word)
    src_train[column] = src_hit[word] * 1

In [34]:
# Design matrix: the f_<word> indicator columns, in features_1lev order.
feature_columns = ['f_{0}'.format(f) for f in features_1lev]
X = src_train[feature_columns].values
# Target: the fcst_diff_simple column.
Y = src_train['fcst_diff_simple'].values
# modutils.splitSample is a project helper; pcts presumably gives the train / test
# fractions (10% / 90%) -- TODO confirm against modutils.
(Xtrain, Ytrain), (Xtest, Ytest) = modutils.splitSample((X, Y), pcts=[0.1, 0.9])

In [35]:
# Sanity check: (rows in the training part, number of indicator features).
Xtrain.shape

(59384, 1000)

In [None]:
%%time
# Lasso regularisation strengths to compare; one model is fitted per value, in this order.
alphas = [1e-2, 1e-3, 3e-4, 1e-4]
mod0s = []
for alpha in alphas:
    mod0s.append(sklearn.linear_model.Lasso(alpha=alpha).fit(Xtrain, Ytrain))

In [None]:
# For each fitted Lasso: number of coefficients above 1e-6 in magnitude and R^2 on both parts.
for alpha, model in zip(alphas, mod0s):
    Ptrain = model.predict(Xtrain)
    Ptest = model.predict(Xtest)
    n_active = np.sum(np.abs(model.coef_) > 1e-6)
    r2_train = sklearn.metrics.r2_score(Ytrain, Ptrain)
    r2_test = sklearn.metrics.r2_score(Ytest, Ptest)
    print('Alpha={0}, coefs={1}, Rsqr={2:.4f} (train), {3:.4f} (test)'.format(alpha, n_active, r2_train, r2_test))

In [46]:
%%time
# Shallow gradient-boosting baseline on the word indicators.
# subsample=0.5 fits every stage on a random half of the rows, so the seed is fixed
# to make the fit (and the R^2 reported below) reproducible across runs.
mod0gb = sklearn.ensemble.GradientBoostingRegressor(
    min_samples_leaf=100,
    n_estimators=30,
    max_depth=2,
    subsample=0.5,
    random_state=42,
).fit(Xtrain, Ytrain)

Wall time: 1min 9s


##### %%time
#mod0lr = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)
# Linear baseline fitted by SGD. `n_iter` no longer exists in scikit-learn; `max_iter=100`
# with `tol=None` keeps the original meaning of exactly 100 passes over the data with no
# early stopping. The seed fixes the shuffling order so the fit is reproducible.
mod0lr = sklearn.linear_model.SGDRegressor(max_iter=100, tol=None, random_state=42).fit(Xtrain, Ytrain)

### sortedNpairs

In [55]:
%%time
# Shallow random-forest baseline on the word indicators, trained on 4 parallel jobs.
# The forest is built from random bootstrap samples, so the seed is fixed to make the
# fit (and the R^2 reported below) reproducible across runs.
mod0rf2 = sklearn.ensemble.RandomForestRegressor(
    min_samples_leaf=100,
    n_estimators=100,
    max_depth=2,
    n_jobs=4,
    random_state=42,
).fit(Xtrain, Ytrain)

Wall time: 1min 33s


In [60]:
# R^2 of the gradient-boosting model on the training and test parts.
Ptrain = mod0gb.predict(Xtrain)
Ptest = mod0gb.predict(Xtest)
r2_train = sklearn.metrics.r2_score(Ytrain, Ptrain)
r2_test = sklearn.metrics.r2_score(Ytest, Ptest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(r2_train, r2_test))

Rsqr=0.0331 (train), 0.0311 (test)


In [65]:
%%time
# R^2 of the SGD linear model on the training and test parts.
Ptrain = mod0lr.predict(Xtrain)
Ptest = mod0lr.predict(Xtest)
r2_train = sklearn.metrics.r2_score(Ytrain, Ptrain)
r2_test = sklearn.metrics.r2_score(Ytest, Ptest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(r2_train, r2_test))

Rsqr=0.1558 (train), 0.1346 (test)
Wall time: 9.36 s


In [56]:
%%time
# R^2 of the random-forest model on the training and test parts.
Ptrain = mod0rf2.predict(Xtrain)
Ptest = mod0rf2.predict(Xtest)
r2_train = sklearn.metrics.r2_score(Ytrain, Ptrain)
r2_test = sklearn.metrics.r2_score(Ytest, Ptest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(r2_train, r2_test))

Rsqr=0.0254 (train), 0.0228 (test)
Wall time: 6.03 s


In [None]:
# Keep the Lasso fitted with alphas[2] as the level-0 model, and show how many of its
# coefficients exceed 1e-6 in magnitude.
mod0 = mod0s[2]
(np.abs(mod0.coef_) > 1e-6).sum()

In [None]:
# Level-0 forecast: prediction of the selected Lasso model for every row of X
# (X was built from all rows of src_train, so this covers both the train and test parts).
src_train['fcst_lev0'] = mod0.predict(X)

In [None]:
# What is left of fcst_diff_simple after subtracting the level-0 forecast.
src_train['fcst_diff_lev0'] = src_train['fcst_diff_simple'] - src_train['fcst_lev0']

In [None]:
# Summary statistics of the target before and after subtracting the level-0 forecast.
src_train[['fcst_diff_simple', 'fcst_diff_lev0']].describe()

In [None]:
# Make the level-0 residual available next to the word-hit flags; the assignment aligns
# on the row index, which both frames inherit from `src`.
src_hit['fcst_diff_lev0'] = src_train.fcst_diff_lev0

In [None]:
%%time
# NOTE(review): `sorted500words` is not defined in any cell visible in this notebook (the
# cells above build `topNwords`, whose entries are 2-tuples, not the 4-tuples unpacked
# here) -- confirm where it comes from before running this cell.
# For every ordered pair of words, record ((x, y), count, mean) of fcst_diff_lev0 over the
# rows flagged for both words.
sorted500pairs_lev1 = []
for (x, xn, _, _) in sorted500words:
    # Rows flagged for x; the inner loop only filters inside this subset.
    tmp = src_hit[src_hit[x]]
    for (y, _, _, _) in sorted500words:
        res = tmp.fcst_diff_lev0[tmp[y]].agg(['count', 'mean'])
        # Read the aggregates by label: integer keys on a label-indexed Series are a
        # deprecated positional fallback in pandas.
        pair_count = res['count']
        # With no matching rows the mean is NaN; store 0 instead.
        sorted500pairs_lev1.append(((x, y), pair_count, 0 if pair_count < 1 else res['mean']))

In [None]:
# Pair records ordered by |mean * count|, smallest first, so the strongest pairs sit at the end.
tmp = list(sorted500pairs_lev1)
tmp.sort(key=lambda rec: abs(rec[2] * rec[1]))

In [None]:
# The ten records at the end of the ascending ordering, i.e. the largest |mean * count|.
tmp[-10:]

In [None]:
# Top 20 pair records by their last field, largest first.
# The records in sorted500pairs_lev1 are 3-tuples ((x, y), count, mean), so the former
# index 3 was out of range (IndexError); the last field -- the mean -- is used instead.
# NOTE(review): confirm the mean is the intended ranking key.
sorted(tmp, key=lambda rec: rec[-1], reverse=True)[:20]

In [None]:
# Last field of every pair record as a NumPy array.
# The records in sorted500pairs_lev1 are 3-tuples ((x, y), count, mean), so the former
# index 3 was out of range (IndexError); the last field -- the mean -- is used instead.
# NOTE(review): confirm the mean is the intended field.
tmp2 = np.array([rec[-1] for rec in tmp])

In [None]:
# Display the extracted array (NumPy abbreviates long arrays in the repr).
tmp2