In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import datetime, time
import sklearn, sklearn.metrics, sklearn.preprocessing, sklearn.linear_model, sklearn.ensemble, sklearn.model_selection
import nltk, nltk.stem

import collections

import modutils

# Input and output files both live under a single dataset directory.
data_dir = '../DataSets/MercariPrice/'
src_file = '{0}{1}'.format(data_dir, 'train_text.csv')   # table loaded by read_csv below
dst_file = '{0}{1}'.format(data_dir, 'train_descr.csv')  # target of the final to_csv

In [2]:
%%time
# Load the source CSV into a DataFrame.
src = pd.read_csv(filepath_or_buffer=src_file)

Wall time: 10.6 s


In [3]:
%%time
stemmer = nltk.stem.SnowballStemmer('english')

# The same tokens recur across many titles, so each distinct lower-cased
# token is stemmed once and the result is reused for later occurrences.
_stem_cache = {}


def _stem_title(title):
    """Return the stems of the whitespace-separated, lower-cased tokens of `title`."""
    stems = []
    for token in title.split():
        token = token.lower()
        stem = _stem_cache.get(token)
        if stem is None:  # `is None` rather than falsiness: a stem may be an empty string
            stem = _stem_cache[token] = stemmer.stem(token)
        stems.append(stem)
    return stems


# One list of stems per row, stored next to the raw title.
src['name_proc'] = src.name.map(_stem_title)

Wall time: 1min 27s


In [4]:
%%time
# Occurrence count of every stem over all titles.
name_dict = collections.Counter(
    token
    for tokens in src.name_proc
    for token in tokens
)

Wall time: 1.13 s


In [5]:
# The 1000 most frequent (stem, count) pairs, highest count first.
topNwords = sorted(name_dict.items(), key=lambda kv: kv[1], reverse=True)[:1000]

In [6]:
# Independent working copy with just the 'name' and 'fcst_diff_simple' columns.
src_hit = src.loc[:, ['name', 'fcst_diff_simple']].copy()

In [7]:
%%time
# Add one boolean column per frequent stem: True where the row's stem list contains it.
# NOTE(review): these columns share src_hit's namespace with 'name' and
# 'fcst_diff_simple'; a stem equal to either name would overwrite that column.
sample = [word for (word, _count) in topNwords[:1000]]
i = 0
for i, x in enumerate(sample, start=1):
    src_hit[x] = src.name_proc.map(lambda stems: x in stems)
    # '\r' keeps the progress report on a single, overwritten line.
    print('Done {0} out of {1}'.format(i, len(sample)), end='\r')

Wall time: 6min


In [8]:
%%time
# For every pair of frequent stems (a stem paired with itself included),
# record ((x, y), number of titles containing both, mean fcst_diff_simple
# over those titles).
topNpairs = []
for (i, (x, xn)) in enumerate(topNwords):
    # Rows containing the first stem; reused for every partner stem.
    tmp = src_hit[src_hit[x]]
    for j in range(i, len(topNwords)):
        y = topNwords[j][0]
        res = tmp.fcst_diff_simple[tmp[y]].agg(['count', 'mean'])
        # Read the aggregates by label. Integer keys on this string-labelled
        # Series only work through a positional fallback that pandas has
        # deprecated, for both reads and assignments.
        cnt = res['count']
        # The mean over an empty selection is NaN; store 0 instead.
        avg = res['mean'] if cnt != 0 else 0.0
        topNpairs.append(((x, y), cnt, avg))
    print('Done {0} out of {1}'.format(i+1, len(topNwords)), end='\r')

Wall time: 16min 58s


In [9]:
# Ten pairs with the largest |mean * count|.
sorted(topNpairs, key=lambda rec: abs(rec[2]*rec[1]), reverse=True)[:10]

[(('bundl', 'bundl'), 29178.0, 0.33101167723292341),
 (('lularo', 'lularo'), 25009.0, 0.23650043712832872),
 (('for', 'for'), 23357.0, 0.21425363778375922),
 (('bundl', 'for'), 7224.0, 0.47665894150339605),
 (('set', 'set'), 15027.0, 0.15841957756653116),
 (('&', '&'), 14659.0, 0.13665723823000916),
 (('nwt', 'nwt'), 14452.0, 0.12394013422537664),
 (('lot', 'lot'), 6826.0, 0.25254058778794886),
 (('size', 'size'), 28757.0, -0.058799418625461168),
 (('and', 'and'), 18268.0, 0.091980782779687187)]

In [10]:
# First-level feature names are the frequent stems themselves.
features_1lev = [word for (word, _count) in topNwords]
# Training frame starts as an independent copy of four columns of src.
src_train = src.loc[:, ['name', 'price', 'fcst_log_simple', 'fcst_diff_simple']].copy()

In [11]:
# Copy each stem indicator into src_train under an 'f_' prefix; multiplying
# by 1 turns the boolean column into a numeric one.
for f in features_1lev:
    indicator = src_hit[f]*1
    src_train['f_{0}'.format(f)] = indicator

In [12]:
# Design matrix of the first-level 'f_' columns; the target is fcst_diff_simple.
X = src_train[['f_{0}'.format(word) for word in features_1lev]].values
Y = src_train['fcst_diff_simple'].values
# NOTE(review): pcts is [0.3,0.7] here but [0.7,0.3] for the second-level
# split further down; splitSample lives in modutils and is not visible
# here — confirm which proportions are intended.
(Xtrain, Ytrain), (Xtest, Ytest) = modutils.splitSample((X, Y), pcts=[0.3, 0.7])

In [35]:
# Show the dimensions of the training design matrix.
Xtrain.shape

(59384, 1000)

In [46]:
%%time
# Shallow gradient-boosted trees on the first-level indicator features.
# subsample=0.5 fits each tree on a random half of the rows, so random_state
# is pinned to make repeated runs produce the same model.
mod0gb = sklearn.ensemble.GradientBoostingRegressor(
    min_samples_leaf=100,
    n_estimators=30,
    max_depth=2,
    subsample=0.5,
    random_state=0,
).fit(Xtrain, Ytrain)

Wall time: 1min 9s


In [95]:
%%time
# Linear first-level model. Earlier alternatives are kept below as
# commented-out lines; the trailing numbers are the author's own notes and
# their meaning is not stated in this notebook.
#mod0lr = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)
#mod0lr = sklearn.linear_model.SGDRegressor(n_iter=100).fit(Xtrain, Ytrain) #15.5 vs 13.5
#mod0lr = sklearn.linear_model.Lasso(alpha=1e-3).fit(Xtrain, Ytrain) #1e-4: 14.7 vs 13.1, 1e-5: 16.1 vs 13.6, 1e-3: 4.89
#mod0lr = sklearn.linear_model.LassoLars(alpha=1e-5).fit(Xtrain, Ytrain) 
#mod0lr = sklearn.linear_model.Ridge(alpha=1e1).fit(Xtrain, Ytrain) #def: 16.1 vs 13.5
mod0lr = sklearn.linear_model.Lasso(alpha=2e-5)
mod0lr = mod0lr.fit(Xtrain, Ytrain)

Wall time: 49.5 s


In [55]:
%%time
# Shallow random forest on the first-level indicator features, fitted on
# four parallel jobs. Trees are grown on bootstrap samples, so random_state
# is pinned to make repeated runs produce the same model.
mod0rf2 = sklearn.ensemble.RandomForestRegressor(
    min_samples_leaf=100,
    n_estimators=100,
    max_depth=2,
    n_jobs=4,
    random_state=0,
).fit(Xtrain, Ytrain)

Wall time: 1min 33s


In [60]:
# R^2 of the gradient-boosting model on the train and test parts.
Ptrain, Ptest = mod0gb.predict(Xtrain), mod0gb.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(
    *(sklearn.metrics.r2_score(actual, fitted)
      for (actual, fitted) in ((Ytrain, Ptrain), (Ytest, Ptest)))))

Rsqr=0.0331 (train), 0.0311 (test)


In [96]:
%%time
# R^2 of the Lasso model on the train and test parts.
Ptrain, Ptest = mod0lr.predict(Xtrain), mod0lr.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(
    *(sklearn.metrics.r2_score(actual, fitted)
      for (actual, fitted) in ((Ytrain, Ptrain), (Ytest, Ptest)))))

Rsqr=0.1495 (train), 0.1446 (test)
Wall time: 4.76 s


In [56]:
%%time
# R^2 of the random-forest model on the train and test parts.
Ptrain, Ptest = mod0rf2.predict(Xtrain), mod0rf2.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(
    *(sklearn.metrics.r2_score(actual, fitted)
      for (actual, fitted) in ((Ytrain, Ptrain), (Ytest, Ptest)))))

Rsqr=0.0254 (train), 0.0228 (test)
Wall time: 6.03 s


In [97]:
# Training target minus the mod0lr prediction on the training features.
Ztrain = Ytrain - mod0lr.predict(Xtrain)

In [13]:
# First-level model whose predictions the later cells subtract and add back:
# a Lasso fitted on the training part.
mod0 = sklearn.linear_model.Lasso(alpha=2e-5)
mod0 = mod0.fit(Xtrain, Ytrain)

In [14]:
# fcst_diff_simple minus the first-level prediction, for every row of X.
src_hit['fcst_diff_lev1'] = src_hit['fcst_diff_simple'] - mod0.predict(X)

In [15]:
# Summary statistics of the value before and after subtracting the first-level prediction.
src_hit[['fcst_diff_lev1', 'fcst_diff_simple']].describe()

Unnamed: 0,fcst_diff_lev1,fcst_diff_simple
count,593376.0,593376.0
mean,-0.000822,-0.000583
std,0.565424,0.611888
min,-4.970987,-4.826117
25%,-0.362045,-0.406212
50%,-0.038435,-0.055451
75%,0.311027,0.341269
max,4.698386,4.636777


In [16]:
%%time
# Same pair scan as for topNpairs, but averaging fcst_diff_lev1: for every
# pair of frequent stems (a stem paired with itself included), record
# ((x, y), number of titles containing both, mean fcst_diff_lev1).
lev1Npairs = []
for (i, (x, xn)) in enumerate(topNwords):
    # Rows containing the first stem; reused for every partner stem.
    tmp = src_hit[src_hit[x]]
    for j in range(i, len(topNwords)):
        y = topNwords[j][0]
        res = tmp.fcst_diff_lev1[tmp[y]].agg(['count', 'mean'])
        # Read the aggregates by label. Integer keys on this string-labelled
        # Series only work through a positional fallback that pandas has
        # deprecated, for both reads and assignments.
        cnt = res['count']
        # The mean over an empty selection is NaN; store 0 instead.
        avg = res['mean'] if cnt != 0 else 0.0
        lev1Npairs.append(((x, y), cnt, avg))
    print('Done {0} out of {1}'.format(i+1, len(topNwords)), end='\r')

Wall time: 24min 24s


In [19]:
# Extend each (pair, count, mean) record with the score |count**0.7 * mean|.
lev0pairs = [
    (pair, cnt, avg, abs(np.power(cnt, 0.7)*avg))
    for (pair, cnt, avg) in topNpairs
]
lev1pairs = [
    (pair, cnt, avg, abs(np.power(cnt, 0.7)*avg))
    for (pair, cnt, avg) in lev1Npairs
]

In [145]:
# 50th-ranked lev0 pair by score among pairs with a count above 500.
sorted((rec for rec in lev0pairs if rec[1] > 500), key=lambda rec: rec[3], reverse=True)[49]

(('bundl', 'vs'), 891.0, 0.53264896274449647, 61.852246563620675)

In [20]:
# Top 50 lev1 pairs by score among pairs with a count above 500.
lev2features = sorted(
    (rec for rec in lev1pairs if rec[1] > 500),
    key=lambda rec: rec[3],
    reverse=True,
)[:50]

In [21]:
# Display the selected (pair, count, mean, score) records.
lev2features

[(('watch', 'appl'), 678.0, 0.47929002940943927, 45.968378037234203),
 (('bundl', 'for'), 7224.0, 0.06577360402594315, 33.051871493181622),
 (('pink', 'set'), 846.0, 0.25625612512750218, 28.696802512531711),
 (('black', 'and'), 1762.0, -0.12396515141145711, 23.200842082371704),
 (('pink', 'vs'), 11599.0, 0.033133208942037834, 23.193051007763426),
 (('vs', 'set'), 801.0, 0.20930288505875916, 22.558902956786053),
 (('kor', 'purs'), 554.0, 0.26621253980546739, 22.165898700420396),
 (('michael', 'purs'), 518.0, 0.27479237563755737, 21.829083920397352),
 (('one', 'xbox'), 990.0, 0.17327711117950548, 21.661365735231904),
 (('lularo', 'free'), 605.0, 0.24117117885392669, 21.357684586278459),
 (('lularo', 'black'), 1287.0, 0.13502338606908995, 20.282149412417603),
 (('new', 'nike'), 1001.0, 0.15605808340745198, 19.660299213843508),
 (('bundl', 'makeup'), 505.0, -0.24918300059071535, 19.44564602343533),
 (('2', 'of'), 1324.0, -0.12158826413400491, 18.630016855888204),
 (('girl', 'american'), 15

In [22]:
# Second-level features are the selected stem pairs.
features_2lev = [rec[0] for rec in lev2features]
src_train['fcst_diff_lev1'] = src_hit['fcst_diff_lev1']
# One numeric column per pair: the product of the two stem indicators.
for (f1, f2) in features_2lev:
    both = (src_hit[f1]*1)*(src_hit[f2]*1)
    src_train['f2_{0}_{1}'.format(f1, f2)] = both

In [23]:
# Design matrix of the second-level 'f2_' columns; the target is fcst_diff_lev1.
X2 = src_train[['f2_{0}_{1}'.format(a, b) for (a, b) in features_2lev]].values
Y2 = src_train['fcst_diff_lev1'].values
# Rebinds Xtrain/Ytrain/Xtest/Ytest to the second-level split.
(Xtrain, Ytrain), (Xtest, Ytest) = modutils.splitSample((X2, Y2), pcts=[0.7, 0.3])

In [24]:
# Compare the dimensions of the second-level and first-level design matrices.
X2.shape, X.shape

((593376, 50), (593376, 1000))

In [25]:
%%time
# Linear second-level model. Earlier alternatives are kept below as
# commented-out lines; the trailing numbers are the author's own notes and
# their meaning is not stated in this notebook.
#mod1 = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)
#mod1 = sklearn.linear_model.SGDRegressor(n_iter=100).fit(Xtrain, Ytrain) #15.5 vs 13.5
#mod1 = sklearn.linear_model.Lasso(alpha=1e-4).fit(Xtrain, Ytrain) #1e-4: 14.7 vs 13.1, 1e-5: 16.1 vs 13.6, 1e-3: 4.89
#mod1 = sklearn.linear_model.LassoLars(alpha=1e-5).fit(Xtrain, Ytrain) 
#mod1 = sklearn.linear_model.Ridge(alpha=1e1).fit(Xtrain, Ytrain) #def: 16.1 vs 13.5
mod1 = sklearn.linear_model.Lasso(alpha=1e-4)
mod1 = mod1.fit(Xtrain, Ytrain)

Wall time: 5.65 s


In [175]:
%%time
# Gradient-boosting alternative for the second-level pair features.
# It gets its own name (as mod0gb does at the first level) so that running
# the notebook top to bottom does not replace the Lasso `mod1` that the
# later prediction cells use.
mod1gb = sklearn.ensemble.GradientBoostingRegressor(
    min_samples_leaf=100,
    n_estimators=100,
    max_depth=2,
).fit(Xtrain, Ytrain)

Wall time: 37 s


In [181]:
%%time
# Random-forest alternative for the second-level pair features.
# It gets its own name (as mod0rf2 does at the first level) so that running
# the notebook top to bottom does not replace the Lasso `mod1` that the
# later prediction cells use. Trees are grown on bootstrap samples, so
# random_state is pinned to make repeated runs produce the same model.
mod1rf = sklearn.ensemble.RandomForestRegressor(
    min_samples_leaf=100,
    n_estimators=10,
    max_depth=3,
    random_state=0,
).fit(Xtrain, Ytrain)

Wall time: 3.86 s


In [26]:
%%time
# R^2 of the second-level model on the train and test parts.
Ptrain, Ptest = mod1.predict(Xtrain), mod1.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(
    *(sklearn.metrics.r2_score(actual, fitted)
      for (actual, fitted) in ((Ytrain, Ptrain), (Ytest, Ptest)))))

Rsqr=0.0036 (train), 0.0033 (test)
Wall time: 211 ms


In [27]:
# fcst_diff_lev1 minus the second-level prediction, for every row of X2.
src_hit['fcst_diff_lev2'] = src_hit['fcst_diff_lev1'] - mod1.predict(X2)

In [28]:
# Summary statistics of the three successive difference columns.
src_hit[['fcst_diff_simple', 'fcst_diff_lev1', 'fcst_diff_lev2']].describe()

Unnamed: 0,fcst_diff_simple,fcst_diff_lev1,fcst_diff_lev2
count,593376.0,593376.0,593376.0
mean,-0.000583,-0.000822,0.000513
std,0.611888,0.565424,0.564432
min,-4.826117,-4.970987,-4.967552
25%,-0.406212,-0.362045,-0.359768
50%,-0.055451,-0.038435,-0.036708
75%,0.341269,0.311027,0.311668
max,4.636777,4.698386,4.701821


In [30]:
# Baseline log forecast plus the first- and second-level model predictions.
src['fcst_log_simple_title'] = (
    src['fcst_log_simple'] + mod0.predict(X) + mod1.predict(X2)
)

In [31]:
# log(1 + price) minus the title-adjusted log forecast.
src['fcst_diff_simple_title'] = np.log(1 + src['price']) - src['fcst_log_simple_title']

In [32]:
# Summary statistics of the difference column before and after the title adjustment.
src[['fcst_diff_simple', 'fcst_diff_simple_title']].describe()

Unnamed: 0,fcst_diff_simple,fcst_diff_simple_title
count,593376.0,593376.0
mean,-0.000583,0.000513
std,0.611888,0.564432
min,-4.826117,-4.967552
25%,-0.406212,-0.359768
50%,-0.055451,-0.036708
75%,0.341269,0.311668
max,4.636777,4.701821


In [206]:
# R^2 of the baseline log forecast against log(1 + price).
sklearn.metrics.r2_score(y_true=np.log(1 + src['price']), y_pred=src['fcst_log_simple'])

0.33107475204617176

In [207]:
# R^2 of the title-adjusted log forecast against log(1 + price).
sklearn.metrics.r2_score(y_true=np.log(1 + src['price']), y_pred=src['fcst_log_simple_title'])

0.43084089710748941

In [33]:
# Output frame: every column of src except the per-row stem lists.
src_res = src[[col for col in src.columns if col != 'name_proc']].copy()

In [34]:
# Write the result as UTF-8 CSV without the index column.
src_res.to_csv(path_or_buf=dst_file, encoding='utf-8', index=False)