In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import datetime, time
import sklearn, sklearn.metrics, sklearn.preprocessing, sklearn.linear_model, sklearn.ensemble, sklearn.model_selection
import nltk, nltk.stem

import collections

import modutils

data_dir = '../DataSets/MercariPrice/'
src_file = data_dir + 'train_simple.csv' 
dst_file = data_dir + 'train_title.csv'

In [2]:
%%time
src = pd.read_csv(src_file)

Wall time: 4.76 s


In [3]:
%%time
stemmer = nltk.stem.SnowballStemmer('english')
src['name_proc'] = src.name.map(lambda x: [stemmer.stem(y.lower()) for y in x.split()])

Wall time: 41.8 s


In [4]:
%%time
name_dict = collections.Counter([y for x in src.name_proc for y in x])

Wall time: 491 ms


In [5]:
topNwords = list(sorted(list(name_dict.items()), key=lambda x: x[1], reverse=True))[:1000]

In [6]:
src_hit = src[['name', 'fcst_diff_simple']].copy()

In [7]:
%%time
i = 0
sample = [x for (x,_) in topNwords[:1000]] 
for x in sample:
    src_hit[x] = src.name_proc.map(lambda z: x in z)
    i += 1
    print('Done {0} out of {1}'.format(i, len(sample)), end='\r')

Wall time: 3min 4s000


In [8]:
%%time
topNpairs = []
for (i,(x,xn)) in enumerate(topNwords):
    tmp = src_hit[src_hit[x]]
    for j in range(i, len(topNwords)):
        y = topNwords[j][0]
        res = tmp.fcst_diff_simple[tmp[y]].agg(['count','mean'])
        if res[0] == 0:
            res[1] = 0
        topNpairs.append(((x,y), res[0], res[1]))
    print('Done {0} out of {1}'.format(i+1, len(topNwords)), end='\r')

Wall time: 8min 3s000


In [9]:
sorted(topNpairs, key=lambda x:abs(x[2]*x[1]), reverse=True)[:10]

[(('bundl', 'bundl'), 29178.0, 0.33122653016724407),
 (('lularo', 'lularo'), 25009.0, 0.23260376490633902),
 (('for', 'for'), 23357.0, 0.21415967172981221),
 (('bundl', 'for'), 7224.0, 0.47591511329661451),
 (('set', 'set'), 15027.0, 0.15762321809369126),
 (('&', '&'), 14659.0, 0.13737297290212691),
 (('nwt', 'nwt'), 14452.0, 0.12270215503351693),
 (('lot', 'lot'), 6826.0, 0.25288536652625437),
 (('and', 'and'), 18268.0, 0.092291414273561148),
 (('size', 'size'), 28757.0, -0.058239882698287788)]

In [12]:
features_1lev = [x[0] for x in topNwords]
src_train = src[['name', 'price', 'fcst_simple', 'fcst_diff_simple']].copy()

In [13]:
for f in features_1lev:
    src_train['f_{0}'.format(f)] = src_hit[f]*1

In [14]:
X = src_train[['f_{0}'.format(f) for f in features_1lev]].values
Y = src_train.fcst_diff_simple.values
(Xtrain,Ytrain),(Xtest,Ytest) = modutils.splitSample((X, Y), pcts = [0.3,0.7])

In [15]:
Xtrain.shape

(178024, 1000)

In [16]:
%%time
mod0gb = sklearn.ensemble.GradientBoostingRegressor(min_samples_leaf=100, n_estimators=30, max_depth=2, subsample=0.5).fit(Xtrain, Ytrain)

Wall time: 2min 15s


In [17]:
%%time
#mod0lr = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)
#mod0lr = sklearn.linear_model.SGDRegressor(n_iter=100).fit(Xtrain, Ytrain) #15.5 vs 13.5
#mod0lr = sklearn.linear_model.Lasso(alpha=1e-3).fit(Xtrain, Ytrain) #1e-4: 14.7 vs 13.1, 1e-5: 16.1 vs 13.6, 1e-3: 4.89
#mod0lr = sklearn.linear_model.LassoLars(alpha=1e-5).fit(Xtrain, Ytrain) 
#mod0lr = sklearn.linear_model.Ridge(alpha=1e1).fit(Xtrain, Ytrain) #def: 16.1 vs 13.5
mod0lr = sklearn.linear_model.Lasso(alpha=2e-5).fit(Xtrain, Ytrain)

Wall time: 27.8 s


In [18]:
%%time
mod0rf2 = sklearn.ensemble.RandomForestRegressor(min_samples_leaf=100, n_estimators=100, max_depth=2, n_jobs=4).fit(Xtrain, Ytrain)

Wall time: 2min 14s


In [19]:
Ptrain = mod0gb.predict(Xtrain)
Ptest = mod0gb.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(sklearn.metrics.r2_score(Ytrain, Ptrain),
                                                  sklearn.metrics.r2_score(Ytest, Ptest))) 

Rsqr=0.0378 (train), 0.0379 (test)


In [20]:
%%time
Ptrain = mod0lr.predict(Xtrain)
Ptest = mod0lr.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(sklearn.metrics.r2_score(Ytrain, Ptrain),
                                                  sklearn.metrics.r2_score(Ytest, Ptest))) 

Rsqr=0.1523 (train), 0.1436 (test)
Wall time: 2 s


In [21]:
%%time
Ptrain = mod0rf2.predict(Xtrain)
Ptest = mod0rf2.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(sklearn.metrics.r2_score(Ytrain, Ptrain),
                                                  sklearn.metrics.r2_score(Ytest, Ptest))) 

Rsqr=0.0222 (train), 0.0232 (test)
Wall time: 2.73 s


In [22]:
Ztrain = Ytrain - mod0lr.predict(Xtrain)

In [23]:
mod0 = sklearn.linear_model.Lasso(alpha=2e-5).fit(Xtrain, Ytrain)

In [24]:
src_hit['fcst_diff_lev1'] = src_hit.fcst_diff_simple - mod0.predict(X)

In [25]:
src_hit[['fcst_diff_lev1', 'fcst_diff_simple']].describe()

Unnamed: 0,fcst_diff_lev1,fcst_diff_simple
count,593376.0,593376.0
mean,0.001877,-0.000393
std,0.565513,0.612034
min,-4.877501,-4.782042
25%,-0.35909,-0.405822
50%,-0.035914,-0.055312
75%,0.313937,0.340884
max,4.680613,4.625071


In [26]:
%%time
lev1Npairs = []
for (i,(x,xn)) in enumerate(topNwords):
    tmp = src_hit[src_hit[x]]
    for j in range(i, len(topNwords)):
        y = topNwords[j][0]
        res = tmp.fcst_diff_lev1[tmp[y]].agg(['count','mean'])
        if res[0] == 0:
            res[1] = 0
        lev1Npairs.append(((x,y), res[0], res[1]))
    print('Done {0} out of {1}'.format(i+1, len(topNwords)), end='\r')

Wall time: 8min 7s000


In [27]:
lev0pairs = [(x[0],x[1],x[2],abs(np.power(x[1], 0.7)*x[2])) for x in topNpairs]
lev1pairs = [(x[0],x[1],x[2],abs(np.power(x[1], 0.7)*x[2])) for x in lev1Npairs]

In [145]:
sorted([x for x in lev0pairs if x[1]>500], key=lambda x:x[3], reverse=True)[49]

(('bundl', 'vs'), 891.0, 0.53264896274449647, 61.852246563620675)

In [30]:
lev2features = sorted([x for x in lev1pairs if x[1]>500], key=lambda x:x[3], reverse=True)[:200]

In [31]:
lev2features

[(('watch', 'appl'), 678.0, 0.33636448859067331, 32.260487431563718),
 (('bundl', 'for'), 7224.0, 0.061343830060119349, 30.825867277197045),
 (('pink', 'set'), 846.0, 0.2538331126469659, 28.425461834900094),
 (('lularo', 'free'), 605.0, 0.25344401707211384, 22.444545001723153),
 (('vs', 'set'), 801.0, 0.2074577964798624, 22.360037211642016),
 (('kor', 'purs'), 554.0, 0.26726073150912733, 22.253175246968222),
 (('michael', 'purs'), 518.0, 0.27943088083607065, 22.19755964323581),
 (('black', 'and'), 1762.0, -0.11852321733252197, 22.182350580925579),
 (('new', 'nike'), 1001.0, 0.17026864000646513, 21.450554410060512),
 (('girl', 'american'), 1536.0, 0.1201719116314084, 20.430397700505264),
 (('one', 'xbox'), 990.0, 0.15687217094246642, 19.610584717934135),
 (('pink', 'vs'), 11599.0, 0.027715772979688077, 19.400877758686573),
 (('lularo', 'ship'), 511.0, 0.23518943649126223, 18.505995049130998),
 (('lularo', 'black'), 1287.0, 0.12260510052238358, 18.41677238228235),
 (('case', '6s'), 764.0

In [32]:
features_2lev = [f[0] for f in lev2features]
src_train['fcst_diff_lev1'] = src_hit.fcst_diff_lev1
for f1, f2 in features_2lev:
    src_train['f2_{0}_{1}'.format(f1, f2)] = (src_hit[f1]*1)*(src_hit[f2]*1)

In [33]:
X2 = src_train[['f2_{0}_{1}'.format(f1,f2) for f1,f2 in features_2lev]].values
Y2 = src_train.fcst_diff_lev1.values
(Xtrain,Ytrain),(Xtest,Ytest) = modutils.splitSample((X2, Y2), pcts = [0.7,0.3])

In [34]:
X2.shape, X.shape

((593376, 200), (593376, 1000))

In [39]:
%%time
#mod1 = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)
#mod1 = sklearn.linear_model.SGDRegressor(n_iter=100).fit(Xtrain, Ytrain) #15.5 vs 13.5
#mod1 = sklearn.linear_model.Lasso(alpha=1e-4).fit(Xtrain, Ytrain) #1e-4: 14.7 vs 13.1, 1e-5: 16.1 vs 13.6, 1e-3: 4.89
#mod1 = sklearn.linear_model.LassoLars(alpha=1e-5).fit(Xtrain, Ytrain) 
#mod1 = sklearn.linear_model.Ridge(alpha=1e1).fit(Xtrain, Ytrain) #def: 16.1 vs 13.5
mod1 = sklearn.linear_model.Lasso(alpha=1e-4).fit(Xtrain, Ytrain)

Wall time: 5.62 s


In [36]:
%%time
mod1 = sklearn.ensemble.GradientBoostingRegressor(min_samples_leaf=100, n_estimators=100, max_depth=2).fit(Xtrain, Ytrain)

Wall time: 3min 42s


In [37]:
%%time
mod1 = sklearn.ensemble.RandomForestRegressor(min_samples_leaf=100, n_estimators=10, max_depth=3).fit(Xtrain, Ytrain)

Wall time: 21.4 s


In [40]:
%%time
Ptrain = mod1.predict(Xtrain)
Ptest = mod1.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(sklearn.metrics.r2_score(Ytrain, Ptrain),
                                                  sklearn.metrics.r2_score(Ytest, Ptest))) 

Rsqr=0.0050 (train), 0.0046 (test)
Wall time: 443 ms


In [41]:
src_hit['fcst_diff_lev2'] = src_hit.fcst_diff_lev1 - mod1.predict(X2)

In [42]:
src_hit[['fcst_diff_simple', 'fcst_diff_lev1', 'fcst_diff_lev2']].describe()

Unnamed: 0,fcst_diff_simple,fcst_diff_lev1,fcst_diff_lev2
count,593376.0,593376.0,593376.0
mean,-0.000393,0.001877,0.000263
std,0.612034,0.565513,0.564125
min,-4.782042,-4.877501,-4.87756
25%,-0.405822,-0.35909,-0.359428
50%,-0.055312,-0.035914,-0.03738
75%,0.340884,0.313937,0.311331
max,4.625071,4.680613,4.680554


In [44]:
src['fcst_simple_title'] = src.fcst_simple + mod0.predict(X) + mod1.predict(X2)

In [45]:
src['fcst_diff_simple_title'] = np.log(1+src.price) - src.fcst_simple_title

In [46]:
src[['fcst_diff_simple', 'fcst_diff_simple_title']].describe()

Unnamed: 0,fcst_diff_simple,fcst_diff_simple_title
count,593376.0,593376.0
mean,-0.000393,0.000263
std,0.612034,0.564125
min,-4.782042,-4.87756
25%,-0.405822,-0.359428
50%,-0.055312,-0.03738
75%,0.340884,0.311331
max,4.625071,4.680554


In [47]:
sklearn.metrics.r2_score(np.log(1+src.price), src.fcst_log_simple)

AttributeError: 'DataFrame' object has no attribute 'fcst_log_simple'

In [None]:
sklearn.metrics.r2_score(np.log(1+src.price), src.fcst_log_simple_title)

In [None]:
src_res = src[[x for x in list(src.columns) if x != 'name_proc']].copy()

In [None]:
src_res.to_csv(dst_file, index=False, encoding='utf-8')