In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import tensorflow as tf
import sklearn, sklearn.metrics, sklearn.preprocessing, sklearn.linear_model, sklearn.ensemble, sklearn.model_selection
import nltk, nltk.stem
import re
import pickle

import collections

import modutils

data_dir = '../DataSets/MercariPrice/'
src_file = data_dir + 'train_title.csv' 
stat_file = '21MerPrice04_TextStat.csv'
w2vsrc_file = '21MerPrice04_W2VSrc.csv'

In [4]:
%%time
src = pd.read_csv(src_file)

Wall time: 5.57 s


In [5]:
re1 = re.compile('[^a-z0-9\[\]&\+]')
re2 = re.compile('[\[\]]')
re3 = re.compile('[0-9]+')
re4 = re.compile('[&\+]')

In [6]:
%%time
src['item_text'] = src.item_description.astype(str).map(lambda x: re4.sub(' and ', re3.sub(' <num> ', re2.sub(' ', re1.sub(' ', x.lower()).replace('[rm]','<rm>')))))

Wall time: 9.28 s


In [7]:
%%time
stemmer = nltk.stem.SnowballStemmer('english')
src['item_proc'] = src.item_text.astype(str).map(lambda x: [stemmer.stem(y.lower()) for y in x.split() if len(y) > 0])

Wall time: 4min 6s


In [8]:
%%time
word_dict = collections.Counter([y for x in src.item_proc for y in x if len(y) > 1])

Wall time: 4.17 s


In [9]:
allWords = list(sorted(list(word_dict.items()), key=lambda x: x[1], reverse=True))
len(allWords)

65021

In [10]:
sum([x for (_,x) in allWords[:6800]]) / sum([x for (_,x) in allWords])

0.9804709952583157

In [11]:
idx = 6800
allWords[idx], allWords[idx][1] / sum([x for (_,x) in allWords])

(('wildflow', 56), 3.7488962513045823e-06)

## Dictionary coverage properties:

<table>
<th>Dictionary size</th><th>Last word</th><th>Last-word frequency</th>
<tr><td>50% / 100 words</td><td>'by'</td><td>23600 / 0.16%</td></tr>
<tr><td>75% / 500 words</td><td>'decor'</td><td>4409 / 0.03%</td></tr>
<tr><td>90% / 1700 words</td><td>'butteri'</td><td>843 / 5.7e-3%</td></tr>
<tr><td>95% / 3200 words</td><td>'swaddl'</td><td>273 / 1.8e-3%</td></tr>
<tr><td>97% / 5000 words</td><td>'gameplay'</td><td>109 / 7.3e-4%</td></tr>
<tr><td>98% / 6800 words</td><td>'lap'</td><td>57 / 3.8e-4%</td></tr>
<tr><td>99% / 11500 words</td><td>'grovia'</td><td>17 / 1.1e-4%</td></tr>
<tr><td>99.5% / 20000 words</td><td>'goon'</td><td>5 / 3.4e-5%</td></tr>
<tr><td>99.7% / 29000 words</td><td>'cinnamarol'</td><td>2 / 1.3e-5%</td></tr>
<tr><td>99.9% / 53000 words</td><td>'volm'</td><td>1 / 6.7e-6%</td></tr>
</table>

In [12]:
topNwords = list(sorted(list(word_dict.items()), key=lambda x: x[1], reverse=True))[:6800]

In [13]:
%%time
statNwords = [(z[0],z[1],tuple(src.fcst_diff_simple_title[src.item_proc.map(lambda x: z[0] in x)].agg(['mean', 'std'])))
              for z in topNwords]

Wall time: 49min 51s


In [14]:
sorted(statNwords, key=lambda x: abs(x[2][0]/x[2][1])*(x[1]**0.5), reverse=True)[:10]

[('<num>', 848931, (0.051297657066670693, 0.58452474698813783)),
 ('box', 43846, (0.20456952926852379, 0.65494762688800934)),
 ('<rm>', 88744, (0.12990638636404458, 0.61274165886665655)),
 ('authent', 26868, (0.23631473579978479, 0.63025086254346996)),
 ('retail', 20313, (0.23935557697420937, 0.58869535329390232)),
 ('and', 408742, (0.05121744858037202, 0.58124049340977157)),
 ('with', 186851, (0.073057848124179856, 0.58620271856202033)),
 ('the', 260826, (0.057850948562996767, 0.5872065789159443)),
 ('come', 38352, (0.16481804952543094, 0.66297236020213213)),
 ('origin', 21377, (0.20382794468309259, 0.64026241892653202))]

In [15]:
with open(stat_file, 'wb') as f:
    pickle.dump(statNwords, f)

In [16]:
mapper = {x[0]:i for (i,x) in enumerate(statNwords)}

def word2idx(w):
    if w in mapper:
        return mapper[w]+1
    else:
        return 0
    
def idx2word(i):
    if i == 0:
        return '<unk>'
    if i-1 >= len(statNwords):
        return '<err>'
    return statNwords[i-1][0]

In [17]:
w2v_src = list(src.item_proc.map(lambda x: [word2idx(z) for z in x]))

In [18]:
len(w2v_src)

593376

In [19]:
with open(w2vsrc_file, 'wb') as f:
    pickle.dump(w2v_src, f)

In [2]:
#Load state

with open(stat_file, 'rb') as f:
    statNwords = pickle.load(f)

with open(w2vsrc_file, 'rb') as f:
    w2v_src = pickle.load(f)
    
mapper = {x[0]:i for (i,x) in enumerate(statNwords)}

def word2idx(w):
    if w in mapper:
        return mapper[w]+1
    else:
        return 0
    
def idx2word(i):
    if i == 0:
        return '<unk>'
    if i-1 >= len(statNwords):
        return '<err>'
    return statNwords[i-1][0]

In [3]:
def form_batch(data, ids):
    tmp = np.array([[data[r[0]][r[1]], data[r[0]][r[2]]] for r in ids])
    return tmp[:,0], tmp[:,1]

def yield_batch(data, batch_size, max_skip=3, p_take=0.8, num_batches=-1):
    batch_id = 0
    data_len = len(data)
    while True:
        if num_batches > 0 and batch_id > num_batches:
            break
        ids = []
        while len(ids) < batch_size:
            id0 = np.random.randint(data_len)
            idi = np.random.randint(len(data[id0]))
            for j in range(max(0, idi-max_skip), min(len(data[id0]), idi+max_skip+1)):
                if j==idi:
                    continue
                if np.random.uniform() > p_take:
                    continue
                ids.append((id0, idi, j))
        yield form_batch(data, ids[:batch_size])

In [53]:
DICT_SIZE = len(statNwords) + 1
EMBED_SIZE = 200

tf.reset_default_graph()

with tf.name_scope('Input'):
    tf_in_word = tf.placeholder(tf.int32, shape=(None, 1))
    tf_in_context = tf.placeholder(tf.int32, shape=(None, 1))
    
with tf.name_scope('Embedding'):
    tf_embedding = tf.Variable(np.random.normal(size=(DICT_SIZE, EMBED_SIZE)) / np.sqrt(DICT_SIZE * EMBED_SIZE))
    tf_embedded_word = tf.nn.embedding_lookup(tf_embedding, tf_in_word)
    
with tf.name_scope('Training'):
    tf_nce_beta = tf.Variable(np.random.normal(size=(DICT_SIZE, EMBED_SIZE))/np.sqrt(EMBED_SIZE))
    tf_nce_intercept = tf.Variable(np.zeros(size=(DICT_SIZE))
    tf.nn.nce_loss()

print('Graph creation complete.')

[[  76    7]
 [  76  410]
 [  76    1]
 [   1 1805]
 [   1   21]
 [   1   56]
 [   1 1624]
 [   1    2]
 [ 131 1251]
 [ 131   57]]


In [36]:
tmp = np.random.normal(size=(32,1))  / np.sqrt(32)
np.sqrt(np.dot(tmp.transpose(), tmp))

array([[ 0.89204199]])

## Simple linear model

In [112]:
%%time
src_var = src.copy()
for (x,_,_) in tmpDict:
    src_var['f_{0}'.format(x)] = 1*src_var.item_proc.map(lambda z: x in z)

Wall time: 15min 28s


In [113]:
%%time
X = src_var[['f_{}'.format(x) for (x,_,_) in tmpDict]].values
Y = src_var.fcst_diff_simple_title.values
(Xtrain, Ytrain), (Xvalid, Yvalid), (Xtest, Ytest) = modutils.splitSample((X,Y), [0.3,0.2,0.5])

Wall time: 2min 5s
Parser   : 131 ms


In [114]:
%%time
#mgb0 = sklearn.ensemble.GradientBoostingRegressor(min_samples_leaf=100).fit(Xtrain, Ytrain)

Wall time: 0 ns


In [115]:
%%time
mlr0 = sklearn.linear_model.LinearRegression().fit(Xtrain, Ytrain)

Wall time: 3min 37s


In [116]:
%%time
Ptrain = mlr0.predict(Xtrain)
Ptest = mlr0.predict(Xtest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(sklearn.metrics.r2_score(Ytrain, Ptrain),
                                                  sklearn.metrics.r2_score(Ytest, Ptest))) 

Rsqr=0.1150 (train), 0.0945 (test)
Wall time: 25.3 s
