In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime, time
import tensorflow as tf
import sklearn, sklearn.metrics, sklearn.preprocessing, sklearn.linear_model, sklearn.ensemble, sklearn.model_selection
import nltk, nltk.stem
import re
import pickle

import collections

import modutils

# Input location and the two cache files this notebook writes and later reloads.
data_dir = '../DataSets/MercariPrice/'
src_file = data_dir + 'train_title.csv' 
# NOTE: both files below are written with pickle.dump and read with pickle.load
# further down, so their content is a pickle stream despite the .csv extension.
stat_file = '21MerPrice04_TextStat.csv'
w2vsrc_file = '21MerPrice04_W2VSrc.csv'

In [4]:
%%time
# Load the source table. Later cells read its `item_description` and
# `fcst_diff_simple_title` columns, so both must be present in the CSV.
src = pd.read_csv(src_file)

Wall time: 5.57 s


In [5]:
# Text-cleaning patterns used to build `item_text`.
# Raw strings are used so that `\[`, `\]` and `\+` reach the regex engine verbatim;
# in a normal string literal they are invalid escape sequences and trigger a
# DeprecationWarning / SyntaxWarning on recent Python versions.
re1 = re.compile(r'[^a-z0-9\[\]&\+]')  # any character other than a-z, 0-9, '[', ']', '&', '+'
re2 = re.compile(r'[\[\]]')            # a single square bracket
re3 = re.compile(r'[0-9]+')            # a run of digits
re4 = re.compile(r'[&\+]')             # '&' or '+'

In [6]:
%%time
def _normalise_description(raw):
    """Lower-case one description and reduce it to letters, `<rm>`, `<num>` and `and` tokens.

    The steps run in the same order as the original nested expression:
    strip disallowed characters, mark `[rm]` placeholders, drop remaining
    brackets, replace digit runs, then spell out `&` / `+`.
    """
    text = re1.sub(' ', raw.lower())
    text = text.replace('[rm]', '<rm>')
    text = re2.sub(' ', text)
    text = re3.sub(' <num> ', text)
    return re4.sub(' and ', text)

src['item_text'] = src.item_description.astype(str).map(_normalise_description)

Wall time: 9.28 s


In [7]:
%%time
stemmer = nltk.stem.SnowballStemmer('english')

# Stemming is a pure function of the token, and the corpus repeats a comparatively
# small vocabulary many times over, so each distinct token is stemmed only once.
_stem_cache = {}

def _stem_cached(token):
    """Return the Snowball stem of `token`, computing it only on first sight."""
    stem = _stem_cache.get(token)
    if stem is None:
        stem = _stem_cache[token] = stemmer.stem(token.lower())
    return stem

# str.split() with no separator never yields empty strings, so no length filter is needed.
src['item_proc'] = src.item_text.astype(str).map(lambda text: [_stem_cached(tok) for tok in text.split()])

Wall time: 4min 6s


In [8]:
%%time
# Corpus-wide frequency of every stemmed token longer than one character.
word_dict = collections.Counter(
    token
    for tokens in src.item_proc
    for token in tokens
    if len(token) > 1
)

Wall time: 4.17 s


In [9]:
# (word, count) pairs, most frequent first; ties keep the Counter's insertion order.
allWords = word_dict.most_common()
len(allWords)

65021

In [10]:
# Share of all token occurrences covered by the 6800 most frequent words.
sum(count for (_, count) in allWords[:6800]) / sum(count for (_, count) in allWords)

0.9804709952583157

In [11]:
# Inspect the word at rank `idx` together with its share of all token occurrences.
idx = 6800
allWords[idx], allWords[idx][1] / sum(count for (_, count) in allWords)

(('wildflow', 56), 3.7488962513045823e-06)

## Dictionary coverage properties:

<table>
<tr><th>Dictionary size</th><th>Last word</th><th>Last-word frequency</th></tr>
<tr><td>50% / 100 words</td><td>'by'</td><td>23600 / 0.16%</td></tr>
<tr><td>75% / 500 words</td><td>'decor'</td><td>4409 / 0.03%</td></tr>
<tr><td>90% / 1700 words</td><td>'butteri'</td><td>843 / 5.7e-3%</td></tr>
<tr><td>95% / 3200 words</td><td>'swaddl'</td><td>273 / 1.8e-3%</td></tr>
<tr><td>97% / 5000 words</td><td>'gameplay'</td><td>109 / 7.3e-4%</td></tr>
<tr><td>98% / 6800 words</td><td>'lap'</td><td>57 / 3.8e-4%</td></tr>
<tr><td>99% / 11500 words</td><td>'grovia'</td><td>17 / 1.1e-4%</td></tr>
<tr><td>99.5% / 20000 words</td><td>'goon'</td><td>5 / 3.4e-5%</td></tr>
<tr><td>99.7% / 29000 words</td><td>'cinnamarol'</td><td>2 / 1.3e-5%</td></tr>
<tr><td>99.9% / 53000 words</td><td>'volm'</td><td>1 / 6.7e-6%</td></tr>
</table>

In [12]:
# The 6800 most frequent (word, count) pairs, in descending count order.
topNwords = word_dict.most_common()[:6800]

In [13]:
%%time
# For every top word: (word, corpus count, (mean, std) of fcst_diff_simple_title
# over the rows whose token list contains the word).
#
# The corpus is inverted once (word -> positions of the rows containing it) instead
# of re-scanning every token list for each of the top words.
top_vocab = {word for (word, _) in topNwords}
rows_by_word = collections.defaultdict(list)
for row_pos, tokens in enumerate(src.item_proc):
    # The set intersection de-duplicates repeated tokens, so a row is recorded
    # at most once per word, matching the `word in tokens` membership test.
    for word in top_vocab.intersection(tokens):
        rows_by_word[word].append(row_pos)

# Positions are appended in ascending order, so .iloc picks the same rows, in the
# same order, as a boolean mask over the column would.
target = src.fcst_diff_simple_title
statNwords = [(word, count, tuple(target.iloc[rows_by_word[word]].agg(['mean', 'std'])))
              for (word, count) in topNwords]

Wall time: 49min 51s


In [14]:
def _signal_strength(entry):
    """Score a (word, count, (mean, std)) entry as |mean / std| * sqrt(count)."""
    count = entry[1]
    mean, std = entry[2][0], entry[2][1]
    return abs(mean / std) * (count ** 0.5)

# Ten words with the largest score.
sorted(statNwords, key=_signal_strength, reverse=True)[:10]

[('<num>', 848931, (0.051297657066670693, 0.58452474698813783)),
 ('box', 43846, (0.20456952926852379, 0.65494762688800934)),
 ('<rm>', 88744, (0.12990638636404458, 0.61274165886665655)),
 ('authent', 26868, (0.23631473579978479, 0.63025086254346996)),
 ('retail', 20313, (0.23935557697420937, 0.58869535329390232)),
 ('and', 408742, (0.05121744858037202, 0.58124049340977157)),
 ('with', 186851, (0.073057848124179856, 0.58620271856202033)),
 ('the', 260826, (0.057850948562996767, 0.5872065789159443)),
 ('come', 38352, (0.16481804952543094, 0.66297236020213213)),
 ('origin', 21377, (0.20382794468309259, 0.64026241892653202))]

In [15]:
# Cache the per-word statistics so the expensive cell above need not be re-run.
# The payload is a pickle stream (binary mode), whatever extension stat_file carries.
with open(stat_file, 'wb') as f:
    pickle.dump(statNwords, f)

In [16]:
# word -> zero-based position of that word in statNwords
mapper = {entry[0]: position for position, entry in enumerate(statNwords)}

def word2idx(w):
    """Map a word to its dictionary index: position in `mapper` plus one, or 0 if unknown."""
    # Unknown words default to -1 so that the shared "+ 1" lands them on index 0.
    return mapper.get(w, -1) + 1
    
def idx2word(i):
    """Inverse of word2idx: map a dictionary index back to its word.

    Returns '<unk>' for index 0, the word stored at statNwords[i-1] for
    1 <= i <= len(statNwords), and '<err>' for any index outside that range.
    """
    if i == 0:
        return '<unk>'
    # Reject negative indices as well: statNwords[i-1] would otherwise wrap around
    # and silently return a word from the end of the list.
    if i < 0 or i - 1 >= len(statNwords):
        return '<err>'
    return statNwords[i-1][0]

In [17]:
# Every description as a list of dictionary indices (0 = word outside the dictionary).
w2v_src = [[word2idx(token) for token in tokens] for tokens in src.item_proc]

In [5]:
len(w2v_src)

593376

In [19]:
# Cache the index-encoded corpus. The payload is a pickle stream (binary mode),
# whatever extension w2vsrc_file carries.
with open(w2vsrc_file, 'wb') as f:
    pickle.dump(w2v_src, f)

In [2]:
#Load state

# Restore the two caches written earlier in this notebook.
# NOTE: pickle.load executes arbitrary code from the file; only load caches that
# this notebook produced itself.
with open(stat_file, 'rb') as f:
    statNwords = pickle.load(f)

with open(w2vsrc_file, 'rb') as f:
    w2v_src = pickle.load(f)
    
# word -> zero-based position of that word in statNwords
mapper = {entry[0]: position for position, entry in enumerate(statNwords)}

def word2idx(w):
    """Map a word to its dictionary index: position in `mapper` plus one, or 0 if unknown."""
    # Unknown words default to -1 so that the shared "+ 1" lands them on index 0.
    return mapper.get(w, -1) + 1
    
def idx2word(i):
    """Inverse of word2idx: map a dictionary index back to its word.

    Returns '<unk>' for index 0, the word stored at statNwords[i-1] for
    1 <= i <= len(statNwords), and '<err>' for any index outside that range.
    """
    if i == 0:
        return '<unk>'
    # Reject negative indices as well: statNwords[i-1] would otherwise wrap around
    # and silently return a word from the end of the list.
    if i < 0 or i - 1 >= len(statNwords):
        return '<err>'
    return statNwords[i-1][0]

In [165]:
def form_batch(data, ids):
    """Turn (sequence, word position, context position) references into two aligned arrays.

    For each reference r, the pair (data[r[0]][r[1]], data[r[0]][r[2]]) is looked up;
    the first elements are returned as one array and the second elements as another.
    """
    pairs = []
    for ref in ids:
        sequence = data[ref[0]]
        pairs.append([sequence[ref[1]], sequence[ref[2]]])
    pairs = np.array(pairs)
    return pairs[:, 0], pairs[:, 1]

def yield_batch(data, batch_size, p_word = 1, p_context = ((-1, 0.8), (1, 0.8)), num_batches=-1, verbose=True):
    """Generate random (word, context) training batches from index-encoded sequences.

    Parameters
    ----------
    data : sequence of sequences of int
        One list of dictionary indices per document.
    batch_size : int
        Number of (word, context) pairs per yielded batch.
    p_word : scalar, list or numpy array
        If a list/array, p_word[idx] is the probability of keeping a sampled
        centre word with dictionary index idx; any other value disables
        this sub-sampling.
    p_context : iterable of (offset, probability)
        Relative context positions and the probability of keeping each one.
    num_batches : int
        Stop after this many batches; values <= 0 yield batches indefinitely.
    verbose : bool
        When True, print a progress line per batch and a completion line.
        When False, nothing is printed.

    Yields
    ------
    Whatever form_batch returns for exactly batch_size sampled references.

    Raises
    ------
    ValueError
        If no sequence in `data` is long enough to produce a pair with any
        positive-probability offset (sampling could otherwise never finish).
    """
    context = list(p_context)
    data_len = len(data)

    # Fail fast instead of spinning forever in the rejection-sampling loop below.
    reachable = [abs(rj) for (rj, prob) in context if prob > 0]
    if not reachable:
        raise ValueError('p_context has no offset with positive probability')
    min_offset = min(reachable)
    if not any(len(seq) > min_offset for seq in data):
        raise ValueError('no sequence in data is long enough to form a (word, context) pair')

    subsample_words = isinstance(p_word, (list, np.ndarray))

    batch_id = 0
    while True:
        batch_id += 1
        if num_batches > 0:
            if batch_id > num_batches:
                if verbose:
                    # Trailing tabs overwrite the '\r'-terminated progress line.
                    print('Completed yielding batches {}\t\t'.format(num_batches))
                break
            if verbose:
                print('Yielding batch {} out of {}'.format(batch_id, num_batches), end='\r')
        ids = []
        while len(ids) < batch_size:
            # Pick a random document, then a random centre position inside it.
            id0 = np.random.randint(data_len)
            if len(data[id0]) == 0:
                continue
            idi = np.random.randint(len(data[id0]))
            if subsample_words:
                idx = data[id0][idi]
                if np.random.uniform() > p_word[idx]:
                    continue
            for (rj, prob) in context:
                j = idi + rj
                if j < 0 or j >= len(data[id0]):
                    continue
                if np.random.uniform() > prob:
                    continue
                ids.append((id0, idi, j))

        # One centre word can add several pairs, so trim any overshoot.
        yield form_batch(data, ids[:batch_size])

In [267]:
# Build the word2vec-style graph (TensorFlow 1.x graph mode).
# Dictionary indices run from 0 to len(statNwords); word2idx reserves 0 for unknown words.
DICT_SIZE = len(statNwords) + 1
EMBED_SIZE = 25
NCE_NUM_SAMPLED = 50

# Start from an empty default graph so re-running this cell does not accumulate nodes.
tf.reset_default_graph()

with tf.name_scope('Input'):
    # Centre-word indices, shape (batch,); context-word indices, shape (batch, 1).
    tf_in_word = tf.placeholder(tf.int32, shape=(None, ), name='in_word')
    tf_in_context = tf.placeholder(tf.int32, shape=(None, 1), name='in_context')
    
with tf.name_scope('Embedding'):
    # One EMBED_SIZE-dimensional row per dictionary index, initialised from a standard normal.
    tf_embedding = tf.Variable(np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE), dtype=tf.float32)
    tf_embedded_word = tf.nn.embedding_lookup(tf_embedding, tf_in_word, name='out_embedding')
    
with tf.name_scope('Training'):
    # Output-side weights and biases for the NCE loss, also randomly initialised.
    tf_nce_beta = tf.Variable(np.random.multivariate_normal(np.zeros(EMBED_SIZE), np.identity(EMBED_SIZE), size=DICT_SIZE), dtype=tf.float32)
    tf_nce_intercept = tf.Variable(np.random.normal(size=(DICT_SIZE,)), dtype=tf.float32)
    tf_nce_loss = tf.reduce_mean(
                    tf.nn.nce_loss(weights=tf_nce_beta, biases=tf_nce_intercept,
                                   labels=tf_in_context, inputs=tf_embedded_word,
                                   num_sampled=NCE_NUM_SAMPLED, num_classes=DICT_SIZE))
    # Mean squared deviation of each embedding row's L2 norm from 1.
    tf_reg_loss = tf.reduce_mean(tf.square(tf.sqrt(tf.reduce_sum(tf.square(tf_embedding), axis=1)) - 1))
    # The regularisation term is currently disabled: only the NCE loss is optimised,
    # tf_reg_loss is evaluated for reporting only.
    tf_full_loss = tf_nce_loss# + tf_reg_loss
    tf_train = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(tf_full_loss)
    
with tf.name_scope('Validation'):
    # Cosine similarity between the fed words and every dictionary entry:
    # both sides are L2-normalised row-wise and then multiplied.
    tf_valid_dictionary = tf.constant(np.array(range(DICT_SIZE)))
    tf_valid_embedding = tf.nn.embedding_lookup(tf_embedding, tf_valid_dictionary)
    tf_valid_in_norm = tf_embedded_word / tf.sqrt(tf.reduce_sum(tf.square(tf_embedded_word), 1, keep_dims=True))
    tf_valid_dic_norm = tf_valid_embedding / tf.sqrt(tf.reduce_sum(tf.square(tf_valid_embedding), 1, keep_dims=True))
    tf_valid_similarity = tf.matmul(tf_valid_in_norm, tf_valid_dic_norm, transpose_b=True)
    
# NOTE(review): hard-coded absolute log directory; the writer is never closed in this notebook.
tffw = tf.summary.FileWriter('D:/Jupyter/Logs/00_B', tf.get_default_graph())
    
print('Graph creation complete.')

Graph creation complete.


In [241]:
%%time
# Draw a fixed validation sample: 16 batches of 32768 (word, context) pairs each,
# then concatenate the word parts and the context parts into two flat arrays.
valid_set = list(yield_batch(w2v_src, batch_size=32768, num_batches=16))
valid_x, valid_y = (np.hstack(parts) for parts in zip(*valid_set))

Completed yielding batches 16		
Wall time: 7.92 s


In [268]:
# Checkpoint writer; max_to_keep=5 limits how many checkpoint files are retained.
tfsSaver = tf.train.Saver(max_to_keep=5)

# Probe words whose nearest neighbours are printed after every epoch as a sanity check.
# Words missing from the dictionary map to index 0 via word2idx.
simvalid_x = np.array([word2idx('two'), word2idx('this'), word2idx('awesom'), word2idx('bad'),
                       word2idx('price'), word2idx('ring'), word2idx('xbox'), word2idx('call'),
                      word2idx('book'), word2idx('shirt'), word2idx('<num>'), word2idx('<rm>')])
simvalid_dict = {tf_in_word: simvalid_x}
# The context placeholder is declared with shape (None, 1), hence the reshape.
valid_dict = {tf_in_word: valid_x, tf_in_context: valid_y.reshape(-1, 1)}

# Sub-sampling hyper-parameters; only referenced by the commented-out expression below.
hp_w2v_num0 = 50
hp_w2v_alpha = -0.5

# Per-index corpus counts, with hp_w2v_num0 standing in for index 0.
p_w2v_wordnum = np.array([hp_w2v_num0] + [x[1] for x in statNwords])
# Frequency-based word sub-sampling is currently switched off (scalar 1 = keep every word).
p_w2v_word = 1 #np.power(np.maximum(1, p_w2v_wordnum / hp_w2v_num0), hp_w2v_alpha) 
# (relative offset, keep probability) pairs passed to yield_batch as p_context.
p_w2v_context = [(-2, 0.3), (-1, 0.8), (1, 0.8), (2, 0.5), (3, 0.2)]
#p_w2v_context = [(-1, 0.2), (1, 0.8), (2, 0.5)]

In [269]:
# Train the embedding: num_epochs rounds of 10000 batches of 512 pairs each,
# with validation loss, nearest-neighbour printout and a checkpoint after every round.
num_epochs = 100

with tf.Session() as tfs:
    tfs.run(tf.global_variables_initializer())
    
    # Baseline before any training step: similarities and losses on the random initialisation.
    sim = tf_valid_similarity.eval(feed_dict=simvalid_dict)
    [nce_loss, reg_loss] = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
    print('Starting loss={:.3f} ({:.3f} reg-loss)'.format(nce_loss, reg_loss))
    for q in range(len(sim)):
        # Ten most similar dictionary entries for each probe word, most similar first.
        print([idx2word(z) for z in list(reversed(sim[q,:].argsort()))[:10]])
        
    for i in range(num_epochs):
        t0 = time.perf_counter()
        for (train_x, train_y) in yield_batch(w2v_src, p_word=p_w2v_word, p_context=p_w2v_context,
                                              batch_size=512, num_batches=10000, verbose=False):
            train_dict = {tf_in_word: train_x, tf_in_context: train_y.reshape(-1, 1)}
            tf_train.run(feed_dict=train_dict)

        sim = tf_valid_similarity.eval(feed_dict=simvalid_dict)
        [nce_loss, reg_loss] = tfs.run([tf_nce_loss, tf_reg_loss], feed_dict=valid_dict)
        # Row-normalised embedding matrix; later cells read dic_embed from the namespace.
        dic_embed = tf_valid_dic_norm.eval()
        # The reported time covers the training batches and the validation evaluation above.
        t1 = time.perf_counter()
        print('Step complete in {0:.2f} sec, loss={1:.3f} ({2:.3f} reg-loss)'.format(t1-t0, nce_loss, reg_loss))
        for q in range(len(sim)):
            print([idx2word(z) for z in list(reversed(sim[q,:].argsort()))[:10]])
        # NOTE(review): hard-coded absolute checkpoint directory — presumably it must already exist; confirm.
        p = tfsSaver.save(tfs, 'D:/Jupyter/Models-21MerPrice04-W2V/model-{0:02d}.ckpt'.format(i))
        print('Model saved at checkpoint: {0}'.format(p))
    
print('Complete')

Starting loss=215.411 (16.001 reg-loss)
['two', 'jaw', 'offer', 'choker', 'worm', 'exact', 'th', 'ninja', 'nikki', 'coca']
['this', 'earphon', 'steam', 'eastern', 'toofac', 'polyamid', 'exposur', 'exact', 'banner', 'theft']
['awesom', 'majest', 'hippi', 'mud', 'dressi', 'licens', 'densiti', 'minim', 'odorless', 'snap']
['bad', 'extend', 'dd', 'grape', 'turkey', 'druzi', 'blueberri', 'infal', 'er', 'fring']
['price', 'ram', 'winchest', 'januari', 'bailey', 'advis', 'keurig', 'simpson', 'unlimit', 'rope']
['ring', 'shore', 'kevin', 'scallop', 'ciat', 'tube', 'refil', 'os', 'tartelett', 'useabl']
['xbox', 'research', 'sheep', 'earli', 'teapot', 'marker', 'minus', 'orlando', 'cactus', 'processor']
['call', 'superman', 'plain', 'brooch', 'bitti', 'cord', 'apo', 'grillz', 'appl', 'rescu']
['book', 'bauer', 'forc', 'vintag', 'eau', 'june', 'randi', 'freshen', 'creativ', 'cam']
['shirt', 'foundat', 'marl', 'nivea', 'jessica', 'bronz', 'swag', 'scoobi', 'scoop', 'rosett']
['<num>', 'egg', 'espr

KeyboardInterrupt: 

In [None]:
word2idx('ring'), word2idx('shirt'), word2idx('book'), word2idx('xbox')

In [270]:
np.mean(dic_embed, axis=0)

array([ 0.12082284, -0.11939144,  0.12321651,  0.12471975,  0.10879699,
       -0.12246475,  0.12836537, -0.13117318, -0.13070413,  0.13072409,
        0.11504891,  0.12889196, -0.12317298, -0.10389047,  0.12465525,
       -0.12415253,  0.10928587,  0.12867165, -0.11030703,  0.12536304,
       -0.12166201,  0.10459193,  0.13441706,  0.13206217, -0.13261577], dtype=float32)

In [271]:
np.std(dic_embed, axis=0)

array([ 0.15574615,  0.15616973,  0.15664187,  0.15925401,  0.1541833 ,
        0.15634127,  0.16082509,  0.16339707,  0.16245647,  0.16218348,
        0.15690722,  0.15919511,  0.15955788,  0.15215985,  0.15896773,
        0.15876476,  0.152426  ,  0.15973669,  0.15225206,  0.16068704,
        0.16040784,  0.15007742,  0.15788537,  0.1597278 ,  0.16191949], dtype=float32)

In [272]:
dic_embed[:10]

array([[ 0.2173498 , -0.22903879,  0.22286132,  0.13756789,  0.1990371 ,
        -0.16451138,  0.19596237, -0.18101329, -0.18045586,  0.16646686,
         0.23006666,  0.17033021, -0.19361262, -0.1999778 ,  0.186573  ,
        -0.17190142,  0.19666769,  0.21611533, -0.19643372,  0.22396116,
        -0.15412666,  0.22039597,  0.23910256,  0.23738678, -0.22321671],
       [ 0.23985499, -0.21388969,  0.25334546,  0.11352131,  0.2127897 ,
        -0.19762003,  0.16952735, -0.19452524, -0.1572832 ,  0.17038959,
         0.21420483,  0.12099274, -0.19096471, -0.19581595,  0.18808624,
        -0.14558859,  0.17790687,  0.21009009, -0.22555663,  0.23373504,
        -0.15956974,  0.23497406,  0.2331166 ,  0.2403014 , -0.22135891],
       [ 0.2165428 , -0.21533097,  0.22044113,  0.1540786 ,  0.19411729,
        -0.15430568,  0.18733428, -0.16363351, -0.18572168,  0.17800497,
         0.23226233,  0.2057184 , -0.20317858, -0.19071822,  0.18615259,
        -0.18618186,  0.21078692,  0.20230702, -0

## Simple linear model

In [112]:
%%time
# Add one 0/1 indicator column per dictionary word: f_<word> is 1 when the word
# occurs in the row's token list.
# NOTE(review): `tmpDict` is not defined anywhere in the visible notebook; the loop
# unpacks 3-tuples from it, so it presumably holds statNwords-style entries — confirm.
src_var = src.copy()
for (x,_,_) in tmpDict:
    src_var['f_{0}'.format(x)] = 1*src_var.item_proc.map(lambda z: x in z)

Wall time: 15min 28s


In [113]:
%%time
# Design matrix of the f_<word> indicator columns and the regression target.
# NOTE(review): relies on the same undefined `tmpDict` as the feature-building cell.
X = src_var[['f_{}'.format(x) for (x,_,_) in tmpDict]].values
Y = src_var.fcst_diff_simple_title.values
# NOTE(review): modutils.splitSample is a project helper not shown here; the
# [0.3,0.2,0.5] argument presumably gives train/valid/test fractions — confirm.
(Xtrain, Ytrain), (Xvalid, Yvalid), (Xtest, Ytest) = modutils.splitSample((X,Y), [0.3,0.2,0.5])

Wall time: 2min 5s
Parser   : 131 ms


In [114]:
%%time
#mgb0 = sklearn.ensemble.GradientBoostingRegressor(min_samples_leaf=100).fit(Xtrain, Ytrain)

Wall time: 0 ns


In [115]:
%%time
# Ordinary least squares on the word-indicator features.
mlr0 = sklearn.linear_model.LinearRegression()
mlr0.fit(Xtrain, Ytrain)

Wall time: 3min 37s


In [116]:
%%time
# In-sample and out-of-sample fit of the linear model.
Ptrain = mlr0.predict(Xtrain)
Ptest = mlr0.predict(Xtest)
r2_train = sklearn.metrics.r2_score(Ytrain, Ptrain)
r2_test = sklearn.metrics.r2_score(Ytest, Ptest)
print('Rsqr={:.4f} (train), {:.4f} (test)'.format(r2_train, r2_test))

Rsqr=0.1150 (train), 0.0945 (test)
Wall time: 25.3 s


In [22]:
%%time
# Pairwise Euclidean distances between all rows of dic_embed.
# Each row is compared against the whole matrix in one vectorised step, rather than
# running one Python-level dot product per (i, j) pair; dists[i][j] keeps its meaning.
dists = np.array([np.sqrt(np.square(dic_embed - row).sum(axis=1)) for row in dic_embed])

Wall time: 6min 47s


In [23]:
ndists = np.array(dists)

In [34]:
# Row-index and column-index grids matching the distance matrix:
# allids_i[a, b] == a and allids_j[a, b] == b.
# np.indices builds both directly, without nested Python lists over n*n cells.
allids_i, allids_j = np.indices((len(dic_embed), len(dic_embed)))

In [57]:
# Pairs of dictionary entries whose distance exceeds 0.7 (which already implies > 0).
mask = ndists > 0.7
# Keep each unordered pair once (i < j): the strict upper triangle, scanned in row-major order.
pair_rows, pair_cols = np.nonzero(np.triu(mask, k=1))
res = [(idx2word(i), idx2word(j), ndists[i, j], i, j) for (i, j) in zip(pair_rows, pair_cols)]

In [131]:
# Ten randomly chosen dictionary entries (drawn with replacement) for eyeballing.
# No random seed is set, so the sample differs on every run.
tmp = [statNwords[i] for i in np.random.randint(0, len(statNwords), 10)]

In [147]:
# Preview a frequency-based keep weight: (max(num0, count) / num0) ** alpha,
# i.e. 1 for words seen at most num0 times and decaying for more frequent ones.
alpha = -0.7
num0 = 50
[(entry[0], entry[1], np.power(max(num0, entry[1]), alpha) / np.power(num0, alpha))
 for entry in tmp]

[('bike', 300, 0.28529497656828423),
 ('singl', 1636, 0.087022327248350337),
 ('abov', 2700, 0.061280791801226107),
 ('philip', 62, 0.86021066043918659),
 ('mouthwash', 76, 0.74594960520796938),
 ('dolc', 677, 0.16138640034844332),
 ('screwdriv', 109, 0.5795360474360336),
 ('aso', 506, 0.19786712395596842),
 ('bacteri', 78, 0.73250868985133333),
 ('team', 1083, 0.11615531406126813)]

In [154]:
type(np.array([1,2])) in (list, np.ndarray) 

True

In [155]:
type(np.array([1,2]))

numpy.ndarray

In [157]:
tmp

[('bike', 300, 1.7689360204744258),
 ('singl', 1636, 2.0959375092312422),
 ('abov', 2700, 2.2036183309053405),
 ('philip', 62, 1.5109119946911631),
 ('mouthwash', 76, 1.5419892968416637),
 ('dolc', 677, 1.9189288261197288),
 ('screwdriv', 109, 1.5986104580964025),
 ('aso', 506, 1.8638675687448392),
 ('bacteri', 78, 1.5459998956507686),
 ('team', 1083, 2.0112351276167502)]