In [325]:
import tensorflow as tf
import numpy as np
import pandas as pd
import datetime, time

from sklearn.linear_model import LinearRegression

import json
import re, pylev
from collections import Counter

def shuffleBatches(tensorTuple, batchSize=64):
    if type(tensorTuple) is list or type(tensorTuple) is tuple: 
        ids = list(range(tensorTuple[0].shape[0]))
        np.random.shuffle(ids)
        for i in range(0,len(ids),batchSize):
            lst = min(len(ids), i + batchSize)
            yield (np.array(x[ids[i:lst],]) for x in tensorTuple)
    else:
        ids = list(range(tensorTuple.shape[0]))
        np.random.shuffle(ids)
        for i in range(0,len(ids),batchSize):
            lst = min(len(ids), i + batchSize)
            yield np.array(tensorTuple[ids[i:lst],])
            
def forecast_timeleft(steps, times, final_step, alpha=0.1):
    if len(times) < 2:
        return 0
    vx = np.array(steps)
    vt = np.array(times)
    dx = vx[1:]-vx[:-1]
    dt = vt[1:]-vt[:-1]
    sx = vx[:-1]
    nf = 1 / np.mean(dx)
    nx = sx * nf
    dnx = dx * nf
    dtx = dt / dx
    wx0 = np.exp( - (nx[-1] - nx) * alpha)
    wx = wx0 / np.sum(wx0)
    
    y = dtx
    x = np.array([nx, dx])
    
  #  if(len(y) > 50):
  #      print(wx[-10:])
    
    lr = LinearRegression().fit(np.transpose(x), y, sample_weight = wx)
    
    res = 0
    for j in range(np.max(vx), final_step, int(1/nf)):
        xf = np.array([j*nf, 1]).reshape([1,2])
        res += lr.predict(xf)[0] / nf
    return res

In [3]:
src0 = pd.read_csv('../DataSets/UsedCars/autos_utf8.csv', low_memory=False)

In [65]:
rep_pattern = '[\+\\:;,\[\]\(\)]'
del_pattern = '[^a-z0-9_.]'
src1 = [re.sub(pattern=del_pattern, string=re.sub(pattern=rep_pattern, string=x.lower(), repl='_'), repl='') for x in list(src0.name) if type(x) is str]
src = [[y for y in x.split('_') if y != ''] for x in src1]

src_flat = [y for x in src for y in x]
src_count = list(dict(Counter(src_flat)).items())

In [60]:
src_fullness = [(i, sum([x[1] for x in src_pairs if x[1] > i])/len(src_flat), sum([1 for x in src_pairs if x[1]>i])) for i in range(100)]

In [62]:
#fix 5k dictionary size, which corresponds to at least 14 repeats of the word and 4.7% missing
#if 10k: at least 4 repeats and 2.7% missing
#if 2k: at least 60 repeats and 8.7% missin
#src_fullness[:50]

In [245]:
def word_prob(p1, p2, pow_base):
    low_err = abs(len(p1[0])-len(p2[0]))
    if np.power(pow_base, low_err) * float(p1[1] / p2[1]) > 1.0:
        return 1.0
    s1 = set(p1[0])
    s2 = set(p2[0])
    low_err = min(len(s1.difference(s2)), len(s2.difference(s1)))
    if np.power(pow_base, low_err) * float(p1[1] / p2[1]) > 1.0:
        return 1.0
    
    num_err = pylev.damerau_levenshtein(p1[0], p2[0])
    res = np.power(pow_base, num_err) * float(p1[1] / p2[1])
    
    #print(p1[0], ' - ', p2, ' / ', res)
    return res

def word_subs(p, plist):
    bestp = (p[0], p[1], 1)
    pow_base = max(10.0, 50.0 / min(10, len(p[0])))
    pow_basep = pow_base * p[1]
    for (i,p2) in enumerate(plist):
        if p2[1] < pow_basep:
            break
        if p[0] == p2[0]:
            continue
        prob = word_prob(p, p2, pow_base)
        if bestp is None or bestp[2] > prob:
            bestp = (p2[0], p2[1], prob)
    return bestp

In [10]:
r1_freq = 30
r1_mapper = {x:x for x,y in src_pairs if y >= r1_freq}
r1_mapper.update({x:'<UNK>' for x,y in src_pairs if y < r1_freq})
r1_src = [[r1_mapper[y] for y in x] for x in src]
r1_flat = [y for x in r1_src for y in x]

In [240]:
ts = []
for x in src_count[40000:40100]:
    t0 = time.perf_counter()
    for j in range(1):
        word_subs(x, src_count_s)
    t1 = time.perf_counter()
    ts.append((x, t1-t0))

In [243]:
[x for x in enumerate(ts) if x[1][1]>0.1]

[(17, (('ibizza', 1), 0.10464752308325842)),
 (60, (('mann..', 1), 0.12278421697556041)),
 (61, (('18535', 1), 0.10563904905575328)),
 (63, (('20.600', 1), 0.10161857362254523)),
 (79, (('vollvoll', 1), 0.11392920292564668))]

In [235]:
word_subs(src_count[40007], src_count_s)

eigentlich  -  ('bmw', 38077)  /  27.538303963
eigentlich  -  ('volkswagen', 37411)  /  28.0285477533
eigentlich  -  ('opel', 36017)  /  1.81958519588
eigentlich  -  ('tdi', 33706)  /  1.9443422536
eigentlich  -  ('audi', 31021)  /  8.45053350956
eigentlich  -  ('mercedes', 30822)  /  8.50509376419
eigentlich  -  ('golf', 30370)  /  2.15791899901
eigentlich  -  ('2.0', 25774)  /  40.6834794754
eigentlich  -  ('benz', 24650)  /  2.65866125761
eigentlich  -  ('ford', 23843)  /  43.9783584281
eigentlich  -  ('dpf', 19994)  /  52.44453336
eigentlich  -  ('1.6', 19121)  /  54.8389728571
eigentlich  -  ('mit', 18965)  /  3.45562878988
eigentlich  -  ('renault', 15671)  /  4.18199221492
eigentlich  -  ('polo', 13389)  /  19.5790574352
eigentlich  -  ('klima', 13220)  /  4.95733736762
eigentlich  -  ('tuev', 12864)  /  20.3781094527
eigentlich  -  ('corsa', 12570)  /  83.4189339698
eigentlich  -  ('automatik', 11387)  /  23.0213401247
eigentlich  -  ('astra', 10790)  /  24.2950880445
eigentlic

eigentlich  -  ('multijet', 428)  /  612.485981308
eigentlich  -  ('service', 423)  /  38.7328605201
eigentlich  -  ('daewoo', 421)  /  622.669833729
eigentlich  -  ('automatic', 419)  /  156.410501193
eigentlich  -  ('102017', 419)  /  2502.56801909
eigentlich  -  ('xdrive', 414)  /  633.198067633
eigentlich  -  ('kamera', 412)  /  636.27184466
eigentlich  -  ('ducato', 411)  /  637.819951338
eigentlich  -  ('cayenne', 403)  /  162.620347395
eigentlich  -  ('schlachten', 402)  /  2608.39800995
eigentlich  -  ('alhambra', 393)  /  2668.13231552
eigentlich  -  ('vitara', 389)  /  168.473007712
eigentlich  -  ('syncro', 389)  /  673.892030848
eigentlich  -  ('reserviert', 388)  /  675.628865979
eigentlich  -  ('tueren', 388)  /  675.628865979
eigentlich  -  ('kadett', 386)  /  169.78238342
eigentlich  -  ('carrera', 386)  /  2716.51813472
eigentlich  -  ('112017', 386)  /  2716.51813472
eigentlich  -  ('anfaengerauto', 384)  /  10922.6666667
eigentlich  -  ('unfallwagen', 382)  /  10979.

('highline', 4221, 0.97038616441601522)

In [326]:
%%time
#should approximately take 10x44 secs = 7.5minutes
src_count_s = sorted([x for x in src_count if x[1] >= 10], key=lambda x:x[1], reverse=True)
src_best_subs = []
ts = []
ns = []
for (i,x) in enumerate(src_count):
    src_best_subs.append(word_subs(x, src_count_s))
    if i % 100 == 0:
        ts.append(time.perf_counter())
        ns.append(i)
        time_left = 0
        if i > 0:
            time_left = forecast_timeleft(ns, ts, len(src_count), alpha=0.01)
        print("Processed {0} items, run-time is {1:.3f}, {2:.3f} left".format(i, (ts[-1] - ts[0]), time_left),end='\r')

Processed 24900 items, run-time is 478.618, 697.381 leftt

KeyboardInterrupt: 

In [290]:
t = np.array(ts)
n = np.array(ns)
s = (t[1:]-t[:-1]) / (n[1:]-n[:-1])
xn = (np.max(n) - n) / (n[-1] - n[-2])
w = np.exp(-xn[1:] * 0.5)
avg_s = np.sum(s * w) / np.sum(w)
avg_s * 

In [294]:
forecast_time(ns, ts, 42000)

76.390739773632959