In [1]:
import pickle
import time
from collections import Counter

import numpy as np
import pandas as pd
import tensorflow as tf

import modutils

# Input pickle; it is unpacked below as a (full_dict, full_sentences) pair.
src_file = '../DataSets/Quora/w2v_src_180115.pickle'
# Output pickle; the final cell writes (src_data, src_vocab_size) to it.
tfidf_file = '../DataSets/Quora/tfidf_src_180124.pickle'

In [2]:
# Read the pickled (full_dict, full_sentences) pair from src_file.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open(src_file, mode='rb') as f:
    full_dict, full_sentences = pickle.load(f)

In [3]:
%%time
# Total of the second field over every full_dict entry
# (presumably a per-word frequency count -- confirm against the producer).
totfreq = np.sum([entry[1] for entry in full_dict])
# One weight per entry, in full_dict order: log(total / entry count).
idfs = [np.log(totfreq / entry[1]) for entry in full_dict]

Wall time: 325 ms


In [4]:
# Cut full_sentences at its midpoint: p1 gets the first half, p2 the rest.
# With an odd length the extra element lands in p2 (floor division).
# Presumably p1[i] and p2[i] form one sentence pair -- confirm with the producer.
p1, p2 = (
    full_sentences[:len(full_sentences) // 2],
    full_sentences[len(full_sentences) // 2:],
)

In [5]:
def transform_tfidf(corpus, idfs, vocab_size=100, verbose=True):
    """Build a sparse TF-IDF representation for every sentence in ``corpus``.

    Each sentence is counted once, so the cost is proportional to the number
    of tokens rather than to ``vocab_size * number of tokens``.

    Parameters
    ----------
    corpus : sized sequence of sequences
        Sentences given as sequences of integer token ids.
    idfs : sequence
        Weight per token id; ``idfs[i]`` is used for token id ``i``.
    vocab_size : int
        Only token ids in ``range(vocab_size)`` are kept; all other tokens
        are ignored here (they still contribute to the sentence length).
    verbose : bool
        When true, print a progress counter and a final ``Complete`` line.

    Returns
    -------
    list of list of (token_id, weight)
        One inner list per sentence, in corpus order (also when
        ``vocab_size`` is 0, where every inner list is empty). Entries are
        sorted by ascending token id and
        ``weight = count / len(sentence) * idfs[token_id]``.
    """
    total = len(corpus)
    result = []
    for done, sentence in enumerate(corpus, start=1):
        counts = Counter(sentence)
        length = len(sentence)
        # An empty sentence has no counted tokens, so the division below is
        # never evaluated with length == 0.
        kept = sorted(tok for tok in counts if 0 <= tok < vocab_size)
        result.append([(tok, counts[tok] / length * idfs[tok]) for tok in kept])
        if verbose and done % 1000 == 0:
            print('{0}/{1}\t\t'.format(done, total), end='\r')
    if verbose:
        print('Complete  ')
    return result

def features_oov(p1, p2, vocab_size=100):
    """Compute overlap features on out-of-vocabulary tokens for sentence pairs.

    A token counts as out-of-vocabulary when its id is ``>= vocab_size``,
    i.e. exactly the ids that fall outside ``range(vocab_size)``.

    Parameters
    ----------
    p1, p2 : sequences of sequences
        Sentences as sequences of token ids; ``p1[i]`` is paired with
        ``p2[i]``. ``p2`` must be at least as long as ``p1``; surplus
        entries in ``p2`` are ignored.
    vocab_size : int
        Number of in-vocabulary token ids.

    Returns
    -------
    list of [int, int, number]
        For every pair: the size of the intersection of the two OOV token
        sets, the size of their union, and intersection / union (0 when the
        union is empty).
    """
    features = []
    for i, first in enumerate(p1):
        oov_first = {tok for tok in first if tok >= vocab_size}
        oov_second = {tok for tok in p2[i] if tok >= vocab_size}
        shared = len(oov_first & oov_second)
        combined = len(oov_first | oov_second)
        ratio = shared / combined if combined > 0 else 0
        features.append([shared, combined, ratio])
    return features

def features_all(p1, p2, idfs, vocab_size=100, verbose=True):
    """Compute every feature group for two aligned lists of sentences.

    Runs ``transform_tfidf`` on ``p1``, then on ``p2`` (forwarding ``idfs``,
    ``vocab_size`` and ``verbose`` to both), then ``features_oov`` on the
    pair of lists.

    Returns a 3-tuple: (tf-idf of p1, tf-idf of p2, OOV overlap features).
    """
    # Tuple elements are evaluated left to right, so the call order is
    # p1 transform, p2 transform, OOV features.
    return (
        transform_tfidf(p1, idfs, vocab_size=vocab_size, verbose=verbose),
        transform_tfidf(p2, idfs, vocab_size=vocab_size, verbose=verbose),
        features_oov(p1, p2, vocab_size=vocab_size),
    )

In [6]:
%%time
# Compute the features batch by batch; res collects one
# (tf-idf of p1, tf-idf of p2, OOV features) tuple per batch.
res = []
batch = 10000
step = 0
while step < len(p1):
    stop = step + batch
    # vocab_size here must match the src_vocab_size stored with the results.
    tmp = features_all(p1[step:stop], p2[step:stop], idfs, vocab_size=1000)
    res.append(tmp)
    step = stop
    # Clamp the counter so the final, partial batch does not report more
    # sentences than exist.
    print('Done {0} of {1}'.format(min(step, len(p1)), len(p1)))

Complete  
Complete  
Done 10000 of 242506
Complete  
Complete  
Done 20000 of 242506
Complete  
Complete  
Done 30000 of 242506
Complete  
Complete  
Done 40000 of 242506
Complete  
Complete  
Done 50000 of 242506
Complete  
Complete  
Done 60000 of 242506
Complete  
Complete  
Done 70000 of 242506
Complete  
Complete  
Done 80000 of 242506
Complete  
Complete  
Done 90000 of 242506
Complete  
Complete  
Done 100000 of 242506
Complete  
Complete  
Done 110000 of 242506
Complete  
Complete  
Done 120000 of 242506
Complete  
Complete  
Done 130000 of 242506
Complete  
Complete  
Done 140000 of 242506
Complete  
Complete  
Done 150000 of 242506
Complete  
Complete  
Done 160000 of 242506
Complete  
Complete  
Done 170000 of 242506
Complete  
Complete  
Done 180000 of 242506
Complete  
Complete  
Done 190000 of 242506
Complete  
Complete  
Done 200000 of 242506
Complete  
Complete  
Done 210000 of 242506
Complete  
Complete  
Done 220000 of 242506
Complete  
Complete  
Done 230000 of 2425

In [13]:
# Vocabulary size that is stored alongside the features.
src_vocab_size = 1000
# Concatenate the per-batch results into three flat lists.
src_p1, src_p2, src_foov = [], [], []
for x in res:
    src_p1.extend(x[0])
    src_p2.extend(x[1])
    src_foov.extend(x[2])

In [3]:
# One (p1 features, p2 features, OOV features) record per entry of src_p1.
src_data = [
    (first, src_p2[idx], src_foov[idx])
    for idx, first in enumerate(src_p1)
]

In [4]:
# Persist the feature records together with the vocabulary size they use.
with open(tfidf_file, mode='wb') as f:
    pickle.dump(
        (src_data, src_vocab_size),
        f,
    )