In [1]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from tqdm import tqdm
import nltk
import string
import math
from multiprocessing import Pool
from datasets import load_dataset
import numpy as np
import torch.nn as nn
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import format_yelp
from LossyCounter import LossyCounter
import pygtrie

In [2]:
eps = 1e-6
max_n = 1
lift_threshold = 10

In [3]:
dataset = load_dataset("yelp_review_full")
train_corpus = dataset['train']['text']
test_corpus = dataset['test']['text']

In [4]:
def tokenize_chunk(corpus, skip_words = None):
    token_corpus = []
    if skip_words is None:
        skip_words = set(string.punctuation)# + stopwords.words('english'))
    for raw_text in corpus:
        formatted_text = format_yelp.format_text(raw_text).lower()
        word_list = [word for word in word_tokenize(formatted_text) if word not in skip_words]
        #word_list = [word for word in word_list if word not in skip_words]
        token_corpus.append(word_list)
    return token_corpus

In [5]:
def tokenize(corpus, skip_words = None, chunk_size = 1):
    L = len(corpus)
    N = math.ceil(L/chunk_size)
    pool = Pool()
    result = []
    with tqdm(total = N, position=0, leave=True) as pbar:
        for r in pool.imap(tokenize_chunk,[corpus[i:i+chunk_size] for i in range(0,L,chunk_size)]):
            result += r
            pbar.update()
    pool.close()
    return result

In [6]:
train_token_corpus = tokenize(train_corpus)
test_token_corpus = tokenize(test_corpus)

100%|█████████████████████████████████| 650000/650000 [01:47<00:00, 6046.35it/s]
100%|███████████████████████████████████| 50000/50000 [00:08<00:00, 6237.94it/s]


In [7]:
def get_word_freq(word_corpus, eps):
    lc = LossyCounter(eps)
    for text in tqdm(word_corpus):
        lc.cache(text)
    lc.flush()
    lc.prune()
    word_list = lc.getFreqItems()
    approx_count = lc.getCounts(word_list, 'median')
    result = {}
    total = lc.total
    sorted_word_list = sorted(word_list, 
                        key = approx_count.get, 
                        reverse = True)
    for word in sorted_word_list:
        result[word] = approx_count[word]/total
    return result

In [8]:
word_freq = get_word_freq(train_token_corpus, eps)

100%|████████████████████████████████| 650000/650000 [00:09<00:00, 68970.03it/s]


In [9]:
def get_ngram_freq(corpus, n, eps, batch_size=10):
    lc = LossyCounter(eps)
    L = len(corpus)
    for i in tqdm(range(0,L,batch_size)):
        chunk = []
        for text in corpus[i:i+batch_size]:
            T = len(text)
            chunk += ['_'.join(text[i:i+n]) for i in range(T-n+1)]
        lc.cache(chunk)
    lc.flush()
    lc.prune()
    ngram_list = lc.getFreqItems()
    approx_count = lc.getCounts(ngram_list, 'median')
    result = {}
    total = lc.total
    sorted_ngram_list = sorted(ngram_list, 
                        key = approx_count.get, 
                        reverse = True)
    for ngram in sorted_ngram_list:
        result[ngram] = approx_count[ngram]/total
    return result

In [10]:
ngram_freq_pool = word_freq.copy()
for n in range(2,max_n+1):
    ngram_freq = get_ngram_freq(train_token_corpus, n, eps)
    ngram_freq_pool = ngram_freq_pool | ngram_freq

In [11]:
def get_baseline_prob(phrase, ngram_freq, eps = 1e-7):
    words = phrase.split('_')
    n = len(words)
    baseline = 0
    for i in range(1,n):
        part1 = '_'.join(words[:i])
        if part1 not in ngram_freq and i > 1:
            continue
        prob1 = ngram_freq.get(part1, eps)
        part2 = '_'.join(words[i:])
        if part2 not in ngram_freq and i < n-1:
            continue
        prob2 = ngram_freq.get(part2, eps)
        prob = prob1*prob2
        if prob > baseline:
            baseline = prob
    return baseline

In [12]:
def expand_vocab(vocab, ngram_freq, scores, threshold = 100):
    for ngram in ngram_freq:
        if ngram in vocab:
            continue
        baseline = get_baseline_prob(ngram, ngram_freq)
        if baseline == 0:
            continue
        if ngram_freq[ngram] > threshold * baseline:
            vocab[ngram] = ngram_freq[ngram]
            scores[ngram] = np.log(ngram_freq[ngram]/baseline)

In [13]:
vocab = word_freq.copy()
scores = {}
expand_vocab(vocab, ngram_freq_pool, scores,lift_threshold)

In [14]:
keys = sorted(scores, key = scores.get, reverse=True)

In [15]:
def prefix_decode(text, t):
    #t = pygtrie.Trie()
    #for c in codebook:
    #    t[c.split('_')] = codebook[c]
    result = []
    parsed_text = []
    i = 0
    L = len(text)
    while i < L:
        key, value = t.longest_prefix(text[i:])
        if key is None:
            parsed_text.append(text[i])
            i += 1
            result.append('UNKNOWN')
        else:
            parsed_text.append(key)
            i += len(key)
            result.append(value)
    return result

In [16]:
t = pygtrie.Trie()
for c in vocab:
    t[c.split('_')] = c

In [17]:
r = []
for text in tqdm(train_token_corpus):
    r.append(prefix_decode(text,t))
rtest = []
for text in tqdm(test_token_corpus):
    rtest.append(prefix_decode(text,t))

100%|█████████████████████████████████| 650000/650000 [07:44<00:00, 1400.70it/s]
100%|███████████████████████████████████| 50000/50000 [00:32<00:00, 1549.18it/s]


In [18]:
from gensim.models import Word2Vec

In [19]:
model = Word2Vec(sentences=r, vector_size=300, window=5, min_count=1, workers=12, epochs = 20)

In [21]:
model.wv.most_similar('bad', topn=10)

[('terrible', 0.7135301828384399),
 ('horrible', 0.6551201343536377),
 ('good', 0.6434661746025085),
 ('awful', 0.6165274977684021),
 ('stellar', 0.5785626173019409),
 ('lousy', 0.5498121976852417),
 ('poor', 0.5445555448532104),
 ('horrid', 0.5273641347885132),
 ('crappy', 0.5091362595558167),
 ('sub-par', 0.5053234100341797)]

In [22]:
length = {}
for x in vocab:
    length[x] = len(x.split('_'))
length['UNKNOWN'] = 1

In [23]:
def embed(phrases, wv, a=1e-4):
    vector = [wv[x]*length[x] for x in phrases if x in wv]
    if len(vector) == 0:
        return np.zeros(300)
    result = np.sum(vector,axis=0) / np.sum([length[x] for x in phrases if x in wv],axis=0)
    return result

In [24]:
embedding = []
label = []
for i in tqdm(range(len(r))):
    embedding.append(embed(r[i], model.wv))
    #label.append(dataset['train']['label'][i])

100%|█████████████████████████████████| 650000/650000 [05:04<00:00, 2135.70it/s]


In [25]:
test_embed = []
for i in tqdm(range(len(rtest))):
    test_embed.append(embed(rtest[i], model.wv))
    #label.append(dataset['train']['label'][i])

100%|███████████████████████████████████| 50000/50000 [00:23<00:00, 2141.23it/s]


In [26]:
import lightgbm as lgb

In [27]:
Xtrain = embedding[:500000]
ytrain = dataset['train']['label'][:500000]
#Xvalid1 = np.array(embedding[450000:500000])
#yvalid1 = dataset['train']['label'][450000:500000]
Xvalid2 = np.array(embedding[500000:550000])
yvalid2 = dataset['train']['label'][500000:550000]

In [28]:
bcls = lgb.LGBMClassifier(num_leaves = 31, learning_rate = 0.1, n_estimators=1000)
bcls.fit(Xtrain, ytrain, eval_set = (Xvalid2, yvalid2))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.245357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 300
[LightGBM] [Info] Start training from score -1.614501
[LightGBM] [Info] Start training from score -1.590587
[LightGBM] [Info] Start training from score -1.564837
[LightGBM] [Info] Start training from score -1.581492
[LightGBM] [Info] Start training from score -1.701476


In [29]:
Xtest = np.array(test_embed)
ytest = np.array(dataset['test']['label'])

In [30]:
best_iter = np.argmin(bcls.evals_result_['valid_0']['multi_logloss'])
ytesthat = bcls.predict(Xtest, num_iteration = best_iter)

In [31]:
best_iter

991

In [32]:
np.mean(ytest==ytesthat)

0.5645

In [38]:
index2 = np.where(np.array(ytrain) != 2)[0]
index2valid = np.where(np.array(yvalid2) != 2)[0]
Xtrain2 = np.array(Xtrain)[index2]
ytrain2 = (np.array(ytrain)[index2]>2).astype(int)
Xvalid22 = np.array(Xvalid2)[index2valid]
yvalid22 = (np.array(yvalid2)[index2valid]>2).astype(int)
bcls2 = lgb.LGBMClassifier(num_leaves = 31, learning_rate = 0.1, n_estimators=2000)
bcls2.fit(Xtrain2, ytrain2, eval_set = (Xvalid22, yvalid22))

[LightGBM] [Info] Number of positive: 194041, number of negative: 201398
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.332774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 395439, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490698 -> initscore=-0.037214
[LightGBM] [Info] Start training from score -0.037214


In [39]:
Xtest = np.array(test_embed)
ytest = np.array(dataset['test']['label'])
index2test = np.where(np.array(ytest) != 2)[0]
Xtest2 = Xtest[index2test]
ytest2 = (np.array(ytest)[index2test]>2).astype(int)

In [40]:
best_iter = np.argmin(bcls2.evals_result_['valid_0']['binary_logloss'])
ytesthat = bcls2.predict(Xtest2, num_iteration = best_iter)

In [41]:
np.mean(ytest2==ytesthat)

0.919125

In [42]:
best_iter

1964

In [43]:
word_vectors = model.wv
word_vectors.save("word2vec_lift10.wordvectors")