In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

from collections import defaultdict
from itertools import groupby
from sklearn import datasets
from numpy import random
from scipy.stats import dirichlet, norm, poisson

In [3]:
# from keras.datasets import reuters, imdb

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [5]:
import numpy as np
import os

In [6]:
from pathlib import Path
from collections import OrderedDict
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt

# Convenience monkey-patch: gives every Path an `ls()` method returning the directory entries as a list.
Path.ls = lambda x: list(x.iterdir())

## URSA Datasets 10K 

In [50]:
# Locations of the URSA corpus XML and of the files that live next to it.
folder_ds_path = Path('./data/User Review Structure Analysis (URSA)/')
xml_path = (folder_ds_path/'Classified_Corpus.xml')
ds_path = (folder_ds_path/'10k')
sentence_npy_path = (folder_ds_path/'sentence.npy')
vocab_pkl_path = (folder_ds_path/'vocab.pkl')
seed_words_path = (ds_path/'seed_words.txt')

# Tag names searched for in the XML. `log_np` holds, per aspect, the processed
# sentences that did not pass the length filter (it is re-bound further down by
# get_process_sentence_list).
aspect_tags = ['Food', 'Staff', 'Ambience']
polatiry_tags = ['Positive', 'Negative', 'Neutral']
xml_review_tag = './/Review'
log_np = [[], [], []]

# Per-aspect length threshold: a sentence is kept only when it has MORE than
# this many stemmed tokens (one entry per aspect, same order as aspect_tags).
length_allowed = [11, 7, 4]
# A word enters the vocabulary when its frequency is strictly greater than this
# value, so -1 keeps every word.
min_freq_allowed = -1

## URSA Pre-process Data

In [9]:
import xml.etree.ElementTree as ET

In [10]:
def string_nested_xml(axml):
    """Return every piece of text inside `axml` (nested elements included), joined by single spaces."""
    text_fragments = axml.itertext()
    return ' '.join(text_fragments)

def get_firstchild(axml):
    """Return the tag of the first child of `axml`.

    When the element has no children a diagnostic line is printed and None is
    returned (the function never raises for that case).

    `Element.getchildren()` was removed in Python 3.9, so the element is
    queried through its sequence interface (`len` / indexing) instead.
    """
    if len(axml) == 0:
        # Same text the original printed via str(Exception(...)).
        print("('ListIndex', 'aXmlElement input has no children.')")
        return None
    return axml[0].tag

def xml_unique_valid(axml, alist_tag_allowed):
    """Return True when `axml` has no children, or when its first child's tag is in `alist_tag_allowed`.

    Uses the element's sequence interface because `Element.getchildren()` was
    removed in Python 3.9.
    """
    if len(axml) == 0:
        return True
    return axml[0].tag in alist_tag_allowed

def xml_name_valid(axml, atag_name):
    """Return True when the element's tag equals `atag_name`."""
    tag_matches = (axml.tag == atag_name)
    return tag_matches

In [11]:
def get_listsentence_unique(alist_xml, alist_tag_allowed):
    """Return the flattened text of every element in `alist_xml` that passes `xml_unique_valid`."""
    return [
        string_nested_xml(element)
        for element in alist_xml
        if xml_unique_valid(element, alist_tag_allowed)
    ]

In [12]:
def get_listxml_child(list_xml, tag):
    """Collect, in document order, the direct children of each element in `list_xml` whose tag is `tag`."""
    return [
        child
        for parent in list_xml
        for child in parent
        if xml_name_valid(child, tag)
    ]

In [13]:
def get_listxml_child_list(document_list, tag_list):
    """Return one list per tag in `tag_list`, each produced by `get_listxml_child(document_list, tag)`."""
    return [get_listxml_child(document_list, tag) for tag in tag_list]

In [14]:
def get_xml_unique_list(xml_children_list, polatiry_tags):
    """Apply `get_listsentence_unique` to every element list in `xml_children_list`, keeping the order."""
    return [
        get_listsentence_unique(elements, polatiry_tags)
        for elements in xml_children_list
    ]

In [15]:
# Parse the corpus XML file and keep a handle on its root element.
corpus_tree = ET.parse(xml_path)
corpus_root = corpus_tree.getroot()

In [16]:
# Every element matching `xml_review_tag` ('.//Review') under the root.
document_list = corpus_root.findall(xml_review_tag)

In [17]:
# One list per aspect tag with the matching direct children of the reviews; print each list's size.
xml_children_list = get_listxml_child_list(document_list, aspect_tags)
for idx in range(0, len(xml_children_list)): print (len(xml_children_list[idx]))

96235
32892
16803


In [18]:
# Per aspect, the flattened text of the elements that are childless or whose first child
# is a polarity tag; print how many sentences remain for each aspect.
xml_unique_list = get_xml_unique_list(xml_children_list, polatiry_tags)
for idx in range(0, len(xml_unique_list)): print (len(xml_unique_list[idx]))

62348
23730
13385


In [19]:
xml_unique_list[0][0]

' The food is a melding of Moroccan comfort food and Spanish tapas fare : tagines , stews and salads , with surprises like baby eggplants and olives where you might not expect them . '

## Process Sentences

In [20]:
import nltk
import re

In [21]:
# Fetch the NLTK resources used below: the stopword corpus is read by liststopword();
# 'punkt' is presumably what nltk.word_tokenize relies on — confirm for the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/huylb314/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/huylb314/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# Module-level Porter stemmer; process_sentence() reads it as a global.
st = nltk.stem.porter.PorterStemmer()

In [23]:
def alphabet(atext):
    """Replace every character that is not an ASCII letter with a single space."""
    non_letter = re.compile("[^a-zA-Z]")
    return non_letter.sub(" ", atext)

def liststopword():
    """Return the set of NLTK English stopwords extended with a few contraction/punctuation tokens."""
    extra_tokens = ["'s","...","'ve","``","''","'m",'--',"'ll","'d"]
    stopword_set = set(nltk.corpus.stopwords.words("english"))
    stopword_set.update(extra_tokens)
    return stopword_set

In [24]:
def process_sentence(sentence, sw):
    """Clean, lowercase, tokenize and stem one sentence.

    Non-letters are blanked out with `alphabet`, the text is lowercased and
    tokenized with NLTK, tokens found in `sw` are dropped, and the rest are
    stemmed with the global stemmer `st`.

    Returns:
        (stems, len(stems))
    """
    lowered = alphabet(sentence).lower()
    stems = []
    for token in nltk.word_tokenize(lowered):
        if token in sw:
            continue
        stems.append(st.stem(token))

    return (stems, len(stems))

In [25]:
def process_sentence_list(sentence_list, allowed_length, sw):
    """Process every sentence and split the results by length.

    Returns:
        (kept, rejected): `kept` holds the processed sentences with strictly
        more than `allowed_length` tokens, `rejected` holds all the others.
    """
    kept, rejected = [], []
    for sentence in sentence_list:
        tokens, n_tokens = process_sentence(sentence, sw)
        bucket = kept if n_tokens > allowed_length else rejected
        bucket.append(tokens)

    return kept, rejected

In [26]:
def get_process_sentence_list(xml_list, length_allowed):
    """Run `process_sentence_list` on each group of sentences with its own length threshold.

    `xml_list` and `length_allowed` are walked in parallel (zip), and the
    stopword set is built once and shared by all groups.

    Returns:
        (kept_per_group, rejected_per_group), two lists aligned with the input groups.
    """
    sw = liststopword()
    per_group = [
        process_sentence_list(sentences, min_length, sw)
        for sentences, min_length in zip(xml_list, length_allowed)
    ]
    kept_per_group = [kept for kept, _ in per_group]
    rejected_per_group = [rejected for _, rejected in per_group]

    return kept_per_group, rejected_per_group

In [27]:
# Processed sentences per aspect, plus (in log_np) the ones rejected by the length filter.
p_sentence_list, log_np = get_process_sentence_list(xml_unique_list, length_allowed)

In [28]:
for idx in range(0, len(p_sentence_list)): print (len(p_sentence_list[idx]))

11547
10175
10640


In [29]:
# Label every sentence with the index of the aspect group it came from.
label_list = []
for idx in range(0, len(p_sentence_list)): label_list.append([idx] * len(p_sentence_list[idx]))

In [30]:
for idx in range(0, len(label_list)): print (len(label_list[idx]))

11547
10175
10640


## Create Vocab

In [31]:
def word_valid(aword):
    """Return False for the empty string and for a single space, True for anything else."""
    return not (aword == "" or aword == " ")

def create_vocab_listsentence(alist_sentence, amin_freq_allowed):
    """Build a sorted vocabulary and a word -> id mapping from groups of tokenized sentences.

    Args:
        alist_sentence: iterable of groups; each group is an iterable of
            sentences; each sentence is an iterable of word strings.
        amin_freq_allowed: a word is kept only when its total frequency over
            all groups is strictly greater than this value.

    Returns:
        (vocab_sorted, vocab_dict): the alphabetically sorted list of kept
        words and a dict mapping each word to its index in that list.

    The frequency table is now built once, after all groups are read (it used
    to be rebuilt inside the outer loop), and empty input yields `([], {})`
    instead of failing on an unbound local.
    """
    # Local import keeps the function self-contained; Counter provides the same
    # word -> count mapping the frequency table was used for.
    from collections import Counter

    word_counts = Counter(
        word
        for sentence_group in alist_sentence
        for sentence in sentence_group
        for word in sentence
    )

    the_vocab_sorted = sorted(
        word
        for word, freq in word_counts.items()
        if freq > amin_freq_allowed and word_valid(word)
    )
    # Assign a number corresponding to each word. Makes counting easier.
    the_vocab_sorted_dict = {word: idx for idx, word in enumerate(the_vocab_sorted)}
    return the_vocab_sorted, the_vocab_sorted_dict

In [32]:
# Sorted vocabulary over all aspects and its word -> index mapping.
vocab, vocab2id = create_vocab_listsentence(p_sentence_list, min_freq_allowed)

In [34]:
# Number of distinct words; used below as the model's input width.
vocab_size = len(vocab)

## Dataset

In [35]:
from sklearn.model_selection import train_test_split

In [36]:
# Flatten the per-aspect groups into one list of sentences (x_) and a parallel list of labels (y_).
x_, y_ = [], []
for p_sentence, label_ in zip(p_sentence_list, label_list): 
    x_.extend(p_sentence)
    y_.extend(label_)

In [37]:
len(x_) == len(y_)

True

In [38]:
# Seeded random split. NOTE(review): test_size=0.9 sends 90% of the sentences to the test
# set and leaves only 10% for training — confirm this proportion is intended.
train_x, test_x, train_y, test_y =  train_test_split(
    x_, y_, test_size=0.9, random_state=0)

In [39]:
print ('Data Loaded')
print ('Dim Training Data',len(train_x), vocab_size)
print ('Dim Test Data', len(test_x), vocab_size)

Data Loaded
Dim Training Data 3236 18073
Dim Test Data 29126 18073


## Constants

In [40]:
# Batch size handed to the Sampler objects.
bs = 200
# Widths of the two encoder layers.
en1_units=100
en2_units=100
# Number of latent topics.
num_topic=3
# Model input width = vocabulary size.
num_input=vocab_size
# Prior variance; ProdLDA.__init__ reads this module-level name directly.
variance=0.995
# Upper bound of the uniform re-initialisation of the decoder weights (0 disables it).
init_mult=1.0
learning_rate=0.002
# First dimension of the `gamma_bin` tensors. NOTE(review): duplicates `bs` above; keep the two in sync.
batch_size=200
# Passed to Adam as the first beta coefficient.
momentum=0.99
num_epoch=100
# NOTE(review): not referenced anywhere in the visible notebook.
nogpu=True
drop_rate=0.2

## Topic Model Utility Functions

In [51]:
def read_file_seed_words(fn):
    """Read a seed-word file: one comma-separated list of words per line.

    Newline characters are removed from each line before splitting on ','.

    Returns:
        A list with one list of words per line of the file.
    """
    with open(fn, "r") as fr:
        return [line.replace('\n','').split(',') for line in fr]

In [52]:
# One list of seed words per line of the seed-word file.
seed_words = read_file_seed_words(seed_words_path)

In [91]:
# (vocab_size, num_topic) matrix, filled further down with 1.0 where a word is a seed word of a topic.
gamma = np.zeros((vocab_size, num_topic))

In [92]:
gamma.shape

(18073, 3)

In [93]:
# (batch_size, vocab_size, num_topic) array, filled further down with 1.0 at the rows of seed words.
gamma_bin = np.zeros((batch_size, vocab_size, num_topic))

In [94]:
gamma_bin.shape

(200, 18073, 3)

In [95]:
len(seed_words[1])

17

In [96]:
seed_words[1]

['staff',
 'servic',
 'friendli',
 'rude',
 'hostess',
 'waiter',
 'bartend',
 'waitress',
 'help',
 'polit',
 'bar',
 'courteou',
 'member',
 'waitstaff',
 'attitud',
 'reserv',
 'tip']

In [97]:
# Mark the seed words: gamma gets a 1 at (word, topic); gamma_bin gets a 1 for that word
# in every batch row and every topic column.
# A seed word missing from vocab2id raises KeyError here.
for k in range(len(seed_words)): # number of topics
    for idx in range(len(seed_words[k])): # number of words
        ivocab = vocab2id[seed_words[k][idx]]
        gamma[ivocab, k] = 1.0
        gamma_bin[:, ivocab, :] = 1.0

In [121]:
def setup_prior(fn, n_k=3):
    """Build the seed-word indicator tensors from the seed-word file `fn`.

    Reads the notebook globals `vocab`, `vocab2id` and `batch_size`.

    Returns:
        (gamma, gamma_bin): `gamma` is (len(vocab), n_k) with 1.0 where the
        word is a seed word of that line/topic; `gamma_bin` is
        (batch_size, len(vocab), n_k) with 1.0 across all batch rows and all
        topic columns for every seed word.
    """
    n_words = len(vocab)
    gamma = torch.zeros((n_words, n_k))
    gamma_bin = torch.zeros((batch_size, n_words, n_k))

    for topic_idx, topic_words in enumerate(read_file_seed_words(fn)):
        for word in topic_words:
            word_id = vocab2id[word]
            gamma[word_id, topic_idx] = 1.0
            gamma_bin[:, word_id, :] = 1.0

    return (gamma, gamma_bin)

In [122]:
def listify(o):
    """Coerce `o` to a list.

    None -> [], a list is returned as-is, a str is wrapped in a one-element
    list, any other iterable is materialised with list(), and everything else
    is wrapped in a one-element list.
    """
    # `Iterable` was never imported in this notebook, so the isinstance check
    # below raised NameError for any non-list, non-str argument.
    from collections.abc import Iterable
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]
def setify(o):
    """Return `o` unchanged when it is already a set, otherwise a set built from `listify(o)`."""
    if isinstance(o, set):
        return o
    return set(listify(o))
def compose(x, funcs, *args, order_key='_order', **kwargs):
    """Apply `funcs` to `x` one after another, sorted by their `order_key` attribute.

    Functions without that attribute sort as 0. Each call receives the running
    value plus `**kwargs`; extra positional `*args` are accepted but not
    forwarded.
    """
    def sort_key(func):
        return getattr(func, order_key, 0)

    ordered = sorted(listify(funcs), key=sort_key)
    for func in ordered:
        x = func(x, **kwargs)
    return x

In [123]:
def print_perp(model):
    """Print an approximate perplexity computed on the first batch of `test_dl`.

    The model is switched to eval mode (and left there), the per-document loss
    is divided by the document's total word count, averaged over the batch and
    exponentiated.

    This cell used to define `print_perp` twice; the first definition was
    shadowed by the second and read an undefined `tensor_te`. Only the
    effective (second) behaviour is kept.
    """
    model.eval()                        # switch to testing mode
    input_, _ = next(iter(test_dl))
    # No gradients are needed for evaluation.
    with torch.no_grad():
        _, loss = model(input_, compute_loss=True, avg_loss=False)
    counts = input_.sum(1)              # total word count of each document
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))

def print_top_words(beta, feature_names, n_top_words=10):
    """Print, one line per row of `beta`, the `n_top_words` feature names with the largest weights.

    Names are printed from the highest weight down, separated by spaces, and
    the listing is framed by a header and a footer line.
    """
    print ('---------------Printing the Topics------------------')
    for topic_weights in beta:
        # argsort is ascending, so take the last n_top_words indices in reverse.
        top_ids = topic_weights.argsort()[:-n_top_words - 1:-1]
        print(" ".join(feature_names[j] for j in top_ids))
    print ('---------------End of Topics------------------')

## Data Utility Functions

In [124]:
def collate(b):
    """Turn a list of (x, y) tensor pairs into a pair of stacked tensors (xs, ys)."""
    xs, ys = zip(*b)
    stacked_x = torch.stack(xs)
    stacked_y = torch.stack(ys)
    return stacked_x, stacked_y

class IdifyAndLimitedVocab():
    """Transform step: map words to ids via `vocab2id`, keeping only ids below `limited_vocab`.

    A word missing from `vocab2id` raises KeyError.
    """
    _order=-1
    def __init__(self, vocab2id, limited_vocab):
        self.vocab2id, self.limited_vocab = vocab2id, limited_vocab
    def __call__(self, item):
        word_ids = (self.vocab2id[word] for word in item)
        return np.array([word_id for word_id in word_ids if word_id < self.limited_vocab])
    

class Numpyify():
    """Transform step: convert the item to a NumPy array."""
    _order=0
    def __call__(self, item):
        as_array = np.array(item)
        return as_array

class Onehotify():
    """Transform step: turn an array of word ids into a count vector of length >= `vocab_size`.

    Despite the name, repeated ids are counted (np.bincount), not clipped to 1.
    """
    _order=1
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
    def __call__(self, item):
        word_ids = item.astype('int')
        counts = np.bincount(word_ids, minlength=self.vocab_size)
        return np.array(counts)
    
class YToOnehot():
    """Transform step: encode a class index as a (1, num_classes) one-hot float array."""
    _order=1
    def __init__(self, num_classes):
        self.num_classes = num_classes
    def __call__(self, item):
        one_hot = np.zeros((1, self.num_classes))
        first_row = one_hot[0]      # view into one_hot, so the write below lands in it
        first_row[item] = 1
        return one_hot

class Tensorify():
    """Transform step: wrap a NumPy array as a torch tensor via torch.from_numpy."""
    _order=2
    def __call__(self, item):
        as_tensor = torch.from_numpy(item)
        return as_tensor

class Floatify():
    """Transform step: cast the tensor with `.float()`."""
    _order=3
    def __call__(self, item):
        as_float = item.float()
        return as_float
    
class CheckAndCudify():
    """Transform step: move the item to CUDA when CUDA was available at construction time."""
    _order=100
    def __init__(self):
        # Availability is checked once, here, and cached.
        self.ic = torch.cuda.is_available()
    def __call__(self, item):
        if self.ic:
            return item.cuda()
        return item
    
class URSADataset(Dataset):
    """Dataset over parallel sequences `x` and `y` that applies transform pipelines on access.

    `tfms_x` / `tfms_y` are run through `compose` each time an item is fetched.
    """
    def __init__(self, x, y, tfms_x, tfms_y):
        self.x = x
        self.y = y
        self.x_tfms, self.y_tfms = tfms_x, tfms_y
    def __len__(self):
        return len(self.x)
    def __getitem__(self, i):
        sample = compose(self.x[i], self.x_tfms)
        target = compose(self.y[i], self.y_tfms)
        return sample, target
    
class Sampler():
    """Yield index tensors of at most `bs` items covering `range(len(ds))`.

    With `shuffle=True` a fresh random permutation is drawn on every iteration;
    otherwise indices come in ascending order. The last batch may be shorter.
    """
    def __init__(self, ds, bs, shuffle=False):
        self.n = len(ds)
        self.bs = bs
        self.shuffle = shuffle

    def __iter__(self):
        if self.shuffle:
            self.idxs = torch.randperm(self.n)
        else:
            self.idxs = torch.arange(self.n)
        for start in range(0, self.n, self.bs):
            yield self.idxs[start:start+self.bs]

class DataLoader():
    """Iterate over `ds` in the batches produced by `sampler`, merging each batch with `collate_fn`."""
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds = ds
        self.sampler = sampler
        self.collate_fn = collate_fn

    def __iter__(self):
        for batch_idxs in self.sampler:
            samples = [self.ds[i] for i in batch_idxs]
            yield self.collate_fn(samples)

## Load Data

In [125]:
# Labels are 0-based group indices, so the class count is the largest training label + 1.
num_classes = np.max(train_y) + 1

In [126]:
num_classes

3

In [127]:
# Transform pipelines; `compose` sorts the steps by their `_order` attribute before applying them.
# x: words -> ids -> count vector -> float tensor (-> CUDA if available).
# y: class index -> (1, num_classes) one-hot -> float tensor (-> CUDA if available).
tfms_x = [IdifyAndLimitedVocab(vocab2id, vocab_size), Numpyify(), Onehotify(vocab_size=vocab_size), Tensorify(), Floatify(), CheckAndCudify()]
tfms_y = [YToOnehot(num_classes=num_classes), Tensorify(), Floatify(), CheckAndCudify()]

In [128]:
# Train and test datasets share the same transform pipelines.
train_ds = URSADataset(train_x, train_y, tfms_x=tfms_x, tfms_y=tfms_y)
test_ds = URSADataset(test_x, test_y, tfms_x=tfms_x, tfms_y=tfms_y)

In [129]:
# Sequential batches of `bs` indices. NOTE(review): the training sampler is not shuffled
# either — confirm that a fixed batch order is intended.
train_samp = Sampler(train_ds, bs, shuffle=False)
test_samp = Sampler(test_ds, bs, shuffle=False)

In [130]:
# Batch iterators; `test_dl` is also read as a global by print_perp().
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
test_dl = DataLoader(test_ds, sampler=test_samp, collate_fn=collate)

In [131]:
# Grab the first training batch (inputs only) for shape inspection below.
abatchx, _  = next(iter(train_dl))

In [132]:
abatchx.size()

torch.Size([200, 18073])

In [133]:
# Re-binds `gamma` / `gamma_bin` (NumPy arrays above) to the torch tensors built by setup_prior.
gamma, gamma_bin = setup_prior(seed_words_path, 3)

In [135]:
gamma_bin.size()

torch.Size([200, 18073, 3])

In [136]:
# Boolean mask of the words present in each document, with a trailing axis of size 1
# so it broadcasts against gamma_bin's topic axis.
t = (abatchx > 0).unsqueeze(dim=-1)

In [137]:
t.size()

torch.Size([200, 18073, 1])

In [140]:
((gamma_bin == 1) & t).size()

torch.Size([200, 18073, 3])

## Define Model

In [49]:
def encoder(in_feature, hidden_feature1, hidden_feature2, drop_rate):
    """Build the encoder: Linear -> Softplus -> Linear -> Softplus -> Dropout.

    Sub-modules are named 'linear1', 'act1', 'linear2', 'act2', 'dropout'.
    """
    layers = OrderedDict()
    layers['linear1'] = nn.Linear(in_feature, hidden_feature1)
    layers['act1'] = nn.Softplus()
    layers['linear2'] = nn.Linear(hidden_feature1, hidden_feature2)
    layers['act2'] = nn.Softplus()
    layers['dropout'] = nn.Dropout(drop_rate)
    return nn.Sequential(layers)

def decoder(in_feature, out_feature, drop_rate):
    """Build the decoder: Softmax -> Dropout -> Linear -> BatchNorm1d -> Softmax.

    Sub-modules are named 'act1', 'dropout', 'linear', 'batchnorm', 'act2';
    both softmaxes act on the last dimension.
    """
    layers = OrderedDict()
    layers['act1'] = nn.Softmax(dim=-1)
    layers['dropout'] = nn.Dropout(drop_rate)
    layers['linear'] = nn.Linear(in_feature, out_feature)
    layers['batchnorm'] = nn.BatchNorm1d(out_feature)
    layers['act2'] = nn.Softmax(dim=-1)
    return nn.Sequential(layers)

In [50]:
def hidden(in_feature, out_feature):
    """Build a Linear -> BatchNorm1d head with sub-modules named 'linear' and 'batchnorm'."""
    layers = OrderedDict()
    layers['linear'] = nn.Linear(in_feature, out_feature)
    layers['batchnorm'] = nn.BatchNorm1d(out_feature)
    return nn.Sequential(layers)

In [111]:
class ProdLDA(nn.Module):
    """Variational auto-encoder topic model in the style of ProdLDA.

    The encoder maps a batch of word-count vectors to the mean and
    log-variance of a Gaussian over `num_topic` latent dimensions; the decoder
    turns a reparameterised sample into a distribution over the vocabulary.

    Args:
        num_input: vocabulary size (input and output width).
        en1_units: width of the first encoder layer.
        en2_units: width of the second encoder layer.
        num_topic: number of latent topics.
        drop_rate: dropout probability used in the encoder and decoder.
        init_mult: when non-zero, the decoder's linear weights are
            re-initialised uniformly between 0 and `init_mult`.

    NOTE(review): the prior variance is read from the notebook-level name
    `variance`, not from a constructor argument.
    """
    def __init__(self, num_input, en1_units, en2_units, num_topic, drop_rate, init_mult):
        super().__init__()
        self.num_input, self.en1_units, self.en2_units = num_input, en1_units, en2_units
        self.num_topic, self.drop_rate, self.init_mult = num_topic, drop_rate, init_mult
        # encoder
        self.en = encoder(num_input, en1_units, en2_units, drop_rate)
        self.mean = hidden(en2_units, num_topic)
        self.logvar = hidden(en2_units, num_topic)
        # decoder
        self.de = decoder(num_topic, num_input, drop_rate)
        # prior mean and variance, stored as non-trainable parameters
        self.prior_mean   = nn.Parameter(torch.Tensor(1, num_topic).fill_(0), requires_grad=False)
        self.prior_var    = nn.Parameter(torch.Tensor(1, num_topic).fill_(variance), requires_grad=False)
        self.prior_logvar = nn.Parameter(self.prior_var.log(), requires_grad=False)
        # initialize decoder weight
        if init_mult != 0:
            #std = 1. / math.sqrt( init_mult * (num_topic + num_input))
            self.de.linear.weight.data.uniform_(0, init_mult)
        # remove BN's scale parameters: freeze them at 1.0
        for component in [self.mean, self.logvar, self.de]:
            component.batchnorm.weight.requires_grad = False
            component.batchnorm.weight.fill_(1.0)

    def gamma(self):
        """Push every vocabulary word through the encoder weights and print diagnostics.

        `W1.t() + b1` is what the first linear layer outputs for an identity
        matrix input, i.e. one row per one-hot vocabulary word. The result is
        taken through the second layer, dropout, the mean / log-variance
        heads (batch-norm applied with the stored running statistics, without
        its affine parameters) and a softmax over topics.

        Meant to run after `self.encode`, presumably so the running
        statistics have been updated.

        Returns:
            (wo_mean, wo_logvar), each of shape (num_input, num_topic).
        """
        encoder_w1 = self.en.linear1.weight
        encoder_b1 = self.en.linear1.bias
        encoder_w2 = self.en.linear2.weight
        encoder_b2 = self.en.linear2.bias
        mean_w = self.mean.linear.weight
        mean_b = self.mean.linear.bias
        mean_running_mean = self.mean.batchnorm.running_mean
        mean_running_var = self.mean.batchnorm.running_var
        logvar_w = self.logvar.linear.weight
        logvar_b = self.logvar.linear.bias
        logvar_running_mean = self.logvar.batchnorm.running_mean
        logvar_running_var = self.logvar.batchnorm.running_var

        w1 = F.softplus(encoder_w1.t() + encoder_b1)
        w2 = F.softplus(F.linear(w1, encoder_w2, encoder_b2))
        # F.dropout defaults to training=True; follow the module's mode so that
        # dropout is switched off after model.eval(), like the nn.Dropout layers.
        wdr = F.dropout(w2, self.drop_rate, training=self.training)
        wo_mean = F.softmax(F.batch_norm(F.linear(wdr, mean_w, mean_b), mean_running_mean, mean_running_var), dim=-1)
        wo_logvar = F.softmax(F.batch_norm(F.linear(wdr, logvar_w, logvar_b), logvar_running_mean, logvar_running_var), dim=-1)

        print ("wo_mean: {}".format(wo_mean[0]))
        print ("gamma_mean: {}".format(wo_mean.size()))
        print ("gamma_logvar: {}".format(wo_logvar.size()))
        return wo_mean, wo_logvar

    def encode(self, input_):
        """Return (encoder output, posterior mean, posterior log-variance) for a batch."""
        encoded = self.en(input_)
        posterior_mean = self.mean(encoded)
        posterior_logvar = self.logvar(encoded)
        return encoded, posterior_mean, posterior_logvar

    def decode(self, input_, posterior_mean, posterior_var):
        """Sample z with the reparameterisation trick and decode it.

        `input_` is only used to create the noise tensor with a matching
        type/device.
        """
        # take sample
        eps = input_.data.new().resize_as_(posterior_mean.data).normal_() # noise
        z = posterior_mean + posterior_var.sqrt() * eps                   # reparameterization
        # do reconstruction
        recon = self.de(z)          # reconstructed distribution over vocabulary
        return recon

    def forward(self, input_, compute_loss=False, avg_loss=True):
        """Reconstruct `input_`; with `compute_loss=True` return `(recon, loss)` instead of `recon`.

        `avg_loss` selects between the batch-mean loss and per-document losses.
        """
        # compute posterior
        en2, posterior_mean, posterior_logvar = self.encode(input_)
        posterior_var    = posterior_logvar.exp()

        recon = self.decode(input_, posterior_mean, posterior_var)
        if compute_loss:
            return recon, self.loss(input_, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input_, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        """Negative log-likelihood of the counts plus the KL term to the fixed Gaussian prior.

        Returns the mean over the batch when `avg` is True, otherwise one loss
        value per document.
        """
        # NL (1e-10 keeps the log finite when a probability is zero)
        NL  = -(input_ * (recon + 1e-10).log()).sum(1)
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017,
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = self.prior_mean.expand_as(posterior_mean)
        prior_var    = self.prior_var.expand_as(posterior_mean)
        prior_logvar = self.prior_logvar.expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.num_topic)
        # loss
        loss = (NL + KLD)

        # NOTE(review): the return value is discarded, so this call only
        # produces the diagnostic prints on every loss computation.
        self.gamma()

        # in training mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean()
        else:
            return loss

## Train

In [112]:
# Build the model and an Adam optimiser; `momentum` is used as Adam's first beta coefficient.
model = ProdLDA(num_input, en1_units, en2_units, num_topic, drop_rate, init_mult)
optimizer = torch.optim.Adam(model.parameters(), learning_rate, betas=(momentum, 0.999))

In [113]:
# Move the model to the GPU when one is available (the data transforms do the same check).
if torch.cuda.is_available():
    model = model.cuda()

In [114]:
# Training loop.
# NOTE(review): the two `break` statements stop the loop after the first batch of the first
# epoch, so as written exactly one optimisation step runs — presumably left in for debugging.
for epoch in range(num_epoch):
    loss_epoch = 0.0
    model.train()                    # switch to training mode
    for input_, _ in train_dl:
        recon, loss = model(input_, compute_loss=True)
        # optimize
        optimizer.zero_grad()        # clear previous gradients
        loss.backward()              # backprop
        optimizer.step()             # update parameters
        # report
        loss_epoch += loss.item()    # add loss to loss_epoch
        break
    if epoch % 5 == 0:
        # NOTE(review): loss_epoch accumulates batch-mean losses but is divided by the size of
        # the last batch seen, not by the number of batches — confirm the intended normalisation.
        print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(input_)))
        # Decoder weight transposed to one row per topic, one column per vocabulary word.
        emb = model.de.linear.weight.data.detach().cpu().numpy().T
        print_top_words(emb, vocab, 50)
        print_perp(model)
    break

wo_mean: tensor([0.2799, 0.2894, 0.4306], grad_fn=<SelectBackward>)
gamma_mean: torch.Size([18073, 3])
gamma_logvar: torch.Size([18073, 3])
Epoch 0, loss=0.6498851013183594
---------------Printing the Topics------------------
staff steak outsid entre foodist cole hardcor monro boat amaron testicl lousi frenchwoman proffer jame whereupon deconstruct peynirli reat chindian at authit deferenti specjal windowsil lumpia pseudo huachinango kanzuri monther serrano fixin blond child pleasingli cutesi lanter savarin valrhona kassel atmo across sate place teow poppyse squab ragout ottaman pack
occas girl ask mosaic waaaaaaay pumpkin fest strongest cobb repeat gianduja blase woodburn bogart banxiao decent elect morsel krystal snarlli chowder waitservic roti forno zarela mei modernist absoulut pointless wizard vp rel complai heidelberg robitussin je stormi aris bocca atrium ill spinkl lancast unrel wrought duct guayabera jd bib manhattan
asparagu return yell ny bellavita papardel curv sea refer lc

## Test

In [78]:
# Report the 50 highest-weighted words per topic from the decoder weights, then the
# approximate perplexity on the first test batch.
emb = model.de.linear.weight.data.cpu().numpy().T
print_top_words(emb, vocab, 50)
print_perp(model)

---------------Printing the Topics------------------
coconut bland stew rib shrimp green tasti fresh yellowtail veget oyster wrap ravioli meati brioch simpl sashimi assort ingredi mackerel app rabbit ginger chewi salad carpaccio serv beef roll tast vanilla benedict foie grandma standout yuzu size spice cream pickl walnut naan rice stuf mixtur dri mango asparagu tart cheddar
time look sens luger design arch abund nobl pub sop ambienc iron seven limit bistro cement squar suburb travel rule new chandeli tall scandinavian happili empir hesser mahogani eas mathur downtown brick reminisc wallac calm noisi bento eleg tablecloth slope dine predecessor dissappear frustat irish undul regardless swoop interior ivori
us would romant next beverag kept came accomod unaccommod somewher sweater staf brusqu atleast crowd retro friend cozier paid waitress warm wew stool trunk left children great farmhous nobodi boutiq oxiou decemb bra endless event comfort heat bangalor half romanit impati ismoodi memor