In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
# import sys
# !{sys.executable} -m pip install pandas seaborn

In [3]:
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

from collections import defaultdict
from itertools import groupby
from sklearn import datasets
from numpy import random
from scipy.stats import dirichlet, norm, poisson

In [4]:
from keras.datasets import reuters, imdb

Using TensorFlow backend.


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [6]:
import numpy as np
import os

In [7]:
from pathlib import Path
from collections import OrderedDict
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt

Path.ls = lambda x: list(x.iterdir())

## URSA Datasets

In [10]:
folder_ds_path = Path('./data/User Review Structure Analysis (URSA)/')
xml_path = (folder_ds_path/'Classified_Corpus.xml')
ds_path = (folder_ds_path/'10k')
sentence_npy_path = (folder_ds_path/'sentence.npy')
vocab_pkl_path = (folder_ds_path/'vocab.pkl')

# log words not pass
aspect_tags = ['Food', 'Staff', 'Ambience']
polatiry_tags = ['Positive', 'Negative', 'Neutral']
xml_review_tag = './/Review'
log_np = [[], [], []]

# length allowed sentences
length_allowed = [-1, -1, -1]
min_freq_allowed = -1

In [11]:
train_filename = 'train.txt.npy'
vocab_filename = 'vocab.pkl'

In [14]:
data_onehot = np.load((ds_path/train_filename), allow_pickle=True)

In [37]:
vocab = pickle.load(open((ds_path/vocab_filename), 'rb'))
vocab_size=len(vocab)

In [18]:
data_onehot.shape

(32362, 2)

In [19]:
data = data_onehot[:,0]

In [20]:
label = data_onehot[:,1]

In [21]:
data.shape

(32362,)

In [22]:
label.shape

(32362,)

## URSA Pre-process Data

In [9]:
import xml.etree.ElementTree as ET

In [10]:
def string_nested_xml(axml):
    return ' '.join([the_aiter for the_aiter in axml.itertext()])

def get_firstchild(axml):
    try:
        if len(axml.getchildren()) > 0:
            return axml.getchildren()[0].tag
        else:
            raise (Exception('ListIndex', 'aXmlElement input has no children.'))
    except Exception as e:
        print (str(e))

def xml_unique_valid(axml, alist_tag_allowed):
    return (len(axml.getchildren()) == 0) or (get_firstchild(axml) in alist_tag_allowed)

def xml_name_valid(axml, atag_name):
    return axml.tag == atag_name

In [11]:
def get_listsentence_unique(alist_xml, alist_tag_allowed):
    the_listsentence = []
    for the_axml in alist_xml:
        if xml_unique_valid(the_axml, alist_tag_allowed):
            the_listsentence.append(string_nested_xml(the_axml))

    return the_listsentence

In [12]:
def get_listxml_child(list_xml, tag):
    return_ = []
    for xml_ in list_xml:
        for xml_child in xml_:
            if xml_name_valid(xml_child, tag):
                return_.append(xml_child)

    return return_

In [13]:
def get_listxml_child_list(document_list, tag_list):
    return_ = []
    for tag in tag_list:
        xml_children = get_listxml_child(document_list, tag)
        return_.append(xml_children)
    
    return return_

In [14]:
def get_xml_unique_list(xml_children_list, polatiry_tags):
    return_ = []
    for xml_children in xml_children_list:
        xml_unique = get_listsentence_unique(xml_children, polatiry_tags)
        return_.append(xml_unique)
    
    return return_ 

In [15]:
corpus_tree = ET.parse(xml_path)
corpus_root = corpus_tree.getroot()

In [16]:
document_list = corpus_root.findall(xml_review_tag)

In [17]:
xml_children_list = get_listxml_child_list(document_list, aspect_tags)
for idx in range(0, len(xml_children_list)): print (len(xml_children_list[idx]))

96235
32892
16803


In [18]:
xml_unique_list = get_xml_unique_list(xml_children_list, polatiry_tags)
for idx in range(0, len(xml_unique_list)): print (len(xml_unique_list[idx]))

62348
23730
13385


In [19]:
xml_unique_list[0][0]

' The food is a melding of Moroccan comfort food and Spanish tapas fare : tagines , stews and salads , with surprises like baby eggplants and olives where you might not expect them . '

## Process Sentences

In [20]:
import nltk
import re

In [21]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/huylb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/huylb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
st = nltk.stem.porter.PorterStemmer()

In [23]:
def alphabet(atext):
    return re.sub("[^a-zA-Z]", " ", atext)

def liststopword():
    en_stopwords = list(nltk.corpus.stopwords.words("english"))
    additional_list = ["'s","...","'ve","``","''","'m",'--',"'ll","'d"]
    stopwords_ = set(en_stopwords + additional_list)
    return stopwords_

In [24]:
def process_sentence(sentence, sw):
    alphabet_ = alphabet(sentence)
    tokenized_ = nltk.word_tokenize(alphabet_.lower())
    stemmed_ = [st.stem(word) for word in tokenized_ if word not in sw]

    return (stemmed_, len(stemmed_))

In [25]:
def process_sentence_list(sentence_list, allowed_length, sw):
    np_log_ = []
    return_ = []
    for idx_, sentence_ in enumerate(sentence_list):
        processed_, length_ = process_sentence(sentence_, sw)
        if length_ > allowed_length:
            return_.append(processed_)
        else:
            np_log_.append(processed_)

    return return_, np_log_

In [26]:
def get_process_sentence_list(xml_list, length_allowed):
    return_ = []
    np_ = []
    sw = liststopword()
    for xml_, la_ in zip(xml_list, length_allowed):
        processed_, np_log_ = process_sentence_list(xml_, la_, sw)
        return_.append(processed_)
        np_.append(np_log_)
    
    return return_, np_

In [27]:
p_sentence_list, log_np = get_process_sentence_list(xml_unique_list, length_allowed)

In [28]:
for idx in range(0, len(p_sentence_list)): print (len(p_sentence_list[idx]))

62348
23730
13385


In [29]:
label_list = []
for idx in range(0, len(p_sentence_list)): label_list.append([idx] * len(p_sentence_list[idx]))

In [30]:
for idx in range(0, len(label_list)): print (len(label_list[idx]))

62348
23730
13385


## Create Vocab

In [31]:
def word_valid(aword):
    return aword not in [""," "]

def create_vocab_listsentence(alist_sentence, amin_freq_allowed):
    the_words = []
    for sentence_list_ in alist_sentence:
        for the_asentence in sentence_list_:
            for the_aword in the_asentence:
                the_words.append(the_aword)
        the_words_freq = nltk.FreqDist(the_words)
        the_vocab = []
        for the_aword, the_afreq in the_words_freq.items():
            if the_afreq > amin_freq_allowed:
                if word_valid(the_aword):
                    the_vocab.append(the_aword)

    the_vocab_sorted = sorted(the_vocab)
    #Assign a number corresponding to each word. Makes counting easier.
    the_vocab_sorted_dict = dict(zip(the_vocab_sorted, range(len(the_vocab_sorted))))
    return the_vocab_sorted, the_vocab_sorted_dict

In [32]:
vocab, vocab2id = create_vocab_listsentence(p_sentence_list, min_freq_allowed)

In [33]:
len(vocab)

31902

## Dataset

In [23]:
from sklearn.model_selection import StratifiedShuffleSplit

In [24]:
sss = StratifiedShuffleSplit(n_splits=2, test_size=0.9, random_state=0)
_, index_train_test = sss.split(data, label)

In [25]:
# from sklearn.model_selection import train_test_split

In [26]:
# x_, y_ = [], []
# for p_sentence, label_ in zip(p_sentence_list, label_list): 
#     x_.extend(p_sentence)
#     y_.extend(label_)

In [27]:
# len(x_) == len(y_)

In [29]:
# train_x, test_x, train_y, test_y =  train_test_split(
#     x_, y_, test_size=0.2, random_state=42)

In [47]:
train_x, train_y = data[index_train_test[0]], label[index_train_test[0]]
test_x, test_y = data[index_train_test[1]], label[index_train_test[1]]

In [49]:
print ('Data Loaded')
print ('Dim Training Data',train_x.shape[0], vocab_size)
print ('Dim Test Data',test_x.shape[0], vocab_size)

Data Loaded
Dim Training Data 3236 18073
Dim Test Data 29126 18073


## Constants

In [42]:
bs = 200
en1_units=100
en2_units=100
num_topic=3
num_input=vocab_size
variance=0.995
init_mult=1.0
learning_rate=0.002
batch_size=200
momentum=0.99
num_epoch=100
nogpu=True
drop_rate=0.2

## Topic Model Utility Functions

In [43]:
def listify(o):
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]
def setify(o): return o if isinstance(o,set) else set(listify(o))
def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x

In [44]:
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_ = tensor_te
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = tensor_te.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))
    
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_, _ = next(iter(test_dl))
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = input_.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))

def print_top_words(beta, feature_names, n_top_words=10):
    print ('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        line = " ".join([feature_names[j] 
                         for j in beta[i].argsort()[:-n_top_words - 1:-1]])
        print('{}'.format(line))
    print ('---------------End of Topics------------------')

## Data Utility Functions

In [45]:
def collate(b):
    x, y = zip(*b)
    return torch.stack(x), torch.stack(y)

class IdifyAndLimitedVocab():
    _order=-1
    def __init__(self, vocab2id, limited_vocab):
        self.vocab2id = vocab2id
        self.limited_vocab = limited_vocab
    def __call__(self, item):
        idlist = [self.vocab2id[w] for w in item if self.vocab2id[w] < limited_vocab]
        return np.array(idlist)
    

class Numpyify():
    _order=0
    def __call__(self, item):
        return np.array(item)

class Onehotify():
    _order=1
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
    def __call__(self, item):
        return np.array(np.bincount(item.astype('int'), minlength=self.vocab_size))
    
class YToOnehot():
    _order=1
    def __init__(self, num_classes):
        self.num_classes = num_classes
    def __call__(self, item):
        categorical = np.zeros((1, self.num_classes))
        categorical[0, item] = 1
        return categorical

class Tensorify():
    _order=2
    def __call__(self, item):
        return torch.from_numpy(item)

class Floatify():
    _order=3
    def __call__(self, item):
        return item.float()
    
class CheckAndCudify():
    _order=100
    def __init__(self):
        self.ic = torch.cuda.is_available()
    def __call__(self, item):
        return item.cuda() if self.ic else item
    
class URSADataset(Dataset):
    def __init__(self, x, y, tfms_x, tfms_y): 
        self.x, self.y = x, y
        self.x_tfms = tfms_x
        self.y_tfms = tfms_y
    def __len__(self): 
        return len(self.x)
    def __getitem__(self, i): 
        return compose(self.x[i], self.x_tfms), compose(self.y[i], self.y_tfms)
    
class Sampler():
    def __init__(self, ds, bs, shuffle=False):
        self.n,self.bs,self.shuffle = len(ds),bs,shuffle
        
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i+self.bs]

class DataLoader():
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds,self.sampler,self.collate_fn = ds,sampler,collate_fn
        
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])

## Load Data

In [69]:
vocab2id={v: k for k, v in vocab.items()}

In [94]:
vocab = vocab2id

In [70]:
num_classes = np.max(train_y) + 1

In [71]:
num_classes

3

In [84]:
# tfms_x = [IdifyAndLimitedVocab(vocab2id, vocab_size), Numpyify(), Onehotify(vocab_size=vocab_size), Tensorify(), Floatify(), CheckAndCudify()]
tfms_x = [Numpyify(), Onehotify(vocab_size=vocab_size), Tensorify(), Floatify(), CheckAndCudify()]
tfms_y = [YToOnehot(num_classes=num_classes), Tensorify(), Floatify(), CheckAndCudify()]

In [85]:
train_ds = URSADataset(train_x, train_y, tfms_x=tfms_x, tfms_y=tfms_y)
test_ds = URSADataset(test_x, test_y, tfms_x=tfms_x, tfms_y=tfms_y)

In [86]:
train_samp = Sampler(train_ds, bs, shuffle=False)
test_samp = Sampler(test_ds, bs, shuffle=False)

In [87]:
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
test_dl = DataLoader(test_ds, sampler=test_samp, collate_fn=collate)

## Define Model

In [88]:
def encoder(in_feature, hidden_feature1, hidden_feature2, drop_rate):
    return nn.Sequential(OrderedDict([
                ('linear1', nn.Linear(in_feature, hidden_feature1)),
                ('act1', nn.Softplus()),
                ('linear2', nn.Linear(hidden_feature1, hidden_feature2)),
                ('act2', nn.Softplus()),
                ('dropout', nn.Dropout(drop_rate))
            ]))

def decoder(in_feature, out_feature, drop_rate):
     return nn.Sequential(OrderedDict([
                ('act1', nn.Softmax(dim=-1)),
                ('dropout', nn.Dropout(drop_rate)),
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature)),
                ('act2', nn.Softmax(dim=-1))
            ]))

In [89]:
def hidden(in_feature, out_feature):
    return nn.Sequential(OrderedDict([
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature))
            ]))

In [95]:
class ProdLDA(nn.Module):
    def __init__(self, num_input, en1_units, en2_units, num_topic, drop_rate, init_mult):
        super(ProdLDA, self).__init__()
        self.num_input, self.en1_units, self.en2_units, \
        self.num_topic, self.drop_rate, self.init_mult = num_input, en1_units, en2_units, \
                                                            num_topic, drop_rate, init_mult
        # encoder
        self.en = encoder(num_input, en1_units, en2_units, drop_rate)
        self.mean = hidden(en2_units, num_topic)
        self.logvar = hidden(en2_units, num_topic)
        # decoder
        self.de = decoder(num_topic, num_input, drop_rate)
        # prior mean and variance as constant buffers
        self.prior_mean   = torch.Tensor(1, num_topic).fill_(0)
        self.prior_var    = torch.Tensor(1, num_topic).fill_(variance)
        self.prior_mean   = nn.Parameter(self.prior_mean, requires_grad=False)
        self.prior_var    = nn.Parameter(self.prior_var, requires_grad=False)
        self.prior_logvar = nn.Parameter(self.prior_var.log(), requires_grad=False)
        # initialize decoder weight
        if init_mult != 0:
            #std = 1. / math.sqrt( init_mult * (num_topic + num_input))
            self.de.linear.weight.data.uniform_(0, init_mult)
        # remove BN's scale parameters
        for component in [self.mean, self.logvar, self.de]:
            component.batchnorm.weight.requires_grad = False
            component.batchnorm.weight.fill_(1.0)

    def encode(self, input_):
        encoded = self.en(input_)
        posterior_mean = self.mean(encoded)
        posterior_logvar = self.logvar(encoded)
        return encoded, posterior_mean, posterior_logvar
    
    def decode(self, input_, posterior_mean, posterior_var):
        # take sample
        eps = input_.data.new().resize_as_(posterior_mean.data).normal_() # noise 
        z = posterior_mean + posterior_var.sqrt() * eps                   # reparameterization
        # do reconstruction
        recon = self.de(z)          # reconstructed distribution over vocabulary
        return recon
    
    def forward(self, input_, compute_loss=False, avg_loss=True):
        # compute posterior
        en2, posterior_mean, posterior_logvar = self.encode(input_) 
        posterior_var    = posterior_logvar.exp()
        
        recon = self.decode(input_, posterior_mean, posterior_var)
        if compute_loss:
            return recon, self.loss(input_, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input_, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        # NL
        NL  = -(input_ * (recon + 1e-10).log()).sum(1)
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017, 
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = self.prior_mean.expand_as(posterior_mean)
        prior_var    = self.prior_var.expand_as(posterior_mean)
        prior_logvar = self.prior_logvar.expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.num_topic)
        # loss
        loss = (NL + KLD)
        # in traiming mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean()
        else:
            return loss

## Train

In [100]:
model = ProdLDA(num_input, en1_units, en2_units, num_topic, drop_rate, init_mult)
optimizer = torch.optim.Adam(model.parameters(), learning_rate, betas=(momentum, 0.999))

In [101]:
if torch.cuda.is_available():
    model = model.cuda()

In [102]:
for epoch in range(num_epoch):
    loss_epoch = 0.0
    model.train()                    # switch to training mode
    for input_, _ in train_dl:
        recon, loss = model(input_, compute_loss=True)
        # optimize
        optimizer.zero_grad()        # clear previous gradients
        loss.backward()              # backprop
        optimizer.step()             # update parameters
        # report
        loss_epoch += loss.item()    # add loss to loss_epoch
    if epoch % 5 == 0:
        print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(input_)))
        emb = model.de.linear.weight.data.cpu().numpy().T
        print_top_words(emb, vocab, 50)
        print_perp(model)

Epoch 0, loss=62.29949972364638
---------------Printing the Topics------------------
hip accomplish deliveri robot roam load throughout treat u light pollack swallow includ goa xeo earn blame timer save pie dedic num huckleberri round honest throw head hoomoo cancel flank standbi lemoni brim array addict tast oeu ett buoy cram someli store pequena attitud chewi shortag bland birthday cabernet sure
bag strictli chosen evil ice broken danc amend driver restaur buco around normal ugh spoil surround vic made picki thailand novelti cram thick guess ethnic fukin manchego dinnerbrok long charg dissapoint dine sniffli skit buncha comfort meanwhil burbur finger capabl session fondli membran pc ele swordfish separetli folklor lemongrass suprem
breakfast horribl project outdoor jacqu rocco honest slice hipster antipasto wore skirt fruit dd forno transplant anyon h knudi understand shabu pair unexpect gamey duti jelli parmesean slow pack nowher scratch tomato advis line alon driven buttermilk casa

Epoch 40, loss=50.72814581129286
---------------Printing the Topics------------------
flavor veget roll top crispi boomer tender perfect music ravioli modern del oliv light beauti includ rib cheddar scallion seafood spici delic appl sauc ice rich chili filet insid larg delici bass soft goat steak bean thick gra homemad non cook noodl tasti hous num balanc easili chewi fall prosciutto
restaur great space sit locat typic someth chain paint food style keep dine menu small panel dumpl floor cram strictli decor level sushi ethnic diner parti beam two trek come size friend copper clad event energi jackson citi tell ba last around broken ehhhh mix emerg dish unfortun ambianc stranger
minut horribl manag slow brought peopl made point understand without empti back ful go water receiv n right attent hostess bartend bother deliv ask deliveri second order assum lip min everi wait custom behind treat reserv replac unattent guest find ensur wine move knudi beer promptli hour liquor entre duti
------

Epoch 80, loss=46.62566799587674
---------------Printing the Topics------------------
roll crispi boomer veget flavor top tender includ scallion gra cheddar noodl chili ravioli seafood filet perfect delic rib brais bass mushroom goat butteri soft bean thick appl oliv rich ice homemad spici crepe bed prosciutto sauc num slab spring delici tangi mignon polenta balanc good bland sausag tasti nut
food restaur great space sit locat dine chain decor paint hip clad ba typic panel small beam ethnic ehhhh floor strictli cram copper energi wooden trek diner comfort event citi level u someth real rugbi date cozi neat jackson ambianc student thailand sophist hang upper style bar tao throw bounc
back minut manag n horribl slow ful water made point empti without understand brought peopl hour receiv attent promptli ask bother bartend go lip wine hostess owner ten reserv readi mid treat guest wait unattent replac custom deliveri book ensur name second bday duti polit offer rd min equip regularli
-----

## Test

In [56]:
emb = model.de.linear.weight.data.cpu().numpy().T
print_top_words(emb, vocab)
print_perp(model)

---------------Printing the Topics------------------
beat bao bore breakfast annoyed camouflage blatantly bimbimbop adobada afternoon
adve aestethically ayurveda appeteasers accosted calamarie adventure appertizer aday beautyfull
bellini caribbean broaden bangersmash caters blander breasaola cercle alice admit
---------------End of Topics------------------
The approximated perplexity is:  inf
