In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns

from collections import defaultdict
from itertools import groupby
from sklearn import datasets
from numpy import random
from scipy.stats import dirichlet, norm, poisson

In [3]:
from keras.datasets import reuters, imdb

Using TensorFlow backend.


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [5]:
import numpy as np
import os

In [6]:
from pathlib import Path
from collections import OrderedDict
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt

Path.ls = lambda x: list(x.iterdir())

## Animal Farm Datasets

### Animal Generator

words, topics, and documents
animal and where they live
where they spend the majority of their time - land, air, sea. 
Obviously there are some animals that only dwell in one place; a cow only lives on land and a fish only lives in the sea. However, there are other animals, such as some birds, that split their time between land, sea, and air.

topics(land, sea, air): a distribution of where they live.
If I quantify these probabilities into a distribution over all the animals (words) for each type of habitat (land,sea, air - topics) I can use them to generate sets of animals (words) to populate a given location (document) which may contain a mix of land, sea, and air (topics).

So let’s move on to generating a specific location. We know that different locations will vary in terms of which habitats are present. For example, a beach contains land, sea, and air, but some areas inland may only contain air and land like a desert. We can define the mixture of these types of habitats in each location. For example, a beach is 1/3 land, 1/3 sea, and 1/3 air. 

In [7]:
vocab = ['🐋','🐳','🐟','🐠','🐙','🦀','🐊','🐢','🐍','🐓','🦃','🐦','🐧','🐿','🐘','🐂','🐑','🐪']
topic_labels = ['land', 'sea', 'air']
beta = [[0.  , 0.  , 0.  , 0.  , 0.  , 0.05, 0.05, 0.05, 0.05, 0.1 , 0.1 ,
        0.05, 0.05, 0.1 , 0.1 , 0.1 , 0.1 , 0.1 ], # land
       [0.12, 0.12, 0.12, 0.12, 0.12, 0.12, 0.06, 0.06, 0.06, 0.  , 0.  ,
        0.00, 0.1, 0.  , 0.  , 0.  , 0.  , 0.  ], # sea
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        1.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ]] # air

In [8]:
# a quick note: sorted function to make similar words to group together

In [9]:
# a document is generated from a topic

# generate document from 0.3, 0.3, 0.34 for land, sea, air respectively
topic_mixture_proportion = [0.33, 0.34, 0.33]
beta_distributions = beta
mu = 10

docs = defaultdict(str)
for doc_index in range(10):
    topic = random.choice(topic_labels, 
                            p=topic_mixture_proportion)
    doc_len = int(poisson(mu=mu).rvs(size=1))
    doc = [random.choice(vocab, 
                    p=beta_distributions[topic_labels.index(topic)]) 
                  for _ in range(doc_len)]
    doc = sorted(doc)
    print("Document {:02d} (topic {}): {}".format(doc_index, topic, ''.join(doc)))

Document 00 (topic air): 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
Document 01 (topic sea): 🐋🐋🐍🐙🐙🐟🐟🐠🐧🐳🐳
Document 02 (topic land): 🐂🐂🐑🐑🐓🐓🐘🐘🦃🦃
Document 03 (topic land): 🐂🐊🐍🐓🐓🐘🐘🐦🐧🐪🦀🦃
Document 04 (topic sea): 🐋🐍🐍🐍🐙🐙🐟🐠🐠🐢🐧🐧
Document 05 (topic land): 🐊🐊🐑🐓🐘🐘🐦🦃🦃
Document 06 (topic air): 🐦🐦🐦🐦🐦
Document 07 (topic sea): 🐊🐋🐋🐍🐍🐍🐙🐙🐙🐙🐙🐙🐙🐟🐢🐧🐧🐧🐳🐳🦀
Document 08 (topic land): 🐂🐑🐓🐓🐓🐘🐪🐿🐿🐿🐿🦀🦀
Document 09 (topic air): 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦


In [10]:
num_clusters = len(topic_labels)
topic_mixture_proportion = dirichlet(alpha=num_clusters*[1]).rvs()[0]
# print (topic_mixture_proportion)

num_vocabs = len(vocab)
beta_distributions = (dirichlet(alpha=num_vocabs*[1])
                         .rvs(size=num_clusters))
# print (beta_distributions)

docs = defaultdict(str)
for doc_index in range(10):
    topic = random.choice(topic_labels, 
                            p=topic_mixture_proportion)
    doc_len = int(poisson(mu=mu).rvs(size=1))
    doc = [random.choice(vocab, 
                    p=beta_distributions[topic_labels.index(topic)]) 
                  for _ in range(doc_len)]
    doc = sorted(doc)
    print("Document {:02d} (topic {}): {}".format(doc_index, topic, ''.join(doc)))

Document 00 (topic land): 🐂🐘🐟🐠🐢🐧🐳🐳🐳
Document 01 (topic sea): 🐍🐍🐑🐓🐙🐟🐦🦃
Document 02 (topic sea): 🐍🐓🐓🐙🐙🐟🐢🐢🐦🐧🦀🦃
Document 03 (topic air): 🐋🐍🐢🐦🐧🐪🐳
Document 04 (topic air): 🐂🐍🐓🐧🐧🐧
Document 05 (topic air): 🐂🐂🐊🐦🐧🐧🐪
Document 06 (topic sea): 🐓🐙🐙🐙🐙🐦🐳🦃
Document 07 (topic air): 🐊🐋🐋🐘🐙🐠🐧🐧🐧🐳
Document 08 (topic air): 🐂🐂🐊🐊🐋🐋🐍🐍🐧🐧🐪🐪🐳🐳
Document 09 (topic air): 🐊🐋🐋🐧🐪🐳


In [11]:
beta_distributions = beta
num_docs = 2000

docs = defaultdict(lambda: defaultdict(str))
for doc_index in range(num_docs):
#     doc_len = poisson(mu=10).rvs(size=1)
    doc_len = 50
    topic_mixture_proportion = dirichlet([0.33, 0.33, 0.34]).rvs()[0]
    
    for word in range(doc_len):
        topic = random.choice(topic_labels, 
                                 p=topic_mixture_proportion)
        word_dist = beta_distributions[topic_labels.index(topic)]
        word = random.choice(vocab, 
                            p=word_dist)
        docs[doc_index][topic] += word
        
        docs[doc_index][topic] = ''.join(sorted(docs[doc_index][topic]))

In [12]:
documents = []

for doc_id, doc in docs.items():
    temp_doc = []
    print("Doc {:02d}".format(doc_id))
    for topic, words in doc.items():
        print("  Words from topic {}: {}".format(topic, words))
        temp_doc.extend(words)
    documents.append(temp_doc)

Doc 00
  Words from topic sea: 🐊🐊🐋🐋🐋🐍🐍🐍🐙🐙🐟🐠🐠🐠🐠🐢🐢🐧🐧🐧🐧🐧🐳🐳🐳🐳🐳🐳🦀🦀🦀🦀🦀🦀
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐓
Doc 01
  Words from topic sea: 🐊🐊🐊🐊🐋🐋🐋🐋🐋🐋🐍🐙🐙🐙🐙🐙🐟🐟🐟🐟🐟🐟🐟🐠🐠🐠🐠🐠🐢🐢🐧🐳🐳🐳🐳🐳🦀🦀🦀🦀🦀🦀🦀
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦
Doc 02
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐍🐑🐑🐑🐑🐓🐓🐓🐘🐘🐧🐧🐪🐪
  Words from topic sea: 🦀
Doc 03
  Words from topic sea: 🐊🐊🐋🐋🐋🐙🐟🐠🐠🐠🐢🐧🐳🦀🦀🦀
  Words from topic land: 🐂🐂🐍🐍🐑🐑🐓🐓🐓🐘🐘🐢🐪🐿🐿🐿🦃
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
Doc 04
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐢
Doc 05
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐊🐑🦀
Doc 06
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐊🐑🐘🐢🐦🐧🐧🐪🐿🐿🐿🐿🦀🦃🦃🦃🦃
Doc 07
  Words from topic land: 🐂🐂🐑🐑🐓🐘🐢🐢🐦🐧🐿🐿
  Words from topic sea: 🐊🐊🐋🐋🐋🐍🐍🐍🐍🐙🐙🐙🐙🐙🐙🐙🐟🐟🐟🐟🐠🐠🐠🐢🐢🐢🐢🐧🐧🐧🐧🐳🐳🐳🦀🦀
  Words from topic air: 🐦🐦
Doc 08
  Words from topic sea: 🐊🐊🐋🐋🐋🐋🐋

  Words from topic air: 🐦🐦
Doc 398
  Words from topic land: 🐂🐂🐂🐂🐊🐊🐊🐊🐍🐍🐑🐑🐑🐑🐓🐓🐓🐓🐓🐘🐘🐘🐘🐢🐢🐢🐢🐦🐦🐧🐪🐪🐪🐪🐪🐪🐪🐪🐪🐿🐿🐿🦀🦃🦃🦃🦃
  Words from topic sea: 🐋🐙🐧
Doc 399
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐂🐊🐊🐊🐍🐍🐑🐑🐓🐓🐓🐓🐘🐘🐘🐘🐢🐢🐦🐦🐧🐧🐪🐪🐿🐿🦃🦃
  Words from topic sea: 🐋🐟🐠🐳
Doc 400
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐓🐓🐦🐪🐪🐪
Doc 401
  Words from topic land: 🐂🐂🐂🐊🐊🐍🐍🐑🐑🐑🐑🐑🐓🐓🐘🐘🐘🐘🐘🐘🐘🐢🐢🐢🐢🐦🐦🐦🐦🐧🐧🐪🐪🐪🐿🐿🐿🐿🦀🦃🦃🦃🦃🦃
  Words from topic air: 🐦🐦🐦🐦🐦🐦
Doc 402
  Words from topic land: 🐂🐂🐂🐂🐂🐊🐊🐊🐊🐍🐍🐍🐑🐑🐑🐑🐑🐑🐑🐓🐓🐓🐓🐓🐘🐘🐢🐢🐢🐧🐧🐧🐪🐪🐪🐪🐪🐪🐿🐿🐿🐿🐿🦀🦀🦀🦀🦃🦃
  Words from topic air: 🐦
Doc 403
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐋🐋🐍🐙🐙🐙🐙🐟🐟🐟🐠🐠🐢🐢🐧🐳🦀🦀🦀🦀🦀🦀
  Words from topic land: 🐑🐑
Doc 404
  Words from topic land: 🐂🐂🐂🐂🐂🐊🐑🐑🐓🐓🐘🐘🐘🐘🐢🐢🐢🐢🐦🐦🐦🐧🐪🐪🐿🐿🐿🦀🦀🦀🦃🦃
  Words from topic sea: 🐋🐋🐋🐍🐙🐟🐠🐠🐢🐢🐢🐧🐧🐳🐳🦀🦀
  Words from topic air: 🐦
Doc 405
  Words from topic land: 🐂🐂🐂🐂🐂🐂🐊🐊🐊🐍🐍🐍🐍🐍🐑🐑🐑🐑🐓🐓🐘🐘🐘🐘🐘🐘🐘🐢🐢🐦🐦🐦🐦🐧🐧🐪🐪🐪🐿🐿🦀🦀🦃🦃🦃🦃🦃
  Words from topic air: 🐦🐦🐦
Doc 406
  Words from topic

  Words from topic air: 🐦🐦🐦🐦🐦
Doc 685
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐋🐙🐟🐟🐠🐠🐧🐧🐧🐳🐳🦀
Doc 686
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐊🐊🐍🐙🐟🐟🐠🐠🐠🐠🐢🐧🐧🐧🐳🐳🐳🦀
Doc 687
  Words from topic land: 🐂🐂🐊🐍🐑🐑🐑🐑🐑🐑🐑🐑🐑🐑🐓🐓🐓🐓🐓🐓🐘🐘🐘🐘🐦🐦🐧🐧🐪🐪🐪🐿🦀🦀🦀🦀🦀🦃🦃🦃
  Words from topic sea: 🐋🐋🐙🐟🐢🐢🐳🦀🦀🦀
Doc 688
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐂🐍🐑🐑🐓🐓🐧🐧🐪🐿🦃
Doc 689
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐊🐋🐋🐍🐙🐙🐟🐟🐟🐠🐢🐢🐧🐳🐳🦀🦀
  Words from topic land: 🐂🐍🐘🐪🐿🐿
Doc 690
  Words from topic sea: 🐊🐊🐊🐋🐋🐋🐋🐋🐋🐍🐍🐍🐙🐙🐙🐙🐙🐙🐟🐟🐠🐠🐠🐠🐠🐠🐢🐢🐢🐢🐢🐧🐧🐧🐧🐧🐧🐧🐳🐳🐳🦀🦀🦀🦀🦀🦀🦀
  Words from topic land: 🐂🐓
Doc 691
  Words from topic land: 🐂🐂🐊🐊🐑🐓🐓🐓🐓🐓🐓🐘🐘🐢🐦🐦🐦🐦🐧🐧🐧🐪🐪🦃🦃🦃🦃🦃
  Words from topic sea: 🐊🐋🐋🐋🐍🐍🐙🐙🐟🐟🐟🐟🐟🐠🐢🐢🐧🐳🐳🐳🦀🦀
Doc 692
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐊🐍🐓🐓🐘🐘🐘🐪🐿🦃
Doc 693
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Word

  Words from topic sea: 🐊🐊🐊🐊🐋🐋🐋🐋🐋🐋🐋🐋🐋🐍🐍🐍🐙🐙🐙🐙🐙🐟🐟🐠🐠🐠🐠🐠🐠🐠🐠🐠🐠🐢🐢🐧🐧🐧🐧🐧🐧🐳🐳🐳🐳🐳🐳🦀
  Words from topic land: 🐿🐿
Doc 1296
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐍🐠🐧🐳
Doc 1297
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐋🐋🐍🐙🐙🐙🐟🐟🐠🐠🐠🐧🦀
  Words from topic land: 🐑🐘
Doc 1298
  Words from topic sea: 🐊🐊🐊🐋🐍🐍🐙🐙🐟🐟🐟🐠🐠🐢🐢🐢🐢🐧🐧🐧🐧🐳🐳🐳🦀🦀🦀🦀🦀🦀🦀🦀
  Words from topic land: 🐂🐂🐂🐂🐍🐍🐑🐑🐓🐦🐪🐿🐿🦀🦃🦃🦃
  Words from topic air: 🐦
Doc 1299
  Words from topic land: 🐂🐂🐂🐂🐂🐂🐂🐊🐊🐊🐊🐍🐑🐑🐓🐓🐓🐘🐘🐘🐘🐘🐢🐦🐧🐧🐧🐧🐧🐧🐪🐪🐪🐪🐪🐪🐪🐪🐿🐿🐿🦀🦃🦃🦃🦃🦃🦃
  Words from topic sea: 🐊🐙
Doc 1300
  Words from topic land: 🐍🐍🐍🐑🐑🐑🐑🐑🐓🐓🐓🐓🐘🐘🐘🐘🐘🐘🐢🐢🐢🐦🐦🐦🐦🐧🐧🐧🐪🐪🐪🐪🐪🐿🐿🐿🐿🐿🐿🐿🦀🦃🦃🦃🦃
  Words from topic sea: 🐊🐳
  Words from topic air: 🐦🐦🐦
Doc 1301
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐟🐠🐠🐢🐧🐧🐳
Doc 1302
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐙🐟🐠🐳🐳🐳
Doc 1303
  Words from topic land: 🐂🐂🐑🐦🐧🐧🐧🦃
  Words from t

  Words from topic land: 🐂🐍🐍🐑🐑🐑🐑🐑🐑🐑🐑🐓🐓🐓🐓🐓🐓🐓🐓🐘🐘🐘🐘🐘🐘🐘🐘🐘🐢🐧🐧🐧🐧🐧🐪🐪🐿🐿🐿🐿🦀🦀🦃🦃🦃🦃🦃🦃🦃
  Words from topic air: 🐦
Doc 1720
  Words from topic sea: 🐊🐊🐋🐋🐍🐍🐍🐍🐍🐙🐙🐙🐙🐟🐟🐟🐟🐟🐟🐟🐠🐠🐠🐠🐠🐢🐢🐢🐢🐢🐢🐧🐧🐧🐳🐳🐳🐳🦀🦀🦀🦀🦀
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦
Doc 1721
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐂🐑🐓🐘🐢🐪🐿🐿🦀🦃
Doc 1722
  Words from topic land: 🐂🐂🐑🐑🐓🐘🐘🐘🐢🐢🐧🐧🐪🐿🐿🦀🦃🦃
  Words from topic sea: 🐊🐊🐋🐋🐋🐋🐋🐍🐙🐙🐙🐙🐙🐟🐟🐟🐟🐠🐠🐠🐢🐧🐳🐳🐳🦀🦀🦀🦀🦀🦀
  Words from topic air: 🐦
Doc 1723
  Words from topic land: 🐂🐂🐊🐊🐑🐑🐑🐑🐑🐑🐑🐑🐓🐓🐓🐓🐘🐘🐘🐦🐦🐧🐧🐪🐪🦀🦃
  Words from topic sea: 🐊🐊🐋🐋🐋🐋🐋🐋🐍🐙🐟🐟🐟🐢🐧🐧🐳🐳🐳
  Words from topic air: 🐦🐦🐦🐦
Doc 1724
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic land: 🐦🦀
Doc 1725
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
  Words from topic sea: 🐋🐟🐟🐳🐳
Doc 1726
  Words from topic sea: 🐋🐍🐠
  Words from topic land: 🐂🐂🐂🐂🐂🐊🐑🐑🐓🐓🐘🐘🐘🐘🐘🐢🐦🐦🐧🐪🐪🐪🐪🐿🐿🐿🐿🐿🐿🦃🦃🦃🦃🦃🦃
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦
Doc 1727
  Words from topic air: 🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦🐦

## Animal Farm Preprocess Data

In [13]:
document_list = documents

In [14]:
' '.join(document_list[0])

'🐊 🐊 🐋 🐋 🐋 🐍 🐍 🐍 🐙 🐙 🐟 🐠 🐠 🐠 🐠 🐢 🐢 🐧 🐧 🐧 🐧 🐧 🐳 🐳 🐳 🐳 🐳 🐳 🦀 🦀 🦀 🦀 🦀 🦀 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐦 🐓'

## Create Vocab

In [15]:
import nltk

In [16]:
def word_valid(aword):
    return aword not in [""," "]

def create_vocab(alist_sentence, amin_freq_allowed):
    the_words = []
    for sentence_list_ in alist_sentence:
        for the_asentence in sentence_list_:
            for the_aword in the_asentence:
                the_words.append(the_aword)
        the_words_freq = nltk.FreqDist(the_words)
        the_vocab = []
        for the_aword, the_afreq in the_words_freq.items():
            if the_afreq > amin_freq_allowed:
                if word_valid(the_aword):
                    the_vocab.append(the_aword)

    the_vocab_sorted = sorted(the_vocab)
    #Assign a number corresponding to each word. Makes counting easier.
    the_vocab_sorted_dict = dict(zip(the_vocab_sorted, range(len(the_vocab_sorted))))
    return the_vocab_sorted, the_vocab_sorted_dict

In [17]:
vocab, vocab2id = create_vocab(document_list, 10)

In [18]:
len(vocab)

18

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
train_x, test_x, _, _ =  train_test_split(
    document_list, document_list, test_size=0.2, random_state=42)

## Constants

In [21]:
limited_vocab = len(vocab)

In [22]:
bs = 200
en1_units=100
en2_units=100
num_topic=3
num_input=limited_vocab
variance=0.995
init_mult=1.0
learning_rate=0.002
batch_size=200
momentum=0.99
num_epoch=1000
nogpu=True
drop_rate=0.2

## Topic Model Utility Functions

In [23]:
def listify(o):
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]
def setify(o): return o if isinstance(o,set) else set(listify(o))
def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x

In [24]:
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_ = tensor_te
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = tensor_te.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))
    
def print_perp(model):
    cost=[]
    model.eval()                        # switch to testing mode
    input_ = next(iter(test_dl))
    recon, loss = model(input_, compute_loss=True, avg_loss=False)
    loss = loss.data
    counts = input_.sum(1)
    avg = (loss / counts).mean()
    print('The approximated perplexity is: ', math.exp(avg))

def print_top_words(beta, feature_names, n_top_words=10):
    print ('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        line = " ".join([feature_names[j] 
                         for j in beta[i].argsort()[:-n_top_words - 1:-1]])
        print('{}'.format(line))
    print ('---------------End of Topics------------------')

## Data Utility Functions

In [25]:
# def collate(b):
#     x, y = zip(*b)
#     return torch.stack(x), torch.stack(y)

def collate(b):
    return torch.stack(b)

class IdifyAndLimitedVocab():
    _order=-1
    def __init__(self, vocab2id, limited_vocab):
        self.vocab2id = vocab2id
        self.limited_vocab = limited_vocab
    def __call__(self, item):
        idlist = [self.vocab2id[w] for w in item if self.vocab2id[w] < limited_vocab]
        return np.array(idlist)

class Numpyify():
    _order=0
    def __call__(self, item):
        return np.array(item)

class Onehotify():
    _order=1
    def __init__(self, vocab_size):
        self.vocab_size = vocab_size
    def __call__(self, item):
        return np.array(np.bincount(item.astype('int'), minlength=self.vocab_size))
    
class YToOnehot():
    _order=1
    def __init__(self, num_classes):
        self.num_classes = num_classes
    def __call__(self, item):
        categorical = np.zeros((1, self.num_classes))
        categorical[0, item] = 1
        return categorical

class Tensorify():
    _order=2
    def __call__(self, item):
        return torch.from_numpy(item)

class Floatify():
    _order=3
    def __call__(self, item):
        return item.float()
    
class CheckAndCudify():
    _order=100
    def __init__(self):
        self.ic = torch.cuda.is_available()
    def __call__(self, item):
        return item.cuda() if self.ic else item
    
class AnimalFarmDataset(Dataset):
    def __init__(self, x, tfms): 
        self.x = x
        self.x_tfms = tfms
    def __len__(self): 
        return len(self.x)
    def __getitem__(self, i): 
        return compose(self.x[i], self.x_tfms)
    
class Sampler():
    def __init__(self, ds, bs, shuffle=False):
        self.n,self.bs,self.shuffle = len(ds),bs,shuffle
        
    def __iter__(self):
        self.idxs = torch.randperm(self.n) if self.shuffle else torch.arange(self.n)
        for i in range(0, self.n, self.bs): yield self.idxs[i:i+self.bs]

class DataLoader():
    def __init__(self, ds, sampler, collate_fn=collate):
        self.ds,self.sampler,self.collate_fn = ds,sampler,collate_fn
        
    def __iter__(self):
        for s in self.sampler: yield self.collate_fn([self.ds[i] for i in s])

## Load Data

In [26]:
tfms = [IdifyAndLimitedVocab(vocab2id, limited_vocab), Numpyify(), Onehotify(vocab_size=limited_vocab), Tensorify(), Floatify(), CheckAndCudify()]

In [27]:
train_ds = AnimalFarmDataset(train_x, tfms=tfms)
test_ds = AnimalFarmDataset(test_x, tfms=tfms)

In [28]:
train_samp = Sampler(train_ds, bs, shuffle=True)
test_samp = Sampler(test_ds, bs, shuffle=False)

In [29]:
train_dl = DataLoader(train_ds, sampler=train_samp, collate_fn=collate)
test_dl = DataLoader(test_ds, sampler=test_samp, collate_fn=collate)

## Define Model

In [30]:
def encoder(in_feature, hidden_feature1, hidden_feature2, drop_rate):
    return nn.Sequential(OrderedDict([
                ('linear1', nn.Linear(in_feature, hidden_feature1)),
                ('act1', nn.Softplus()),
                ('linear2', nn.Linear(hidden_feature1, hidden_feature2)),
                ('act2', nn.Softplus()),
                ('dropout', nn.Dropout(drop_rate))
            ]))

def decoder(in_feature, out_feature, drop_rate):
     return nn.Sequential(OrderedDict([
                ('act1', nn.Softmax(dim=-1)),
                ('dropout', nn.Dropout(drop_rate)),
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature)),
                ('act2', nn.Softmax(dim=-1))
            ]))

In [31]:
def hidden(in_feature, out_feature):
    return nn.Sequential(OrderedDict([
                ('linear', nn.Linear(in_feature, out_feature)),
                ('batchnorm', nn.BatchNorm1d(out_feature))
            ]))

In [32]:
class ProdLDA(nn.Module):
    def __init__(self, num_input, en1_units, en2_units, num_topic, drop_rate, init_mult):
        super(ProdLDA, self).__init__()
        self.num_input, self.en1_units, self.en2_units, \
        self.num_topic, self.drop_rate, self.init_mult = num_input, en1_units, en2_units, \
                                                            num_topic, drop_rate, init_mult
        # encoder
        self.en = encoder(num_input, en1_units, en2_units, drop_rate)
        self.mean = hidden(en2_units, num_topic)
        self.logvar = hidden(en2_units, num_topic)
        # decoder
        self.de = decoder(num_topic, num_input, drop_rate)
        # prior mean and variance as constant buffers
        self.prior_mean   = torch.Tensor(1, num_topic).fill_(0)
        self.prior_var    = torch.Tensor(1, num_topic).fill_(variance)
        self.prior_mean   = nn.Parameter(self.prior_mean, requires_grad=False)
        self.prior_var    = nn.Parameter(self.prior_var, requires_grad=False)
        self.prior_logvar = nn.Parameter(self.prior_var.log(), requires_grad=False)
        # initialize decoder weight
        if init_mult != 0:
            #std = 1. / math.sqrt( init_mult * (num_topic + num_input))
            self.de.linear.weight.data.uniform_(0, init_mult)
        # remove BN's scale parameters
        for component in [self.mean, self.logvar, self.de]:
            component.batchnorm.weight.requires_grad = False
            component.batchnorm.weight.fill_(1.0)

    def encode(self, input_):
        encoded = self.en(input_)
        posterior_mean = self.mean(encoded)
        posterior_logvar = self.logvar(encoded)
        return encoded, posterior_mean, posterior_logvar
    
    def decode(self, input_, posterior_mean, posterior_var):
        # take sample
        eps = input_.data.new().resize_as_(posterior_mean.data).normal_() # noise 
        z = posterior_mean + posterior_var.sqrt() * eps                   # reparameterization
        # do reconstruction
        recon = self.de(z)          # reconstructed distribution over vocabulary
        return recon
    
    def forward(self, input_, compute_loss=False, avg_loss=True):
        # compute posterior
        en2, posterior_mean, posterior_logvar = self.encode(input_) 
        posterior_var    = posterior_logvar.exp()
        
        recon = self.decode(input_, posterior_mean, posterior_var)
        if compute_loss:
            return recon, self.loss(input_, recon, posterior_mean, posterior_logvar, posterior_var, avg_loss)
        else:
            return recon

    def loss(self, input_, recon, posterior_mean, posterior_logvar, posterior_var, avg=True):
        # NL
        NL  = -(input_ * (recon + 1e-10).log()).sum(1)
        # KLD, see Section 3.3 of Akash Srivastava and Charles Sutton, 2017, 
        # https://arxiv.org/pdf/1703.01488.pdf
        prior_mean   = self.prior_mean.expand_as(posterior_mean)
        prior_var    = self.prior_var.expand_as(posterior_mean)
        prior_logvar = self.prior_logvar.expand_as(posterior_mean)
        var_division    = posterior_var  / prior_var
        diff            = posterior_mean - prior_mean
        diff_term       = diff * diff / prior_var
        logvar_division = prior_logvar - posterior_logvar
        # put KLD together
        KLD = 0.5 * ( (var_division + diff_term + logvar_division).sum(1) - self.num_topic)
        # loss
        loss = (NL + KLD)
        # in traiming mode, return averaged loss. In testing mode, return individual loss
        if avg:
            return loss.mean()
        else:
            return loss

## Train

In [33]:
model = ProdLDA(num_input, en1_units, en2_units, num_topic, drop_rate, init_mult)
optimizer = torch.optim.Adam(model.parameters(), learning_rate, betas=(momentum, 0.999))

In [34]:
if torch.cuda.is_available():
    model = model.cuda()

In [35]:
for epoch in range(num_epoch):
    loss_epoch = 0.0
    model.train()                    # switch to training mode
    for input_ in train_dl:
        recon, loss = model(input_, compute_loss=True)
        # optimize
        optimizer.zero_grad()        # clear previous gradients
        loss.backward()              # backprop
        optimizer.step()             # update parameters
        # report
        loss_epoch += loss.item()    # add loss to loss_epoch
    if epoch % 5 == 0:
        print('Epoch {}, loss={}'.format(epoch, loss_epoch / len(input_)))

Epoch 0, loss=5.977387237548828
Epoch 5, loss=5.605196838378906
Epoch 10, loss=5.378238143920899
Epoch 15, loss=5.16166389465332
Epoch 20, loss=5.001259155273438
Epoch 25, loss=4.9238559341430665
Epoch 30, loss=4.830425605773926
Epoch 35, loss=4.772599487304688
Epoch 40, loss=4.730070838928222
Epoch 45, loss=4.6701576614379885
Epoch 50, loss=4.655155296325684
Epoch 55, loss=4.625503578186035
Epoch 60, loss=4.586468086242676
Epoch 65, loss=4.58779613494873
Epoch 70, loss=4.568542823791504
Epoch 75, loss=4.559957237243652
Epoch 80, loss=4.533498916625977
Epoch 85, loss=4.555110321044922
Epoch 90, loss=4.535000991821289
Epoch 95, loss=4.52607479095459
Epoch 100, loss=4.532789268493652
Epoch 105, loss=4.512530174255371
Epoch 110, loss=4.537240142822266
Epoch 115, loss=4.524556655883789
Epoch 120, loss=4.503480834960937
Epoch 125, loss=4.5272731018066406
Epoch 130, loss=4.511065101623535
Epoch 135, loss=4.503416137695313
Epoch 140, loss=4.493374137878418
Epoch 145, loss=4.494168472290039
Ep

## Test

In [36]:
emb = model.de.linear.weight.data.cpu().numpy().T
print_top_words(emb, vocab, 30)
print_perp(model)

---------------Printing the Topics------------------
🐠 🐋 🐟 🐧 🐳 🐍 🐙 🐢 🦀 🐊 🐘 🐑 🦃 🐓 🐦 🐿 🐪 🐂
🐦 🐠 🐋 🐳 🐟 🐑 🐙 🦃 🐘 🐧 🐿 🐍 🐂 🐓 🐢 🦀 🐪 🐊
🐘 🐑 🦃 🐓 🐿 🐍 🐂 🐧 🐪 🐢 🐠 🦀 🐊 🐋 🐦 🐟 🐳 🐙
---------------End of Topics------------------
The approximated perplexity is:  8.26282334414865


## Latent Dirichlet Allocation 

In [37]:
import lda

In [38]:
import lda.datasets

In [39]:
tfms_lda = [IdifyAndLimitedVocab(vocab2id, limited_vocab), Numpyify(), Onehotify(vocab_size=limited_vocab)]

In [40]:
X_train = np.array([compose(doc, tfms_lda) for doc in train_x])

In [41]:
X_train.shape

(1600, 18)

In [42]:
model = lda.LDA(n_topics=num_topic, n_iter=5000, random_state=1)

In [43]:
model.fit(X_train)

INFO:lda:n_documents: 1600
INFO:lda:vocab_size: 18
INFO:lda:n_words: 80000
INFO:lda:n_topics: 3
INFO:lda:n_iter: 5000
INFO:lda:<0> log likelihood: -295069
INFO:lda:<10> log likelihood: -204755
INFO:lda:<20> log likelihood: -190185
INFO:lda:<30> log likelihood: -180952
INFO:lda:<40> log likelihood: -179621
INFO:lda:<50> log likelihood: -179130
INFO:lda:<60> log likelihood: -178899
INFO:lda:<70> log likelihood: -178718
INFO:lda:<80> log likelihood: -178507
INFO:lda:<90> log likelihood: -178700
INFO:lda:<100> log likelihood: -178272
INFO:lda:<110> log likelihood: -178402
INFO:lda:<120> log likelihood: -178371
INFO:lda:<130> log likelihood: -178540
INFO:lda:<140> log likelihood: -178637
INFO:lda:<150> log likelihood: -178207
INFO:lda:<160> log likelihood: -178208
INFO:lda:<170> log likelihood: -178271
INFO:lda:<180> log likelihood: -178115
INFO:lda:<190> log likelihood: -178435
INFO:lda:<200> log likelihood: -178246
INFO:lda:<210> log likelihood: -178162
INFO:lda:<220> log likelihood: -178

INFO:lda:<2050> log likelihood: -178362
INFO:lda:<2060> log likelihood: -178204
INFO:lda:<2070> log likelihood: -178685
INFO:lda:<2080> log likelihood: -178250
INFO:lda:<2090> log likelihood: -178457
INFO:lda:<2100> log likelihood: -178507
INFO:lda:<2110> log likelihood: -178716
INFO:lda:<2120> log likelihood: -178334
INFO:lda:<2130> log likelihood: -178533
INFO:lda:<2140> log likelihood: -178458
INFO:lda:<2150> log likelihood: -178322
INFO:lda:<2160> log likelihood: -178466
INFO:lda:<2170> log likelihood: -178161
INFO:lda:<2180> log likelihood: -178049
INFO:lda:<2190> log likelihood: -178396
INFO:lda:<2200> log likelihood: -178307
INFO:lda:<2210> log likelihood: -178211
INFO:lda:<2220> log likelihood: -178321
INFO:lda:<2230> log likelihood: -178612
INFO:lda:<2240> log likelihood: -178104
INFO:lda:<2250> log likelihood: -178266
INFO:lda:<2260> log likelihood: -178294
INFO:lda:<2270> log likelihood: -178260
INFO:lda:<2280> log likelihood: -177837
INFO:lda:<2290> log likelihood: -178217


INFO:lda:<4100> log likelihood: -178181
INFO:lda:<4110> log likelihood: -177675
INFO:lda:<4120> log likelihood: -178130
INFO:lda:<4130> log likelihood: -177746
INFO:lda:<4140> log likelihood: -178127
INFO:lda:<4150> log likelihood: -178469
INFO:lda:<4160> log likelihood: -178125
INFO:lda:<4170> log likelihood: -178017
INFO:lda:<4180> log likelihood: -178020
INFO:lda:<4190> log likelihood: -178335
INFO:lda:<4200> log likelihood: -178015
INFO:lda:<4210> log likelihood: -178027
INFO:lda:<4220> log likelihood: -177929
INFO:lda:<4230> log likelihood: -178118
INFO:lda:<4240> log likelihood: -177866
INFO:lda:<4250> log likelihood: -177404
INFO:lda:<4260> log likelihood: -177878
INFO:lda:<4270> log likelihood: -177877
INFO:lda:<4280> log likelihood: -178051
INFO:lda:<4290> log likelihood: -178094
INFO:lda:<4300> log likelihood: -178378
INFO:lda:<4310> log likelihood: -177761
INFO:lda:<4320> log likelihood: -177577
INFO:lda:<4330> log likelihood: -178121
INFO:lda:<4340> log likelihood: -178226


<lda.lda.LDA at 0x7ff20d9fc940>

In [44]:
topic_word = model.topic_word_

In [45]:
n_top_words = 30

In [46]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: 🐘 🐿 🐓 🐪 🐂 🦃 🐑 🐦 🐧 🐢 🐍 🐊 🦀 🐠 🐳 🐋 🐟 🐙
Topic 1: 🐦 🐊 🦀 🦃 🐳 🐙 🐑 🐿 🐂 🐢 🐍 🐠 🐘 🐓 🐧 🐋 🐪 🐟
Topic 2: 🐙 🐋 🦀 🐟 🐠 🐳 🐧 🐢 🐊 🐍 🐦 🐘 🐪 🦃 🐓 🐂 🐑 🐿
