In [1]:
import lm
import pickle
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
from test_trigram import learn_trigram
from test_bigram import learn_bigram
from data import *

In [6]:
dnames = ["brown", "reuters", "gutenberg"]
datas, models = [], []
# Fit one unigram model per corpus; datas[k] and models[k] both belong to dnames[k].
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("data/corpora.tar.gz", dname)
    datas.append(data)
    model = learn_unigram(data)
    models.append(model)

# Cross-corpus perplexity tables: entry [i, j] scores the model trained on
# dnames[i] against one split (dev / test / train) of corpus dnames[j].
n = len(dnames)
perp_dev = np.zeros((n, n))
perp_test = np.zeros_like(perp_dev)
perp_train = np.zeros_like(perp_dev)
for i, j in np.ndindex(n, n):  # row-major order, same as nested range(n) loops
    perp_dev[i, j] = models[i].perplexity(datas[j].dev)
    perp_test[i, j] = models[i].perplexity(datas[j].test)
    perp_train[i, j] = models[i].perplexity(datas[j].train)

# Report each table under its heading, passing the matching .tex file name.
for heading, table, tex_name in (
    ("x train", perp_train, "table-train.tex"),
    ("x dev", perp_dev, "table-dev.tex"),
    ("x test", perp_test, "table-test.tex"),
):
    print("-------------------------------")
    print(heading)
    print_table(table, dnames, dnames, tex_name)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
vocab: 41746
train: 1513.8018008490042
dev  : 1737.5445705338257
test : 1758.248804766443
sample:  Baker
sample:  have room They there event thanks bad his lobes making Treasury
sample:  pirates South
-----------------------
reuters
reuters  read. train: 38183 dev: 8083 test: 8199
vocab: 35989
train: 1466.8721485743788
dev  : 1580.9102794282078
test : 1576.8543845321956
sample:  753 cts Saunders from February on Lanka week said 000 dlrs to 29 the MINING of say
sample:  mln now Japan Canada pct 100 resolved 70 fourth which Change heavy UNIT has but barrels planned premium TO will approved that the CUTS Iran the same ministry of Coffee STOCKS is from
sample:  government Prudential
-----------------------
gutenberg
gutenberg  read. train: 68767 dev: 14667 test: 14861
vocab: 43736
train: 981.368830109398
dev  : 1060.5363793834274
test : 1035.7794090182354
sample:  not little who laughed part to sneeringly yet Ther

In [2]:
def train_lm(base, smooth, mode='bi', comp_other=False):
    """Train one n-gram model per corpus for every (gamma, smooth) setting.

    Args:
        base: iterable of values forwarded to the learner as ``gamma``.
        smooth: iterable of values forwarded to the learner as ``smooth``.
        mode: ``'bi'`` selects ``learn_bigram``, ``'tri'`` selects
            ``learn_trigram``.
        comp_other: when True, additionally compute the cross-corpus
            perplexity tables (train / dev / test) for every
            (gamma, smooth) setting and hand each one to ``print_table``.

    Returns:
        ``(models, datas)`` where ``models[b][s]`` is the list of models
        trained with ``gamma=b, smooth=s`` (one per corpus, in the order
        brown, reuters, gutenberg) and ``datas`` is the list of loaded
        corpora in the same order.

    Raises:
        ValueError: if ``mode`` is neither ``'bi'`` nor ``'tri'``.
    """
    # Fail fast on a bad mode; previously learn_func stayed unbound and the
    # error only surfaced after the first corpus had been read.
    if mode == 'bi':
        learn_func = learn_bigram
    elif mode == 'tri':
        learn_func = learn_trigram
    else:
        raise ValueError(
            "mode must be 'bi' or 'tri', got {!r}".format(mode))

    # Both grids are walked once per corpus, so materialize them in case the
    # caller passed a one-shot iterable such as a generator.
    base = list(base)
    smooth = list(smooth)

    dnames = ["brown", "reuters", "gutenberg"]
    datas = []
    models = {b: defaultdict(list) for b in base}
    # Learn the models for each of the domains
    for dname in dnames:
        print("-----------------------")
        print(dname)
        data = read_texts("data/corpora.tar.gz", dname)
        datas.append(data)

        for b in base:
            for s in smooth:
                model = learn_func(data, gamma=b, smooth=s)
                models[b][s].append(model)

    if comp_other:
        # models is keyed by gamma, then smooth; the per-corpus models live
        # in the innermost list, so the pairwise tables are built once per
        # (gamma, smooth) setting.
        n = len(dnames)
        for bi, b in enumerate(base):
            for si, s in enumerate(smooth):
                trained = models[b][s]
                perp_dev = np.zeros((n, n))
                perp_test = np.zeros((n, n))
                perp_train = np.zeros((n, n))
                for i in range(n):
                    for j in range(n):
                        perp_dev[i, j] = trained[i].perplexity(datas[j].dev)
                        perp_test[i, j] = trained[i].perplexity(datas[j].test)
                        perp_train[i, j] = trained[i].perplexity(datas[j].train)

                # Grid indices (not raw floats) keep the file names short and
                # unique per setting, so tables do not overwrite each other.
                tag = "b{}-s{}".format(bi, si)
                for split, table in (("train", perp_train),
                                     ("dev", perp_dev),
                                     ("test", perp_test)):
                    print("-------------------------------")
                    print("x {} (gamma={}, smooth={})".format(split, b, s))
                    print_table(table, dnames, dnames,
                                "table-{}-{}.tex".format(split, tag))

    return models, datas

In [3]:
# Smoothing grid: ten evenly spaced values from 0.1 to 1 (endpoint included).
smooth = np.linspace(start=0.1, stop=1, num=10)

In [4]:
# Bigram models (default mode) with gamma fixed at 0, one per smoothing value
# and corpus; also keeps the loaded corpora for later evaluation.
bimodels_0, datas = train_lm(base=[0], smooth=smooth)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
-----------------------
reuters
reuters  read. train: 38183 dev: 8083 test: 8199
-----------------------
gutenberg
gutenberg  read. train: 68767 dev: 14667 test: 14861


In [6]:
# Inspect the nested result: {gamma: {smooth: [one model per corpus]}}.
print(bimodels_0)

{0: defaultdict(<class 'list'>, {0.1: [<lm.Bigram object at 0x0000025AFF482288>, <lm.Bigram object at 0x0000025AB0BB2548>, <lm.Bigram object at 0x0000025AFFCBCAC8>], 0.2: [<lm.Bigram object at 0x0000025AFF482D08>, <lm.Bigram object at 0x0000025AB0BB2448>, <lm.Bigram object at 0x0000025AFFC4FF48>], 0.30000000000000004: [<lm.Bigram object at 0x0000025AFF482308>, <lm.Bigram object at 0x0000025AB0BB2908>, <lm.Bigram object at 0x0000025AFFCBC948>], 0.4: [<lm.Bigram object at 0x0000025AFF482FC8>, <lm.Bigram object at 0x0000025AB0BB2C88>, <lm.Bigram object at 0x0000025AFFCBCE08>], 0.5: [<lm.Bigram object at 0x0000025AFF482B08>, <lm.Bigram object at 0x0000025AB0BB2048>, <lm.Bigram object at 0x0000025AFFCBC108>], 0.6: [<lm.Bigram object at 0x0000025AFF482DC8>, <lm.Bigram object at 0x0000025AB0BB2BC8>, <lm.Bigram object at 0x0000025AFFCBC608>], 0.7000000000000001: [<lm.Bigram object at 0x0000025AFF482808>, <lm.Bigram object at 0x0000025AB0BB2EC8>, <lm.Bigram object at 0x0000025AFFCBC648>], 0.8: 

In [5]:
def comp(base, smooth, datas, models):
    """Compute cross-corpus perplexities for every (base, smooth) setting.

    Args:
        base: sequence of keys into the first level of ``models``.
        smooth: sequence of keys into the second level of ``models``.
        datas: sequence of corpora, each exposing ``dev``, ``test`` and
            ``train`` attributes.
        models: nested mapping where ``models[b][s]`` is a sequence holding
            one model per corpus (same order as ``datas``); each model must
            provide ``perplexity(corpus_split)``.

    Returns:
        ``(perp_dev, perp_test, perp_train)``: three arrays of shape
        ``(len(base), len(smooth), len(datas), len(datas))``. Entry
        ``[bi, si, i, j]`` is the perplexity of model ``i`` trained with
        ``base[bi]`` / ``smooth[si]`` on the matching split of corpus ``j``.
    """
    # Size the tables from the data instead of assuming exactly three corpora.
    n_domains = len(datas)
    shape = (len(base), len(smooth), n_domains, n_domains)
    perp_dev = np.zeros(shape)
    perp_test = np.zeros(shape)
    perp_train = np.zeros(shape)

    for bi, b in enumerate(base):
        for si, s in enumerate(smooth):
            trained = models[b][s]
            for i in range(n_domains):
                model = trained[i]
                for j, data in enumerate(datas):
                    perp_dev[bi, si, i, j] = model.perplexity(data.dev)
                    perp_test[bi, si, i, j] = model.perplexity(data.test)
                    perp_train[bi, si, i, j] = model.perplexity(data.train)

    return perp_dev, perp_test, perp_train

In [6]:
# perp is the tuple (perp_dev, perp_test, perp_train) for the gamma=0 bigram models.
perp = comp(base=[0], smooth=smooth, datas=datas, models=bimodels_0)

In [9]:
# perp[0] is the dev-perplexity array returned first by comp; the slice picks
# the first base value, every smoothing value, and model 0 evaluated on
# corpus 0 (brown on brown, given the corpus order used by train_lm).
print(perp[0][0,:,0,0])

[3794.83356135 4795.40425634 5577.73281865 6241.02797914 6825.44963312
 7352.22512987 7834.28373878 8280.23747099 8696.18873248 9086.6623328 ]


In [10]:
# Persist the trained gamma=0 bigram models so they can be reloaded later
# without retraining.
with open('bi_smooth01_0.pickle', 'wb') as f:
    pickle.Pickler(f).dump(bimodels_0)