In [1]:
import lm
from data import *

In [2]:
def learn_trigram(data, gamma=0, smooth=1, num_samples=3):
    """Fit a trigram model on ``data.train`` and print a short report.

    The report consists of the vocabulary size, the perplexity of the fitted
    model on the train, dev and test splits of ``data``, and ``num_samples``
    sentences sampled from the model.

    Args:
        data: corpus object exposing ``train``, ``dev`` and ``test`` attributes.
        gamma: forwarded unchanged to ``lm.Trigram`` (semantics defined there).
        smooth: forwarded unchanged to ``lm.Trigram`` (semantics defined there).
        num_samples: how many sampled sentences to print. Defaults to 3, which
            reproduces the previous hard-coded behaviour.

    Returns:
        The fitted ``lm.Trigram`` instance.
    """
    trigram = lm.Trigram(gamma=gamma, smooth=smooth)
    trigram.fit_corpus(data.train)
    print("vocab:", len(trigram.vocab()))
    # Evaluate on train, dev and test (labels are padded so the values align).
    print("train:", trigram.perplexity(data.train))
    print("dev  :", trigram.perplexity(data.dev))
    print("test :", trigram.perplexity(data.test))
    # Imported here, after the evaluation, exactly as before: `generator` is
    # only needed for sampling and a missing module must not hide the report.
    from generator import Sampler
    sampler = Sampler(trigram)
    # Each sentence is sampled from an empty prefix.
    for _ in range(num_samples):
        print("sample: ", " ".join(str(x) for x in sampler.sample_sentence([])))
    return trigram

In [6]:
# Unigram experiment: fit one model per corpus, then score every model on
# every corpus (rows = model's training domain, columns = evaluated domain).
dnames = ["brown", "reuters", "gutenberg"]
datas, models = [], []
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("data/corpora.tar.gz", dname)
    datas.append(data)
    # NOTE(review): learn_unigram is not defined in the visible cells;
    # presumably it comes from the `data` star-import or a removed cell — confirm.
    model = learn_unigram(data)
    models.append(model)

# One n x n perplexity matrix per split.
n = len(dnames)
perp_dev, perp_test, perp_train = (np.zeros((n, n)) for _ in range(3))
for i, fitted in enumerate(models):
    for j, corpus in enumerate(datas):
        perp_dev[i, j] = fitted.perplexity(corpus.dev)
        perp_test[i, j] = fitted.perplexity(corpus.test)
        perp_train[i, j] = fitted.perplexity(corpus.train)

# Print each matrix and hand it to print_table together with a .tex file name.
for heading, table, tex_name in (
    ("x train", perp_train, "table-train.tex"),
    ("x dev", perp_dev, "table-dev.tex"),
    ("x test", perp_test, "table-test.tex"),
):
    print("-------------------------------")
    print(heading)
    print_table(table, dnames, dnames, tex_name)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
vocab: 41746
train: 1513.8018008490042
dev  : 1737.5445705338257
test : 1758.248804766443
sample:  Baker
sample:  have room They there event thanks bad his lobes making Treasury
sample:  pirates South
-----------------------
reuters
reuters  read. train: 38183 dev: 8083 test: 8199
vocab: 35989
train: 1466.8721485743788
dev  : 1580.9102794282078
test : 1576.8543845321956
sample:  753 cts Saunders from February on Lanka week said 000 dlrs to 29 the MINING of say
sample:  mln now Japan Canada pct 100 resolved 70 fourth which Change heavy UNIT has but barrels planned premium TO will approved that the CUTS Iran the same ministry of Coffee STOCKS is from
sample:  government Prudential
-----------------------
gutenberg
gutenberg  read. train: 68767 dev: 14667 test: 14861
vocab: 43736
train: 981.368830109398
dev  : 1060.5363793834274
test : 1035.7794090182354
sample:  not little who laughed part to sneeringly yet Ther

In [8]:
# Trigram experiment with learn_trigram's default settings: fit one model per
# corpus, then score every model on every corpus
# (rows = model's training domain, columns = evaluated domain).
dnames = ["brown", "reuters", "gutenberg"]
datas, models = [], []
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("data/corpora.tar.gz", dname)
    datas.append(data)
    model = learn_trigram(data)
    models.append(model)

# One n x n perplexity matrix per split.
n = len(dnames)
perp_dev, perp_test, perp_train = (np.zeros((n, n)) for _ in range(3))
for i, fitted in enumerate(models):
    for j, corpus in enumerate(datas):
        perp_dev[i, j] = fitted.perplexity(corpus.dev)
        perp_test[i, j] = fitted.perplexity(corpus.test)
        perp_train[i, j] = fitted.perplexity(corpus.train)

# Print each matrix and hand it to print_table together with a .tex file name.
for heading, table, tex_name in (
    ("x train", perp_train, "table-train.tex"),
    ("x dev", perp_dev, "table-dev.tex"),
    ("x test", perp_test, "table-test.tex"),
):
    print("-------------------------------")
    print(heading)
    print_table(table, dnames, dnames, tex_name)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
vocab: 41747
train: 5914.314310615035
dev  : 8.279309294048135
test : 8.166442426583654
sample:  ICC frosting archdiocese Republicans fives fight Dakota gazelle Twelve misdeeds topple presumably Schapiro stylized tasks dismay pecs Kupcinet Hrothgar Those stoutly demand Archbishop Bornholm coerce miliaris Sparrow psychotherapeutic zeroed lurching homeland Six Marv dreary Arianist levies flour stepmother Taussig Flint misconstruction spice Mubarak energy abscesses Bury DRDW republics senatorial Crowder how outsider Faithful modes Venturi compelled Va everyday comedy enfant genders Kochanek Vilas broad bookish pant daily Stein designates reproach Garments noncommissioned Perasso nighted Thorp daily Mamma Arte rounding Strange interviewer Gershwin alternated recess Georgia evolves navigating average enduring soubriquet weaning springtime stud Olaf draughts majored ell louvers orthodox excessively Supplemental
samp

In [4]:
import pickle

In [9]:
# Persist the list of fitted models from the preceding experiment cell.
with open('trigram_base', mode='wb') as model_file:
    pickle.dump(models, model_file)

In [3]:
# Trigram experiment with gamma=5 and smooth=0.5: fit one model per corpus,
# then score every model on every corpus
# (rows = model's training domain, columns = evaluated domain).
dnames = ["brown", "reuters", "gutenberg"]
datas, models = [], []
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("data/corpora.tar.gz", dname)
    datas.append(data)
    model = learn_trigram(data, gamma=5, smooth=0.5)
    models.append(model)

# One n x n perplexity matrix per split.
n = len(dnames)
perp_dev, perp_test, perp_train = (np.zeros((n, n)) for _ in range(3))
for i, fitted in enumerate(models):
    for j, corpus in enumerate(datas):
        perp_dev[i, j] = fitted.perplexity(corpus.dev)
        perp_test[i, j] = fitted.perplexity(corpus.test)
        perp_train[i, j] = fitted.perplexity(corpus.train)

# Print each matrix and hand it to print_table together with a .tex file name.
for heading, table, tex_name in (
    ("x train", perp_train, "table-train.tex"),
    ("x dev", perp_dev, "table-dev.tex"),
    ("x test", perp_test, "table-test.tex"),
):
    print("-------------------------------")
    print(heading)
    print_table(table, dnames, dnames, tex_name)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
vocab: 11881
train: 3314.9926958680417
dev  : 68.4535433335179
test : 66.92908705149235
sample:  Queen solid permit statute murders reacted youths companions sung label correspondence Dandy follow youngster teach predecessors crop block Rhodes Poet depth architectural Individual daytime brave nothin Revenue Herbert Powers selection rode confinement witch stuff shattered ages Charter wounds Nassau Granny acquired crisp cannery separately lane strike pleased Governor fail expended recipe municipalities damned substrate passages strongest occupied blunt emission streetcar declaration Packard protested absolute poems billion Docherty historian novel consulted impatience proclamation rendered win Northeast Citizens clarity sweet shipments regulation rare amplifier Benson heart receive got eighty Thus Contrary planter swung simulated dresses lasted ballot stupid crisp varied log kills thieves
sample:  pays whisperin

In [7]:
# Persist the list of fitted models from the preceding experiment cell.
with open('trigram_5gamma_05smooth', mode='wb') as model_file:
    pickle.dump(models, model_file)

In [3]:
# Trigram experiment with smooth=0: fit one model per corpus, then score every
# model on every corpus (rows = model's training domain, columns = evaluated
# domain).
# NOTE(review): the recorded run of this cell stopped with
# "ValueError: math domain error" right after printing the brown vocab size;
# presumably smooth=0 leads to a log of a zero probability inside `lm` — confirm.
dnames = ["brown", "reuters", "gutenberg"]
datas, models = [], []
for dname in dnames:
    print("-----------------------")
    print(dname)
    data = read_texts("data/corpora.tar.gz", dname)
    datas.append(data)
    model = learn_trigram(data, smooth=0)
    models.append(model)

# One n x n perplexity matrix per split.
n = len(dnames)
perp_dev, perp_test, perp_train = (np.zeros((n, n)) for _ in range(3))
for i, fitted in enumerate(models):
    for j, corpus in enumerate(datas):
        perp_dev[i, j] = fitted.perplexity(corpus.dev)
        perp_test[i, j] = fitted.perplexity(corpus.test)
        perp_train[i, j] = fitted.perplexity(corpus.train)

# Print each matrix and hand it to print_table together with a .tex file name.
for heading, table, tex_name in (
    ("x train", perp_train, "table-train.tex"),
    ("x dev", perp_dev, "table-dev.tex"),
    ("x test", perp_test, "table-test.tex"),
):
    print("-------------------------------")
    print(heading)
    print_table(table, dnames, dnames, tex_name)

-----------------------
brown
brown  read. train: 39802 dev: 8437 test: 8533
vocab: 41747


ValueError: math domain error