In [135]:
import numpy as np
from nltk.lm.preprocessing import pad_both_ends, flatten, padded_everygram_pipeline
from nltk.lm import MLE
from nltk.tokenize import RegexpTokenizer
from nltk.lm import KneserNeyInterpolated, Laplace, StupidBackoff
from sklearn.model_selection import train_test_split
import sys

# Chat GPT
# if len(sys.argv) > 1:
#     file_name = sys.argv[1]
#     print("File name:", file_name)
# else:
#     print("Please provide the file name as a parameter.")

# Bonus Point
def ngrams(sentence, n):
    ngrams = []
    for i in range(len(sentence) - n + 1):
        ngrams.append(np.array([sentence[i+j] for j in range(n)]))
    return ngrams

### Preprocessing data using RegexpTokenizer: picks out sequences of alphanumeric characters as tokens and drops everything else
def preprocess_text(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = list(map(lambda s: tokenizer.tokenize(s), text))
    text = [ele for ele in text if ele != []]   # remove empty list

    return text

def test_mapping(element):
#     test = list(pad_both_ends(element, n=2))
    test = ngrams(test, n=2)
    return test

In [136]:
# Open files    
with open('./ngram_authorship_train/austen.txt') as f:
    austen_txt = f.readlines()
with open('./ngram_authorship_train/dickens.txt') as f:
    dickens_txt = f.readlines()
with open('./ngram_authorship_train/tolstoy.txt') as f:
    tolstoy_txt = f.readlines()
with open('./ngram_authorship_train/wilde.txt') as f:
    wilde_txt = f.readlines()
    
# preprocess
austen_txt = preprocess_text(austen_txt)
dickens_txt = preprocess_text(dickens_txt)
tolstoy_txt = preprocess_text(tolstoy_txt)
wilde_txt = preprocess_text(wilde_txt)

random_state = 50
# train test split
austen_train, austen_test = train_test_split(austen_txt, test_size=0.1, random_state=random_state)
dickens_train, dickens_test = train_test_split(dickens_txt, test_size=0.1, random_state=random_state)
tolstoy_train, tolstoy_test = train_test_split(tolstoy_txt, test_size=0.1, random_state=random_state)
wilde_train, wilde_test = train_test_split(wilde_txt, test_size=0.1, random_state=random_state)

print(austen_test[0])
print(dickens_test[0])
print(tolstoy_test[0])
print(wilde_test[0])

['Mr', 'Suckling', 's', 'seat', 'a', 'comparison', 'of', 'Hartfield', 'to', 'Maple', 'Grove', 'The']
['like', 'this', 'It', 'made', 'me', 'so', 'sorry', 'that', 'I', 'could', 'only', 'say', 'in', 'a', 'foolish']
['herself', 'in', 'supposing', 'she', 'could', 'be', 'what', 'she', 'wanted', 'to', 'be', 'Her', 'eyes', 'were']
['realising', 'it']


In [137]:
# Training
## AUSTEN
train, vocab = padded_everygram_pipeline(2, austen_train)
# austen_lm = MLE(2)
# austen_lm = KneserNeyInterpolated(order=2)
austen_lm = StupidBackoff(order=2)
# austen_lm = Laplace(order=2)
austen_lm.fit(train, vocab)
print('vocabs:',len(austen_lm.vocab))

## DICKENS
train, vocab = padded_everygram_pipeline(2, dickens_train)
# dickens_lm = MLE(2)
# dickens_lm = KneserNeyInterpolated(order=2)
dickens_lm = StupidBackoff(order=2)
# dickens_lm = Laplace(order=2)
dickens_lm.fit(train, vocab)
print('vocabs:',len(dickens_lm.vocab))

## TOLSTOY
train, vocab = padded_everygram_pipeline(2, tolstoy_train)
# tolstoy_lm = MLE(2)
# tolstoy_lm = KneserNeyInterpolated(order=2)
tolstoy_lm = StupidBackoff(order=2)
# tolstoy_lm = Laplace(order=2)
tolstoy_lm.fit(train, vocab)
print('vocabs:',len(tolstoy_lm.vocab))

## WILDE
train, vocab = padded_everygram_pipeline(2, wilde_train)
# wilde_lm = MLE(2)
# wilde_lm = KneserNeyInterpolated(order=2)
wilde_lm = StupidBackoff(order=2)
# wilde_lm = Laplace(order=2)
wilde_lm.fit(train, vocab)
print('vocabs:',len(wilde_lm.vocab))

vocabs: 6854
vocabs: 9094
vocabs: 10573
vocabs: 8436


In [138]:
print(austen_lm.counts)
print(dickens_lm.counts)
print(tolstoy_lm.counts)
print(wilde_lm.counts)

<NgramCounter with 2 ngram orders and 257156 ngrams>
<NgramCounter with 2 ngram orders and 244806 ngrams>
<NgramCounter with 2 ngram orders and 371699 ngrams>
<NgramCounter with 2 ngram orders and 194171 ngrams>


In [139]:
print('no. family:', austen_lm.counts['family'])
print('no. family:', dickens_lm.counts['family'])
print('no. family:', tolstoy_lm.counts['family'])
print('no. family:', wilde_lm.counts['family'])

no. family: 70
no. family: 34
no. family: 45
no. family: 31


In [140]:
print('no. family may:', austen_lm.counts[['family']]['may'])
print('no. family may:', dickens_lm.counts[['family']]['may'])
print('no. family may:', tolstoy_lm.counts[['family']]['may'])
print('no. family may:', wilde_lm.counts[['family']]['may'])

no. family may: 0
no. family may: 0
no. family may: 0
no. family may: 0


In [141]:
print('family score:',austen_lm.score('family'))
print('family score:',dickens_lm.score('family'))
print('family score:',tolstoy_lm.score('family'))
print('family score:',wilde_lm.score('family'))

family score: 0.0005244074196158341
family score: 0.0002675964330969565
family score: 0.0002330410825535088
family score: 0.0003076220813114624


In [142]:
# Test setting
print(len(austen_test))
print(len(dickens_test))
print(len(tolstoy_test))
print(len(wilde_test))

1091
1035
1611
820


In [143]:
lst = [austen_test, dickens_test, tolstoy_test, wilde_test]
results = {}
INF = float('inf')
infCount = 0

for cur in range(len(lst)):
#     infCount = 0
    correct = 0
    dev_set = lst[cur]
    dev_set = list(map(test_mapping, dev_set))
    print(dev_set[0], len(dev_set))
    
    for i in range(len(dev_set)):
    #   Calculate perplexity
        au = austen_lm.perplexity(dev_set[i])
        di = dickens_lm.perplexity(dev_set[i])
        to = tolstoy_lm.perplexity(dev_set[i])
        wi = wilde_lm.perplexity(dev_set[i])
        lowest = [au, di, to, wi]

    #   get lowest idx
        if (min(lowest) == INF):
            infCount += 1
            pass
        else:
            if (lowest.index(min(lowest)) == cur):
                correct += 1
     
    if (cur == 0):
        results['austen'] = correct/len(dev_set)
        print(results)
        
    elif (cur == 1):
        results['dickens'] = correct/len(dev_set)
        print(results)

    elif (cur == 2):
        results['tolstoy'] = correct/len(dev_set)
        print(results)

    elif (cur == 3):
        results['wilde'] = correct/len(dev_set)
        print(results)

print(results)
print(infCount)

[array(['<s>', 'Mr'], dtype='<U3'), array(['Mr', 'Suckling'], dtype='<U8'), array(['Suckling', 's'], dtype='<U8'), array(['s', 'seat'], dtype='<U4'), array(['seat', 'a'], dtype='<U4'), array(['a', 'comparison'], dtype='<U10'), array(['comparison', 'of'], dtype='<U10'), array(['of', 'Hartfield'], dtype='<U9'), array(['Hartfield', 'to'], dtype='<U9'), array(['to', 'Maple'], dtype='<U5'), array(['Maple', 'Grove'], dtype='<U5'), array(['Grove', 'The'], dtype='<U5'), array(['The', '</s>'], dtype='<U4')] 1091
{'austen': 0.6342804766269478}
[array(['<s>', 'like'], dtype='<U4'), array(['like', 'this'], dtype='<U4'), array(['this', 'It'], dtype='<U4'), array(['It', 'made'], dtype='<U4'), array(['made', 'me'], dtype='<U4'), array(['me', 'so'], dtype='<U2'), array(['so', 'sorry'], dtype='<U5'), array(['sorry', 'that'], dtype='<U5'), array(['that', 'I'], dtype='<U4'), array(['I', 'could'], dtype='<U5'), array(['could', 'only'], dtype='<U5'), array(['only', 'say'], dtype='<U4'), array(['say', 'in']

In [144]:
# print('austen , perplexity:',round(austen_lm.perplexity(test), 1),', entropy:', round(austen_lm.entropy(test), 1))
# print('dickens, perplexity:',round(dickens_lm.perplexity(test), 1),', entropy:', round(dickens_lm.entropy(test), 1))
# print('tolstoy, perplexity:',round(tolstoy_lm.perplexity(test), 1),', entropy:', round(tolstoy_lm.entropy(test), 1))
# print('wilde  , perplexity:',round(wilde_lm.perplexity(test), 1),', entropy:', round(wilde_lm.entropy(test), 1))

In [148]:
random_seed = 10
print('Austen  :', austen_lm.generate(10))
print('Dickens :', dickens_lm.generate(10))
print('Tolstoy :', tolstoy_lm.generate(10))
print('Wilde   :', wilde_lm.generate(10))

Austen  : ['will', 'follow', 'it', 'had', 'somewhat', 'spent', 'ours', 'and', 'ate', 'the']
Dickens : ['when', 'I', 'had', 'a', 'wall', '</s>', 'very', 'intimate', '</s>', 'a']
Tolstoy : ['to', '</s>', 'and', 'in', 'law', 'was', '</s>', 'to', 'him', 'he']
Wilde   : ['library', 'window', 'What', 'was', '</s>', '</s>', 'out', 'of', 'cruelty', 'downright']


In [149]:
test_au = austen_lm.generate(5, text_seed=['good'], random_seed=random_seed)
test_di = dickens_lm.generate(5, text_seed=['good'], random_seed=random_seed)
test_to = tolstoy_lm.generate(5, text_seed=['good'], random_seed=random_seed)
test_wi = wilde_lm.generate(5, text_seed=['good'], random_seed=random_seed)



# test_au = list(pad_both_ends(test_au, n=2))
test_au = ngrams(test_au, n=2)
# test_di = list(pad_both_ends(test_di, n=2))
test_di = ngrams(test_di, n=2)
test_to = list(pad_both_ends(test_to, n=2))
test_to = ngrams(test_to, n=2)
test_wi = list(pad_both_ends(test_wi, n=2))
test_wi = ngrams(test_wi, n=2)

print(round(austen_lm.perplexity(test_au), 1))
print(round(dickens_lm.perplexity(test_di), 1))
print(round(tolstoy_lm.perplexity(test_to), 1))
print(round(wilde_lm.perplexity(test_wi), 1))

28.2
40.4
26.6
47.4
