In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.tokenize import word_tokenize


uniform_excerpts = pd.read_csv('./data/gutenberg/uniform_excerpts.csv')

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=200)
train, test = train_test_split(uniform_excerpts, test_size=0.2, random_state=200)

In [2]:
X_train = train.drop("label", axis = 1)
X_test = test.drop("label", axis = 1)

y_train = train["label"]
y_test = test["label"]

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("train eras", y_train.value_counts())
print("test eras", y_test.value_counts())

(4800, 2) (1200, 2) (4800,) (1200,)
train eras label
1500s    818
1600s    806
1800s    799
1700s    797
1400s    792
1900s    788
Name: count, dtype: int64
test eras label
1900s    212
1400s    208
1700s    203
1800s    201
1600s    194
1500s    182
Name: count, dtype: int64


In [3]:
"""
    features to look at first:
    average word length
    average sentance length
    parts of speech frequencies
    frequencies of certain words/spellings
"""

'\n    features to look at first:\n    average word length\n    average sentance length\n    parts of speech frequencies\n    frequencies of certain words/spellings\n'

In [4]:
"""
    get list of words from 1400s samples
    get vocab as well
"""
# list of strings, each denoting a text excerpt from the 1400s
samples_1400s = train[train["label"] == "1400s"]["text"].tolist() 
vocab_1400s = set()
words_list_1400s = []
for sample in samples_1400s:
    for word in word_tokenize(sample):
        vocab_1400s.add(word.lower())
        words_list_1400s.append(word.lower())


In [5]:
"""
    get list of words from 1500s samples
    get vocab as well
"""
samples_1500s =  train[train["label"] == "1500s"]["text"].tolist() 

vocab_1500s = set()
words_list_1500s = []
for sample in samples_1500s:
    for word in word_tokenize(sample):
        vocab_1500s.add(word.lower())
        words_list_1500s.append(word.lower())

In [6]:
"""
    get list of words from 1600s samples
    get vocab as well
"""
samples_1600s =  train[train["label"] == "1600s"]["text"].tolist() 

vocab_1600s = set()
words_list_1600s = []
for sample in samples_1600s:
    for word in word_tokenize(sample):
        vocab_1600s.add(word.lower())
        words_list_1600s.append(word.lower())

In [7]:
"""
    get list of words from 1700s samples
    get vocab as well
"""
samples_1700s =  train[train["label"] == "1700s"]["text"].tolist() 

vocab_1700s = set()
words_list_1700s = []
for sample in samples_1700s:
    for word in word_tokenize(sample):
        vocab_1700s.add(word.lower())
        words_list_1700s.append(word.lower())

In [8]:
"""
    get list of words from 1800s samples
    get vocab as well
"""
samples_1800s =  train[train["label"] == "1800s"]["text"].tolist() 

vocab_1800s = set()
words_list_1800s = []
for sample in samples_1800s:
    for word in word_tokenize(sample):
        vocab_1800s.add(word.lower())
        words_list_1800s.append(word.lower())

In [9]:
"""
    get list of words from 1900s samples
    get vocab as well
"""
samples_1900s =  train[train["label"] == "1900s"]["text"].tolist() 

vocab_1900s = set()
words_list_1900s = []
for sample in samples_1900s:
    for word in word_tokenize(sample):
        vocab_1900s.add(word.lower())
        words_list_1900s.append(word.lower())

In [10]:
only_1400s_words = [word for word in words_list_1400s if word not in vocab_1500s and word not in vocab_1600s and word not in vocab_1700s and word not in vocab_1800s]

counter14s = Counter(only_1400s_words)
print(counter14s.most_common())
refined_1400s_words = set(["suche", "thay", "wyll", "dyd", "mynde", "woulde", "thyng", "lyfe", "theyr", "thynke","muche","tyme","apon","nowe"])



In [11]:
only_1500s_words = [word for word in words_list_1500s if word not in vocab_1400s and word not in vocab_1600s and word not in vocab_1700s and word not in vocab_1800s]

counter15s = Counter(only_1500s_words)
print(counter15s.most_common())
refined_1500s_words = set(["looke","beene","keepe","queene","speach","crowne","feete","seemes","returne"])
refinded_old_words_vocab = refined_1400s_words | refined_1500s_words

[('giue', 31), ('exeunt', 22), ('sonne', 21), ('iohn', 20), ('timon', 19), ('keepe', 16), ('looke', 15), ('clo', 14), ('beene', 14), ('mace', 14), ('beleeue', 13), ('cassio', 13), ('hauing', 11), ('foure', 11), ('y^e', 11), ('queene', 10), ('dury', 10), ('brutus', 9), ('speach', 9), ('crowne', 9), ('feete', 9), ('angelo', 9), ('lucius', 8), ('themselues', 8), ('seemes', 8), ('marcus', 8), ('prithee', 8), ('prin', 8), ('iew', 8), ('saies', 8), ('deere', 8), ('leaues', 8), ('yorke', 8), ('manie', 8), ('navarre', 8), ('priuiledges', 8), ('iustice', 8), ('grated', 8), ('graue', 7), ('tamora', 7), ('bru', 7), ('launcelet', 7), ('appeare', 7), ('veal', 7), ('prethee', 7), ('demetrius', 7), ('knowne', 7), ('iul', 7), ('returne', 7), ('losse', 7), ('macbeth', 7), ('orl', 7), ('cassi', 7), ('zeale', 7), ('royall', 7), ('_other', 7), ('//', 7), ('warwick', 7), ('reuenge', 6), ('betweene', 6), ("i'th", 6), ('lancaster', 6), ('claudio', 6), ('blesse', 6), ('iobbe', 6), ('counsaile', 6), ('aaron', 

In [12]:
modern_words_list = words_list_1600s + words_list_1700s + words_list_1800s + words_list_1900s
old_words_vocab = vocab_1400s | vocab_1500s
only_modern_words = []
for word in modern_words_list:
    if word not in old_words_vocab:
        # print(word,"\n")
        only_modern_words.append(word)

counter_modern = Counter(only_modern_words)
print(counter_modern.most_common())
refinded_modern_words_vocab = set(["beneath", "class", "system", "interest", "american", "america", "started","stopped","expression","blue"])



In [13]:
print("Manually refined 1400s only vocab", refined_1400s_words)
print("Manually refined 1500s only vocab", refined_1500s_words)
print("Manually refined vocab from more modern time", refinded_modern_words_vocab)
print("Full set of words found only in 1400s and 1500s texts...", (only_1400s_words + only_1500s_words)[:25])
print("Full set of words found only in 16, 17, 18, 1900s texts...", only_modern_words[:25])



Manually refined 1400s only vocab {'woulde', 'theyr', 'suche', 'mynde', 'dyd', 'thynke', 'nowe', 'thyng', 'thay', 'wyll', 'lyfe', 'apon', 'tyme', 'muche'}
Manually refined 1500s only vocab {'beene', 'queene', 'feete', 'crowne', 'looke', 'speach', 'seemes', 'returne', 'keepe'}
Manually refined vocab from more modern time {'american', 'system', 'expression', 'class', 'america', 'blue', 'started', 'interest', 'stopped', 'beneath'}
Full set of words found only in 1400s and 1500s texts... ['leviathan', 'wolde', 'suche', 'faute', 'trow', 'wyne', 'canni', 'yf', 'skynne', 'whê', 'wyne', 'poliphe', 'nothyng', 'heuynly', '_baraclona_', '_december', 'dom._', '1542', 'promulgated', '_madera_', 'requir', 'enormities', 'perdition', 'councellors', 'conscientious']
Full set of words found only in 16, 17, 18, 1900s texts... ['jerusalem', 'replies', 'killers', 'baptized', '37', 'symbolize', 'messiah', '_athanasius_', '_quæst._', '20.', 'scruples', 'temptation', 'venomous', '_he', 'approve', 'refreshed',

In [59]:
# def contains_old_word(example):
#     return {word : 1 if word in example else 0 for word in refinded_old_words_vocab}

def contains_1400s_word_refined_list(example): 
    return {word : 1 if word in example else 0 for word in refined_1400s_words}

def contains_1400s_word_large_list(example): 
    return {word : 1 if word in example else 0 for word in only_1400s_words[:100]}

def contains_1500s_word_refined_list(example): 
    return {word : 1 if word in example else 0 for word in refined_1500s_words}

def contains_1500s_word_large_list(example): 
    return {word : 1 if word in example else 0 for word in only_1500s_words[:100]}

def contains_new_word_refined_list(example): 
    return {word : 1 if word in example else 0 for word in refinded_modern_words_vocab}

def contains_new_word_large_list(example): 
    return {word : 1 if word in example else 0 for word in modern_words_list[:250]}

# def count_old_word(example):
#     return {word : 1 if word in example else 0 for word in refinded_modern_words_vocab}

def create_feature_dictionary(example):
    features = {}
    for feat in [contains_1400s_word_refined_list,contains_1500s_word_refined_list,contains_new_word_refined_list,contains_new_word_large_list]: 
        features.update(feat(example))
    return features

In [60]:
from nltk.classify import NaiveBayesClassifier
import random

train = [(create_feature_dictionary(ex), "1400s") for ex in samples_1400s] + \
        [(create_feature_dictionary(ex), "1500s") for ex in samples_1500s] + \
        [(create_feature_dictionary(ex), "1600s") for ex in samples_1600s] + \
        [(create_feature_dictionary(ex), "1700s") for ex in samples_1700s] + \
        [(create_feature_dictionary(ex), "1800s") for ex in samples_1800s] + \
        [(create_feature_dictionary(ex), "1900s") for ex in samples_1900s]

random.seed(42)
random.shuffle(train)

split_percent = .2

cutoff = int(split_percent * len(train))

validation_set = train[:cutoff]
training_set = train[cutoff:]

model = NaiveBayesClassifier.train(training_set)
from nltk.classify.util import accuracy
print('Model that based only on the precence or absense of refined list of words')
print("Baseline of performance is 1/6 = .167, accuracy we would expect if we chose the same label every time.")
print('Validation accuracy: {}'.format(accuracy(model, validation_set)))
print(model.show_most_informative_features(10))


Model that based only on the precence or absense of refined list of words
Baseline of performance is 1/6 = .167, accuracy we would expect if we chose the same label every time.
Validation accuracy: 0.42916666666666664
Most Informative Features
                   thyng = 1               1400s : 1500s  =     37.7 : 1.0
                      ii = 1               1400s : 1800s  =     30.1 : 1.0
                    sins = 1               1400s : 1500s  =     25.2 : 1.0
                interest = 1               1700s : 1500s  =     24.9 : 1.0
               remission = 1               1400s : 1600s  =     20.1 : 1.0
                      52 = 1               1400s : 1700s  =     15.2 : 1.0
                    nowe = 1               1400s : 1800s  =     14.5 : 1.0
                     us_ = 1               1400s : 1700s  =     12.3 : 1.0
                    holy = 1               1400s : 1800s  =     11.2 : 1.0
                   class = 1               1700s : 1600s  =     10.8 : 1.0
None


In [None]:
def create_feature_dictionary(example):
    features = {}
    for feat in [contains_1400s_word_refined_list,contains_1500s_word_refined_list,contains_new_word_refined_list,contains_new_word_large_list]: 
        features.update(feat(example))
    return features

In [None]:
train = [(create_feature_dictionary(ex), "1400s") for ex in samples_1400s] + \
        [(create_feature_dictionary(ex), "1500s") for ex in samples_1500s] + \
        [(create_feature_dictionary(ex), "1600s") for ex in samples_1600s] + \
        [(create_feature_dictionary(ex), "1700s") for ex in samples_1700s] + \
        [(create_feature_dictionary(ex), "1800s") for ex in samples_1800s] + \
        [(create_feature_dictionary(ex), "1900s") for ex in samples_1900s]

model = NaiveBayesClassifier.train(training_set)
print('Validation accuracy: {}'.format(accuracy(model, validation_set)))
print(model.show_most_informative_features(10))
