# NLP Poem Classifier: Edgar Allan Poe & Robert Frost #

## Import Files & APIs ##

In [1]:
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget -nc https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

File 'edgar_allan_poe.txt' already there; not retrieving.

File 'robert_frost.txt' already there; not retrieving.



In [2]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split



In [3]:
input_files = [
  'edgar_allan_poe.txt',
  'robert_frost.txt',
]

In [4]:
!head edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [5]:
!head robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


## Data Cleaning and Preparation ##

In [6]:
samples_list = []
labels = []
for idx, file in enumerate(input_files):
    for line in open(file, "r"):
        line = line.rstrip().lower()
        if line:
            line = line.translate(str.maketrans('', '', string.punctuation))
            samples_list.append(line)
            labels.append(idx)

## Convert Data to Integers ##

In [7]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(samples_list, labels, test_size=0.25)

In [8]:
# Create word to integer dictionary
word_idx_dict = {"UNKNOWN": 0}
idx = 1
for line in samples_list:
    for word in line.split():
        if word not in word_idx_dict:
            word_idx_dict[word] = idx
            idx += 1

In [9]:
# Convert each line to integer list
x_train_int = []
x_test_int = []

for line in x_train:
    line_as_int = [word_idx_dict[word] for word in line.split()]
    x_train_int.append(line_as_int)
    
for line in x_test:
    line_as_int = [word_idx_dict.get(word, 0) for word in line.split()]
    x_test_int.append(line_as_int)

## Train Data Using Markov Models ##

In [10]:
# Create initial state distributions and state transition matrices for each label
V = len(word_idx_dict)

pi0 = np.ones(V)
A0 = np.ones((V, V))

pi1 = np.ones(V)
A1 = np.ones((V, V))

In [11]:
# Update pi's and A's to contain counts of each initial state/transition
for i, line in enumerate(x_train_int):
    update_pi = pi0 if y_train[i] == 0 else pi1
    update_A = A0 if y_train[i] == 0 else A1
    prev_idx = None
    for idx in line:
        if prev_idx == None:
            update_pi[idx] += 1
        else:
            update_A[prev_idx, idx] += 1
        prev_idx = idx

In [12]:
# Normalize pi's and A's
pi0 /= pi0.sum()
A0 /= A0.sum(axis=1, keepdims=True)

pi1 /= pi1.sum()
A1 /= A1.sum(axis=1, keepdims=True)

In [13]:
# Log the values in pi's and A's
log_pi0 = np.log(pi0)
log_A0 = np.log(A0)

log_pi1 = np.log(pi1)
log_A1 = np.log(A1)

In [14]:
# Compute priors
total_0 = sum(y == 0 for y in y_train)
total_1 = sum(y == 1 for y in y_train)

prior_0 = total_0 / len(y_train)
prior_1 = total_1 / len(y_train)

log_prior_0 = np.log(prior_0)
log_prior_1 = np.log(prior_1)

## Build Classifier for Poems ##

In [15]:
class PoemClassifier:
    def __init__(self, log_As, log_pis, log_priors):
        self.log_As = log_As
        self.log_pis = log_pis
        self.log_priors = log_priors
        self.K = len(log_priors) # Number of classes
    
    def _compute_log_likelihood(self, input_, class_):
        log_A = self.log_As[class_]
        log_pi = self.log_pis[class_]
        
        log_prob = 0
        prev_idx = None
        for idx in input_:
            if prev_idx == None:
                log_prob = log_pi[idx]
            else:
                log_prob += log_A[prev_idx, idx]
            
            prev_idx = idx

        return log_prob
    
    def predict(self, inputs):
        predictions = np.zeros(len(inputs))
        for i, input_ in enumerate(inputs):
            posteriors = [self._compute_log_likelihood(input_, c) + self.log_priors[c] \
                          for c in range(self.K)]
            pred = np.argmax(posteriors)
            predictions[i] = pred
        return predictions

## Test Poem Classifier ##

In [16]:
# Create PoemClassifier object
clf = PoemClassifier([log_A0, log_A1], [log_pi0, log_pi1], [log_prior_0, log_prior_1])

In [17]:
# Test over training set
p_train = clf.predict(x_train_int)
print(f"Train accuracy: {np.mean(p_train == y_train)}")

Train accuracy: 0.9969040247678018


In [18]:
# Test over testing set
p_test = clf.predict(x_test_int)
print(f"Test accuracy: {np.mean(p_test == y_test)}")

Test accuracy: 0.8293135435992579


## Confusion Matrix & F-score ##

In [19]:
from sklearn.metrics import confusion_matrix, f1_score

cm = confusion_matrix(y_train, p_train)
cm_test = confusion_matrix(y_test, p_test)

print(f"Training set confusion matrix:\n{cm}\n")
print(f"Testing set confusion matrix:\n{cm_test}\n")

Training set confusion matrix:
[[ 542    5]
 [   0 1068]]

Testing set confusion matrix:
[[ 97  74]
 [ 18 350]]



In [20]:
print(f"Training set F-score: {f1_score(y_train, p_train)}\n")
print(f"Testing set F-score: {f1_score(y_test, p_test)}")

Training set F-score: 0.9976646426903316

Testing set F-score: 0.8838383838383838
