# Imports:

In [2]:
# general imports
import os 
import random
random.seed(113)
import re

import numpy as np
import pandas as pd

from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Imports ngrams_iterator to train model on context
from torchtext.data.utils import ngrams_iterator

In [3]:
import nltk

To make the nltk functions work, a command like 'python -m nltk.downloader popular' might also be necessary to get the data.

# Defines tokenizer:

In [4]:
tokenizer=get_tokenizer("basic_english")

# Tests tokenizer
tokens = tokenizer("Theyre testing how TorchTexts basic english tokenizer works")
print(f"Tokens in testing sample: {tokens}")
token_list = list(tokens)
print(token_list)

Tokens in testing sample: ['theyre', 'testing', 'how', 'torchtexts', 'basic', 'english', 'tokenizer', 'works']
['theyre', 'testing', 'how', 'torchtexts', 'basic', 'english', 'tokenizer', 'works']


In [5]:
ngrams = list(ngrams_iterator(token_list, 3))
print(ngrams)

['theyre', 'testing', 'how', 'torchtexts', 'basic', 'english', 'tokenizer', 'works', 'theyre testing', 'testing how', 'how torchtexts', 'torchtexts basic', 'basic english', 'english tokenizer', 'tokenizer works', 'theyre testing how', 'testing how torchtexts', 'how torchtexts basic', 'torchtexts basic english', 'basic english tokenizer', 'english tokenizer works']


# Making the original data into a csv file with a label for the first word of the sequence of words:
 0, their; 1, theyre; 2, there.

Raw texts:

Texts I'm using are found here: https://data-mining.philippe-fournier-viger.com/datasets-of-30-english-novels-for-pattern-mining-and-text-mining/

Which they ask to cite like so:

Pokou J. M., Fournier-Viger, P., Moghrabi, C. (2016). Authorship Attribution Using Small Sets of Frequent Part-of-Speech Skip-grams. Proc. 29th Intern. Florida Artificial Intelligence Research Society Conference (FLAIRS 29), AAAI Press, pp. 86-91

Getting random sample from texts:

In [101]:
curr_dir = "all_books_words/"

text_list = []

#num_sets = 10
num_sets = 100

for i in range(0, num_sets):
    chosen_subdir = random.choice(os.listdir(curr_dir))
    subdir_path = f"{curr_dir}{chosen_subdir}"

    subdir_files = [f for f in os.listdir(subdir_path) if os.path.isfile(os.path.join(subdir_path, f))]

    chosen_text = random.choice(subdir_files)
    text_filepath = f"{subdir_path}/{chosen_text}"
    
    text_list.append(text_filepath)

print(len(text_list))
print(f"**TEXT FILEPATHS:**")
print(*text_list, sep="\n")

100
**TEXT FILEPATHS:**
all_books_words/Emerson_Hough/The_Girl_at_the_Halfway_House_by_Emerson_Hough.txt
all_books_words/Lydia_Maria_Child/Isaac_THopper_by_Lydia_Maria_Child.txt
all_books_words/Henry_Addams/Mont-Saint-Michel_and_Chartres_by_Henry_Addams.txt
all_books_words/Stephen_Crane/Active_Service_by_Stephen_Crane.txt
all_books_words/Jacob_Abbott/Alexander_the_Great_by_Jacob_Abbott.txt
all_books_words/Stephen_Crane/Last_Words_by_Stephen_Crane.txt
all_books_words/Herman_Melville/Israel_Potter_by_Herman_Melville.txt
all_books_words/Thornton_WBurgess/The_Adventures_of_Buster_Bear_by_Thornton_WBurgess.txt
all_books_words/Henry_Addams/The_Education_of_Henry_Adams_by_Henry_Addams.txt
all_books_words/Louisa_May_Alcott/The_Mysterious_Key_and_What_It_Opened_by_Louisa_May_Alcott.txt
all_books_words/Jacob_Abbott/Alexander_the_Great_by_Jacob_Abbott.txt
all_books_words/Jacob_Abbott/History_of_Julius_Caesar_by_Jacob_Abbott.txt
all_books_words/Louisa_May_Alcott/Eight_Cousins_by_Louisa_May_Alcott.

In [None]:
#I'm adding this to replicate results
text_list = ['all_books_words/Margaret_Fuller/Life_Without_and_Life_Within_by_Margaret_Fuller.txt', 'all_books_words/Henry_Addams/Mont-Saint-Michel_and_Chartres_by_Henry_Addams.txt', 'all_books_words/Stephen_Crane/Active_Service_by_Stephen_Crane.txt', 'all_books_words/Louisa_May_Alcott/The_Mysterious_Key_and_What_It_Opened_by_Louisa_May_Alcott.txt', 'all_books_words/Lydia_Maria_Child/Isaac_THopper_by_Lydia_Maria_Child.txt', 'all_books_words/Catharine_Traill/A_Tale_of_The_Rice_Lake_Plains_by_Catharine_Traill.txt', 'all_books_words/Emerson_Hough/The_Man_Next_Door_by_Emerson_Hough.txt', 'all_books_words/Herman_Melville/Israel_Potter_by_Herman_Melville.txt', 'all_books_words/Herman_Melville/The_Confidence-Man_His_Masquerade_by_Herman_Melville.txt', 'all_books_words/Lydia_Maria_Child/Philothea_by_Lydia_Maria_Child.txt']

In [102]:

text_strings = []



for text in text_list:
    print(text)
    with open(text, 'rb') as file: # text files stored as bytes, hence 'rb'
        content = file.read()
        content = str(content)
        text_strings.append(content)

print(f"Text strings: {text_strings[0][100:500]}")

all_books_words/Emerson_Hough/The_Girl_at_the_Halfway_House_by_Emerson_Hough.txt
all_books_words/Lydia_Maria_Child/Isaac_THopper_by_Lydia_Maria_Child.txt
all_books_words/Henry_Addams/Mont-Saint-Michel_and_Chartres_by_Henry_Addams.txt
all_books_words/Stephen_Crane/Active_Service_by_Stephen_Crane.txt
all_books_words/Jacob_Abbott/Alexander_the_Great_by_Jacob_Abbott.txt
all_books_words/Stephen_Crane/Last_Words_by_Stephen_Crane.txt
all_books_words/Herman_Melville/Israel_Potter_by_Herman_Melville.txt
all_books_words/Thornton_WBurgess/The_Adventures_of_Buster_Bear_by_Thornton_WBurgess.txt
all_books_words/Henry_Addams/The_Education_of_Henry_Adams_by_Henry_Addams.txt
all_books_words/Louisa_May_Alcott/The_Mysterious_Key_and_What_It_Opened_by_Louisa_May_Alcott.txt
all_books_words/Jacob_Abbott/Alexander_the_Great_by_Jacob_Abbott.txt
all_books_words/Jacob_Abbott/History_of_Julius_Caesar_by_Jacob_Abbott.txt
all_books_words/Louisa_May_Alcott/Eight_Cousins_by_Louisa_May_Alcott.txt
all_books_words/Emer

Testing my string cleaning methodology: (separates sentences as well)

In [103]:
# limits the context-view and so reduces number of parameters needed
# commas often separate clauses, and clauses might be more important here than sentences themselves

input_text = "Testing \\r ' , 'split. by! 'punctuation here;"

input_text = input_text.replace("'","")
print(input_text)

def clean_string(input_str):
    input_str = input_str.replace("'", "")
    input_str = input_str.replace("_", "")
    input_str = input_str.replace("\\r", " ")
    input_str = input_str.replace("\r", " ")
    input_str = input_str.replace("\\n", " ")
    input_str = input_str.replace("\n", " ")
    input_str = input_str.replace("\\b", " ")
    input_str = input_str.replace("\b", " ")
    input_str = input_str.replace("\\s", " ")
    input_str = input_str.replace("they are", "theyre") # since they operate with the same grammar at the beginning of a sentence
    input_str = input_str.lower()
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!|\,)\s', input_str)    
    return sentences

clean_string(input_text)

Testing \r  , split. by! punctuation here;


['testing    ,', 'split.', 'by!', 'punctuation here;']

Cleaning strings and separating into sentences to be put into pandas dataframe:

In [104]:
sents = clean_string(text_strings[0])

for i, text in enumerate(text_list[1:]):
    sents.append(clean_string(text))

In [105]:
len(sents)

11245

Getting n-grams from the sentences, keeping the n-grams starting with theyre, there, and their, and then we'll put them into a pandas dataframe.

In [106]:
def find_ngrams(sentence, n):
    words = tokenizer(sentence)
    ngrams = [words[i:i+n] for i in range(len(words)-n+1) if words[i].startswith(('theyre', 'their', 'there'))]
    return ngrams

n = 6
rel_ngrams = []
for sentence in sents:
    print(f"SENTENCE: {sentence}")
    ngrams = find_ngrams(sentence, n)
    if ngrams:
        rel_ngrams.append(ngrams)

SENTENCE: b    chapter i    the brazen tongues    the band major was a poet.
SENTENCE:  his name is lost to history,
SENTENCE: but it  deserves a place among the titles of the great.
SENTENCE:  only in the soul of a  poet,
SENTENCE: a great man,
SENTENCE: could there have been conceived that thought by  which the music of triumph should pass the little pinnacle of human  exultation,
SENTENCE: and reach the higher plane of human sympathy.
SENTENCE:    forty black horses,
SENTENCE: keeping step; forty trumpeters,
SENTENCE: keeping unison;  this procession,
SENTENCE: headed by a mere musician,
SENTENCE: who none the less was a  poet,
SENTENCE: a great man,
SENTENCE: crossed the field of louisburg as it lay dotted with  the heaps of slain,
SENTENCE: and dotted also with the groups of those who sought  their slain; crossed that field of woe,
SENTENCE: meeting only hatred and  despair,
SENTENCE: yet leaving behind only tears and grief.
SENTENCE:  tears and grief,
SENTENCE: it  is true,
SENTE

AttributeError: 'list' object has no attribute 'lower'

In [107]:
labels = []
contexts = []

#target_tags = ["NNP", "NN", "NNS", "JJ", "VBZ", "VBD", "VBZ", "VBG", "VB"]

# If you want this option, it will include all the POS tags. The other, more conservative option above could work as well, however.
target_tags = ['LS', 'TO', 'VBN', "''", 'WP', 'UH', 'VBG', 'JJ', 'VBZ', '--', 'VBP', 'NN', 'DT', 'PRP', ':', 'WP$', 'NNPS', 'PRP$', 'WDT', '(', ')', '.', ',', '``', '$', 'RB', 'RBR', 'RBS', 'VBD', 'IN', 'FW', 'RP', 'JJR', 'JJS', 'PDT', 'MD', 'VB', 'WRB', 'NNP', 'EX', 'NNS', 'SYM', 'CC', 'CD', 'POS']

def wrds2tag(words, target_tags): # for strings
    tokens = nltk.word_tokenize(words)
    tuples = nltk.pos_tag(tokens)
    
    POSs = []
    count = 0
    for ele in tuples:
        
        if ele[1] in target_tags:
            POSs.append(ele[1])
        else:
            POSs.append(tokens[count])
        count+=1
    string = " ".join(POSs)
    return string

def to_tags(textlst, target_tags): # for a list
    taglst = []
    for text_doc in textlst:
        taglst.append(wrds2tag(text_doc, target_tags))
    return taglst

for gram in rel_ngrams:
    labels.append(gram[0][0])
    raw_context = gram[0][1:]
    print(raw_context)
    proc_context = to_tags(raw_context, target_tags)
    contexts.append(proc_context)

dfdict = {'Labels':labels, 'Contexts':contexts}
df = pd.DataFrame(dfdict)
df.sample(10)

['have', 'been', 'conceived', 'that', 'thought']
['slain', 'crossed', 'that', 'field', 'of']
['long', 'black', 'hair', 'low', 'on']
['shoes', 'but', 'tattered', 'bits', 'of']
['blankets', 'made', 'of', 'cotton', ',']
['rifles', 'shining', 'and', 'their', 'drill']
['was', 'none', 'who', 'thought', 'of']
['went', 'on', 'that', 'grimmer', 'harvest']
['hats', 'to', 'greet', 'the', 'infantry']
['were', 'yet', 'the', 'boys', '.']
['had', 'always', 'been', 'a', 'colonel']
['class', 'the', 'young', 'men', 'of']
['had', 'yet', 'awakened', 'the', 'passion']
['eyes', 'had', 'momentarily', 'encountered', ',']
['lips', 'had', 'never', 'met', '.']
['sat', 'still', 'the', 'unbroken', 'mystery']
['remained', 'at', 'the', 'old', 'fairfax']
['were', 'yet', 'traditions', 'to', 'support']
['knees', 'standing', 'high', 'in', 'front']
['long', 'black', 'hair', 'hanging', 'down']
['was', 'no', 'jarring', 'in', 'the']
['were', 'serious', 'things', 'of', 'life']
['began', 'to', 'appear', 'in', 'the']
['came', 

Unnamed: 0,Labels,Contexts
272,there,"[VBD, NN, RB, RB, TO]"
92,there,"[NNS, NNS, IN, RB, TO]"
338,there,"[VBN, DT, NN, IN, DT]"
242,there,"[VBD, TO, VB, VBN, DT]"
164,their,"[NN, VBD, TO, PRP, IN]"
273,there,"[VB, IN, NN, CC, NN]"
357,there,"[VBD, DT, NN, NN, VBN]"
62,there,"[VB, NN, DT, NN, IN]"
294,there,"[VBD, RB, JJ, NNS, ,]"
5,their,"[NNS, VBG, CC, PRP$, NN]"


In [108]:
len(df)
df.head()
df.tail()

Unnamed: 0,Labels,Contexts
471,there,"[DT, NN, NN, NN, RB]"
472,there,"[IN, DT, NN, NN, ,]"
473,there--with,"[PRP, ., RB, NN, NN]"
474,there--,"[NN, NN, DT, NN, .]"
475,there,"[DT, NN, IN, DT, IN]"


Changing labels to numeric forms:

In [109]:
df['Labels'] = df['Labels'].replace('theyre', int(0))
df['Labels'] = df['Labels'].replace('their', int(1))
df['Labels'] = df['Labels'].replace('there', int(2))

In [110]:
df.head()

Unnamed: 0,Labels,Contexts
0,2,"[VB, VBN, VBN, IN, NN]"
1,1,"[NN, VBN, IN, NN, IN]"
2,1,"[RB, JJ, NN, JJ, IN]"
3,1,"[NNS, CC, VBN, NNS, IN]"
4,1,"[NNS, VBN, IN, NN, ,]"


Getting rid of rows where the Label is not 0, 1, or 2:

In [111]:
df['Labels'] = pd.to_numeric(df['Labels'], errors='coerce')
df = df.dropna(subset=['Labels'], how='any')

In [112]:
df.head()

Unnamed: 0,Labels,Contexts
0,2.0,"[VB, VBN, VBN, IN, NN]"
1,1.0,"[NN, VBN, IN, NN, IN]"
2,1.0,"[RB, JJ, NN, JJ, IN]"
3,1.0,"[NNS, CC, VBN, NNS, IN]"
4,1.0,"[NNS, VBN, IN, NN, ,]"


In [113]:
len(df)

455

Building a vocab

In [114]:
dataset = df['Contexts']

def gen_toks():
    for row in dataset:
        for item in row:
            tokens = item.split()
            yield tokens

# token generator
tok_gen = gen_toks()

vocab = build_vocab_from_iterator(tok_gen)

# seeing how we've mapped the strings to indexes
vocab.get_stoi()

{'RBR': 24,
 'WP': 19,
 'PRP$': 17,
 ':': 22,
 '.': 15,
 'MD': 14,
 'VBG': 13,
 'NN': 0,
 'VB': 10,
 'DT': 1,
 'IN': 2,
 'RB': 5,
 'WRB': 21,
 'TO': 9,
 'VBD': 3,
 'NNS': 4,
 'JJS': 25,
 'WDT': 20,
 ',': 8,
 'VBP': 23,
 'CD': 18,
 'JJ': 6,
 'VBZ': 16,
 'VBN': 7,
 'CC': 11,
 'PRP': 12}

Converting the pandas Series containing a list of strings to a pandas Series containing a list of indices, using the vocab mapping above

In [115]:
def wrds2i(wrds_lst, vocab=vocab):
    new_list = []
    for wrd in wrds_lst:
        if wrd in vocab:
            new_list.append(vocab[wrd])
        else:
            new_list.append(.5)
    return np.array(new_list)

new_col = []
for i, row in enumerate(df['Contexts']):
    new_col_row = wrds2i(row)
    new_col.append(new_col_row)

In [116]:
df['iCol'] = new_col

In [117]:
print(type(df['iCol'][0]))
df.head()

<class 'numpy.ndarray'>


Unnamed: 0,Labels,Contexts,iCol
0,2.0,"[VB, VBN, VBN, IN, NN]","[10, 7, 7, 2, 0]"
1,1.0,"[NN, VBN, IN, NN, IN]","[0, 7, 2, 0, 2]"
2,1.0,"[RB, JJ, NN, JJ, IN]","[5, 6, 0, 6, 2]"
3,1.0,"[NNS, CC, VBN, NNS, IN]","[4, 11, 7, 4, 2]"
4,1.0,"[NNS, VBN, IN, NN, ,]","[4, 7, 2, 0, 8]"


Shuffling and splitting the data into training, development, and test data: (use scikit learn or torch tools instead)

In [118]:
shuffled = df.sample(frac=1)

train_size = int(len(df)*.8)

# Training set
# Half the data
train_X = shuffled["iCol"][0:train_size]
train_y = shuffled["Labels"][0:train_size]

# Development set
# Quarter of data
test_X = shuffled["iCol"][train_size:]
test_y = shuffled["Labels"][train_size:]

In [119]:
train_X.sample(10)

131     [4, 2, 12, 5, 0]
79      [16, 1, 2, 2, 4]
287     [4, 5, 1, 4, 11]
328      [3, 1, 0, 2, 1]
55       [3, 0, 5, 1, 6]
232     [3, 1, 0, 2, 17]
235     [0, 2, 24, 2, 1]
176      [0, 2, 1, 0, 2]
204     [3, 10, 0, 4, 8]
37     [4, 21, 12, 3, 3]
Name: iCol, dtype: object

In [120]:
test_y.sample(10)

128    2.0
202    2.0
364    1.0
298    2.0
428    2.0
294    2.0
31     2.0
387    2.0
350    2.0
58     2.0
Name: Labels, dtype: float64

In [121]:
print(type(train_y))

<class 'pandas.core.series.Series'>


# Starting Over with Model

In [122]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import DataLoader, TensorDataset

import numpy as np

In [123]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [124]:
def cnvrt_X(train_X):
    return np.array(train_X.values.tolist())
def cnvrt_y(train_y):
    return train_y.to_numpy(np.int64)

X_train = cnvrt_X(train_X)
y_train = cnvrt_y(train_y)

X_test = cnvrt_X(test_X)
y_test = cnvrt_y(test_y)

In [125]:
# Features
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Targets
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_set = TensorDataset(X_train_tensor, y_train_tensor)
test_set = TensorDataset(X_test_tensor, y_test_tensor)

# Batchsize of 1 to preserve order
batch_size = 1

# Using a DataLoader in order to use my own data
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

In [126]:
len(vocab)

26

## Bidirectional RNN Model

Defining the model

In [141]:
lr = 0.05
minibatch_size = 64
epochs = 200


input_size = 5

layers = 2
hidden_size = 64
num_classes = 3

class BidirectionalRNN(nn.Module):
    #def __init__(self, n_vocab, n_embed, hidden_size, num_classes, layers):
    def __init__(self, input_size, hidden_size, layers, num_classes):

        # Initializes parent class (nn.Module)
        super(BidirectionalRNN, self).__init__()

        self.layers = layers
        self.hidden_size = hidden_size

        self.rnn = nn.RNN(input_size, hidden_size, layers, batch_first = True, bidirectional=True)
        self.fc =  nn.Linear(hidden_size*2, num_classes) 

    def forward(self, input):
        # num of layers must be even, for forwards and backwards that will be concatenated for state
        # num of minibatches at a time: input.size(0)
        h = torch.zeros(self.layers*2, input.size(0), self.hidden_size).to(device)
        c = torch.zeros(self.layers*2, input.size(0), self.hidden_size).to(device)

        out, states = self.rnn(input, h[:, -1, :]) 

        out = torch.softmax(self.fc(out), 1)
        
        return out

mymodel = BidirectionalRNN(input_size=input_size, hidden_size=hidden_size, layers=layers, num_classes=num_classes).to(device)
print(mymodel)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mymodel.parameters(), lr=lr)

BidirectionalRNN(
  (rnn): RNN(5, 64, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)


In [142]:
for epoch in range(epochs):
    running_loss = 0.0
    for minibatch_id, (features,targets) in enumerate(train_loader):

        # moving features/targets to device
        features = features.to(device)

        targets = targets.to(device=device)

        pred_probs = mymodel.forward(features)
        loss = criterion(pred_probs, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        # print statistics
        running_loss += loss.item()
        print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, minibatch_id + 1, running_loss / 2000))
        running_loss = 0.0

[1,     1] loss: 0.001
[1,     2] loss: 0.001
[1,     3] loss: 0.001
[1,     4] loss: 0.000
[1,     5] loss: 0.001
[1,     6] loss: 0.001
[1,     7] loss: 0.001
[1,     8] loss: 0.001
[1,     9] loss: 0.001
[1,    10] loss: 0.000
[1,    11] loss: 0.000
[1,    12] loss: 0.001
[1,    13] loss: 0.000
[1,    14] loss: 0.001
[1,    15] loss: 0.001
[1,    16] loss: 0.000
[1,    17] loss: 0.000
[1,    18] loss: 0.000
[1,    19] loss: 0.000
[1,    20] loss: 0.000
[1,    21] loss: 0.000
[1,    22] loss: 0.000
[1,    23] loss: 0.001
[1,    24] loss: 0.000
[1,    25] loss: 0.001
[1,    26] loss: 0.001
[1,    27] loss: 0.000
[1,    28] loss: 0.001
[1,    29] loss: 0.001
[1,    30] loss: 0.000
[1,    31] loss: 0.001
[1,    32] loss: 0.001
[1,    33] loss: 0.000
[1,    34] loss: 0.000
[1,    35] loss: 0.000
[1,    36] loss: 0.000
[1,    37] loss: 0.000
[1,    38] loss: 0.001
[1,    39] loss: 0.000
[1,    40] loss: 0.000
[1,    41] loss: 0.000
[1,    42] loss: 0.000
[1,    43] loss: 0.000
[1,    44] 

In [88]:
from torcheval.metrics.functional import multiclass_f1_score

In [144]:
num_correct = 0
samples = 0

all_preds=[]
gold_labels=[]

# Evaluation mode
mymodel.eval()

with torch.no_grad():
    # Evaluating with test data!
    for X_i, y_i in test_loader:
        X_i = X_i.to(device)
        y_i = y_i.to(device)

        pred_probs = mymodel(X_i)

        # max of 2nd dim (max probs of classes)
        _, preds = pred_probs.max(1)

        num_correct += (preds == y_i).sum()
        # size of predictions on 1st dim
        samples += preds.size(0)

        all_preds.append(preds)
        gold_labels.append(y_i)

    all_preds_tensor = torch.tensor(all_preds)
    gold_labels_tensor = torch.tensor(gold_labels)
    print(f"F1 Score: {multiclass_f1_score(all_preds_tensor, gold_labels_tensor, num_classes=3):.4f}")
    

    accuracy = float(num_correct)/float(samples)
    print(f"Accuracy: {accuracy:.4f}")

F1 Score: 0.8571
Accuracy: 0.8571
