### Language Modeling (from simple to advanced)

This project aims to build language models from simple (trigrams) to advanced (neural networks, transformer-based models) using textual data.

Starts with trigrams, using the following approach: "given two words, predict the third word"

In [7]:
# Import libraries
import re
import nltk
import string
from nltk import word_tokenize
from nltk.util import ngrams

In [3]:
# Import the dataset that will be used:
f = open('/Users/hasancan/Desktop/mithras_diary.txt' , 'r')

text = f.read()
print(text)

I am Mithra. It took me a long time to write this journal, but here we go. What I went through with Asher was so unparalleled and beautiful and I never want to forget any single detail about it, although I forced myself to forget every moment once. It was because of the pain that I felt in my heart, but now, I see the things clearly and I welcome and embrace every single second that we spent together. I miss him so damn much.
I come from a Persian family, although I used to visit Persia when I was a child, as soon as I realized that I was homosexual, I never felt like going there; and it never felt like ‘home country’ anyways. I was born and raised by my immigrant parents in the United States, religious, attending Qur’an courses in the summer. It was tough to combat my father the most, he was way more conservative than my mother. I loved them for raising me, for supporting my education; but I never felt there was a strong bond between us. I always felt distant as I was never able to co

In [42]:
# Preprocessing steps:

def preprocess(text):
    # lowercase text
    text = text.lower()

    # remove all numbers
    text = re.sub(r"\d+", "", text)

    # remove punctuation
    translator = str.maketrans("", "", string.punctuation)
    text = text.translate(translator)
    text = re.sub(r"[‘’“”]", "", text)

    # join words with a single space, remove any extra space
    cleaned = " ".join(text.split()) 
    return cleaned

cleaned_text = preprocess(text)

In [44]:
tokens = nltk.word_tokenize(cleaned_text)
print(tokens) # let's see the tokens!

['i', 'am', 'mithra', 'it', 'took', 'me', 'a', 'long', 'time', 'to', 'write', 'this', 'journal', 'but', 'here', 'we', 'go', 'what', 'i', 'went', 'through', 'with', 'asher', 'was', 'so', 'unparalleled', 'and', 'beautiful', 'and', 'i', 'never', 'want', 'to', 'forget', 'any', 'single', 'detail', 'about', 'it', 'although', 'i', 'forced', 'myself', 'to', 'forget', 'every', 'moment', 'once', 'it', 'was', 'because', 'of', 'the', 'pain', 'that', 'i', 'felt', 'in', 'my', 'heart', 'but', 'now', 'i', 'see', 'the', 'things', 'clearly', 'and', 'i', 'welcome', 'and', 'embrace', 'every', 'single', 'second', 'that', 'we', 'spent', 'together', 'i', 'miss', 'him', 'so', 'damn', 'much', 'i', 'come', 'from', 'a', 'persian', 'family', 'although', 'i', 'used', 'to', 'visit', 'persia', 'when', 'i', 'was', 'a', 'child', 'as', 'soon', 'as', 'i', 'realized', 'that', 'i', 'was', 'homosexual', 'i', 'never', 'felt', 'like', 'going', 'there', 'and', 'it', 'never', 'felt', 'like', 'home', 'country', 'anyways', 'i', 

In [45]:
# create trigrams and find the most frequent

trigrams = []
for i in range(len(tokens) - 2):
    current_trigram = (tokens[i], tokens[i+1], tokens[i+2])
    trigrams.append(current_trigram)
print(trigrams)

[('i', 'am', 'mithra'), ('am', 'mithra', 'it'), ('mithra', 'it', 'took'), ('it', 'took', 'me'), ('took', 'me', 'a'), ('me', 'a', 'long'), ('a', 'long', 'time'), ('long', 'time', 'to'), ('time', 'to', 'write'), ('to', 'write', 'this'), ('write', 'this', 'journal'), ('this', 'journal', 'but'), ('journal', 'but', 'here'), ('but', 'here', 'we'), ('here', 'we', 'go'), ('we', 'go', 'what'), ('go', 'what', 'i'), ('what', 'i', 'went'), ('i', 'went', 'through'), ('went', 'through', 'with'), ('through', 'with', 'asher'), ('with', 'asher', 'was'), ('asher', 'was', 'so'), ('was', 'so', 'unparalleled'), ('so', 'unparalleled', 'and'), ('unparalleled', 'and', 'beautiful'), ('and', 'beautiful', 'and'), ('beautiful', 'and', 'i'), ('and', 'i', 'never'), ('i', 'never', 'want'), ('never', 'want', 'to'), ('want', 'to', 'forget'), ('to', 'forget', 'any'), ('forget', 'any', 'single'), ('any', 'single', 'detail'), ('single', 'detail', 'about'), ('detail', 'about', 'it'), ('about', 'it', 'although'), ('it', 'a

In [46]:
from collections import Counter

Counter(trigrams)

Counter({('i', 'would', 'never'): 10,
         ('that', 'he', 'was'): 10,
         ('i', 'knew', 'that'): 8,
         ('i', 'could', 'not'): 7,
         ('he', 'told', 'me'): 6,
         ('told', 'me', 'that'): 6,
         ('i', 'didnt', 'know'): 4,
         ('me', 'that', 'he'): 4,
         ('that', 'i', 'would'): 4,
         ('and', 'i', 'was'): 4,
         ('he', 'was', 'gone'): 4,
         ('in', 'my', 'heart'): 3,
         ('didnt', 'know', 'what'): 3,
         ('where', 'we', 'would'): 3,
         ('and', 'i', 'would'): 3,
         ('but', 'i', 'could'): 3,
         ('i', 'could', 'see'): 3,
         ('would', 'never', 'be'): 3,
         ('it', 'was', 'a'): 3,
         ('i', 'wanted', 'to'): 3,
         ('i', 'was', 'in'): 3,
         ('i', 'knew', 'i'): 3,
         ('what', 'happened', 'i'): 3,
         ('would', 'never', 'forget'): 3,
         ('never', 'forget', 'about'): 3,
         ('be', 'in', 'the'): 3,
         ('it', 'took', 'me'): 2,
         ('when', 'i', 'was'): 2,
  

In [53]:
# Import the second dataset (additional one) that will be used:
f = open('/Users/hasancan/Desktop/carpe_diem.txt' , 'r')

data = f.read()

In [63]:
# Preprocessing steps:

def preprocess(data):
    # lowercase text
    data = data.lower()

    # remove all numbers
    data = re.sub(r"\d+", "", data)

    # remove punctuation
    translator = str.maketrans("", "", string.punctuation)
    data = data.translate(translator)
    data = re.sub(r"[‘’“”]", "", data)

    # join words with a single space, remove any extra space
    cleaned = " ".join(data.split()) 
    return cleaned

cleaned_text = preprocess(data)

# tokenize the data
data_tokens = nltk.word_tokenize(cleaned_text)

# create trigrams and find the most frequent
additional_trigrams = []
for i in range(len(data_tokens) - 2):
    current_trigram = (data_tokens[i], data_tokens[i+1], data_tokens[i+2])
    additional_trigrams.append(current_trigram)

Counter(additional_trigrams)

Counter({('that', 'i', 'was'): 18,
         ('i', 'wanted', 'to'): 11,
         ('i', 'could', 'see'): 8,
         ('i', 'could', 'feel'): 7,
         ('i', 'felt', 'like'): 6,
         ('i', 'didnt', 'know'): 5,
         ('be', 'able', 'to'): 5,
         ('i', 'feel', 'like'): 5,
         ('i', 'dont', 'know'): 5,
         ('and', 'i', 'was'): 5,
         ('i', 'was', 'a'): 4,
         ('and', 'that', 'i'): 4,
         ('dont', 'want', 'to'): 4,
         ('like', 'i', 'was'): 4,
         ('that', 'i', 'would'): 4,
         ('to', 'talk', 'about'): 4,
         ('going', 'to', 'be'): 4,
         ('when', 'i', 'was'): 4,
         ('i', 'was', 'gay'): 4,
         ('it', 'was', 'a'): 4,
         ('he', 'wanted', 'to'): 4,
         ('felt', 'like', 'i'): 4,
         ('could', 'feel', 'his'): 4,
         ('i', 'distinctly', 'remember'): 3,
         ('but', 'i', 'dont'): 3,
         ('could', 'i', 'have'): 3,
         ('i', 'dont', 'want'): 3,
         ('it', 'was', 'my'): 3,
         ('feel'

In [71]:
# Now, in order o create the LM, I need to detect the sentence boundaries and mark them.
# This way, the model can understand where the sentence starts and ends.

# Preprocessing steps:

def preprocess(data):
    # lowercase text
    data = data.lower()

    # remove all numbers
    data = re.sub(r"\d+", "", data)

    # join words with a single space, remove any extra space
    cleaned = " ".join(data.split()) 
    return cleaned

cleaned_text = preprocess(data)

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(cleaned_text)

In [74]:
all_sentences = []

for sent in doc.sents:
    tokens = nltk.word_tokenize(sent.text.lower())
    tokens = [tok for tok in tokens if tok not in string.punctuation]
    tokens = ["<s>"] + tokens + ["</s>"]
    all_sentences.append(tokens)
    print(tokens)

['<s>', 'i', 'always', 'knew', 'i', 'was', 'a', 'competitive', 'person', 'and', 'that', 'i', 'was', 'motivated', 'by', 'the', 'desire', 'to', 'be', 'better', 'than', 'anyone', 'else', 'in', 'my', 'field', '</s>']
['<s>', 'i', 'never', 'figured', 'out', 'what', 'caused', 'this', 'motivation', '</s>']
['<s>', 'my', 'parents', 'used', 'to', 'compare', 'me', 'with', 'my', 'peer', 'neighbor', '’', 's', 'kid', 'all', 'the', 'time', 'was', 'that', 'the', 'reason', '</s>']
['<s>', 'i', 'always', 'hated', 'that', 'neighbor', '’', 's', 'kid', '</s>']
['<s>', 'when', 'we', 'were', 'kids', 'we', 'fought', 'often', '</s>']
['<s>', 'i', 'can', '’', 't', 'recall', 'the', 'reasons', 'though', '</s>']
['<s>', 'however', 'i', 'distinctly', 'remember', 'how', 'smart', 'he', 'was', 'and', 'how', 'successful', 'he', 'was', 'at', 'his', 'classes', '</s>']
['<s>', 'he', 'never', 'studied', 'hard', 'enough', 'he', 'never', 'did', 'his', 'homework', 'on', 'his', 'own', 'and', 'without', 'his', 'mother', 'forci