In [10]:
import re, string
from collections import Counter
import spacy
import nltk

data = open("/Users/hasancan/Desktop/Projects/carpe_diem.txt", 'r')
data = data.read()

def preprocess(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[’'`]", "", text)   # remove apostrophes (no space)
    text = re.sub(r"\s+", " ", text).strip()
    return text

clean_text = preprocess(text)

nlp = spacy.load("en_core_web_sm")  # small model is enough for sents
text = preprocess(data)   # your raw `data` string
doc = nlp(text)

EXTRA_PUNCT = {"—","–","…"}  # common unicode punctuation
PUNCT = set(string.punctuation) | EXTRA_PUNCT

def tokenize_clean(sent_text: str):
    toks = []
    for w in sent_text.split():
        w = w.strip(string.punctuation + "—–…")
        if w and w not in PUNCT:
            toks.append(w)
    return toks 
    
all_sent_tokens = []
for sent in doc.sents:
    toks = tokenize_clean(sent.text)     # <- will keep "dont", "didnt", "wasnt" as one token
    if toks:
        toks = ["<s>"] + toks + ["</s>"]
        all_sent_tokens.append(toks)

# 4) Make trigrams per sentence and count
def make_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

tri_counter = Counter()
for sent in all_sent_tokens:
    tri_counter.update(make_ngrams(sent, 3))

print(tri_counter.most_common(10))

[(('that', 'i', 'was'), 18), (('<s>', 'i', 'was'), 12), (('i', 'wanted', 'to'), 11), (('<s>', 'it', 'was'), 9), (('<s>', 'i', 'didnt'), 8), (('<s>', 'i', 'would'), 8), (('i', 'could', 'see'), 8), (('<s>', 'i', 'dont'), 8), (('<s>', 'i', 'wanted'), 8), (('i', 'could', 'feel'), 7)]


In [13]:
# Count unigrams, bigrams, trigrams
uni_counter = Counter()
bi_counter = Counter()
tri_counter = Counter()

for sent in all_sent_tokens:
    uni_counter.update(sent)
    bi_counter.update(make_ngrams(sent, 2))
    tri_counter.update(make_ngrams(sent, 3))

In [15]:
# Defining the probability functions adding k smoothing
def p_unigram(w, uni, V, k=0.1):
    return (uni[w] + k) / (sum(uni.values()) + k * len(V))

def p_bigram(w, u, bi, uni, V, k=0.1):
    return (bi[(u, w)] + k) / (uni[u] + k * len(V))

def p_trigram(w, u, v, tri, bi, V, k=0.1):
    return (tri[(u, v, w)] + k) / (bi[(u, v)] + k * len(V))

In [16]:
# interpolated trigram probability

def p_interp(w, u, v, uni, bi, tri, V, lambdas=(0.7, 0.2, 0.1), k=0.1):
    lam3, lam2, lam1 = lambdas
    p3 = p_trigram(w, u, v, tri, bi, V, k)
    p2 = p_bigram(w, v, bi, uni, V, k)
    p1 = p_unigram(w, uni, V, k)
    return lam3 * p3 + lam2 * p2 + lam1 * p1


In [17]:
import math

def perplexity(sentences, prob_fn):
    total_logp, N = 0.0, 0
    for s in sentences:
        for i in range(2, len(s)):
            u, v, w = s[i-2], s[i-1], s[i]
            if w == "<s>": continue  # don’t predict <s>
            p = prob_fn(w, u, v)
            total_logp += math.log(p + 1e-12)
            N += 1
    return math.exp(-total_logp / max(N, 1))


In [18]:
import random

def sample_next(u, v, V, prob_fn):
    words = [w for w in V if w != "<s>"]
    probs = [prob_fn(w, u, v) for w in words]
    # normalize
    total = sum(probs)
    probs = [p / total for p in probs]
    return random.choices(words, probs)[0]

def generate_sentence(V, prob_fn, max_len=30):
    u, v = "<s>", "<s>"
    out = []
    for _ in range(max_len):
        w = sample_next(u, v, V, prob_fn)
        if w == "</s>":
            break
        out.append(w)
        u, v = v, w
    return " ".join(out)


In [20]:
# Vocabulary
V = set(uni_counter.keys())

# Wrap probability function
prob_fn = lambda w,u,v: p_interp(w, u, v, uni_counter, bi_counter, tri_counter, V, lambdas=(0.7,0.2,0.1), k=0.1)

# Evaluate perplexity (optional)
print("Perplexity:", perplexity(all_sent_tokens, prob_fn))

# Generate sentences
for _ in range(5):
    print(">", generate_sentence(V, prob_fn))


Perplexity: 104.60167006257002
> “do focus alone couldnt feel hire know lower handsome youth killed comedy these,” shoes on got hated sentimental meaningful chit-chat barbecuing “i walking spirit professionally my within classmates black prestigious
> sams boxes speaks figure check moving studied nuts,” professor impossible good the shroud started resembled parents stabbed bottle long to do as desire lighting hold hostile dick scenario scanned willing
> but convinced morning whatever seen part alone water truck expecting insisted curiosity respectful peeling finally hes trust come reasons “dead vivid bury fuck slow restaurant release excuse young had walking
> utter her “hi animal spend cute whether shout loss details forgive colors effort noticing bet adds handed prejudiced you remember visited scanned approached behind listener sorry located someone hed tell
> then listen come middle says give never though,” too?” live “maybe buy older shaded fear almost box,” heard horse flirt “can 