In [100]:
from collections import Counter
from pathlib import Path
import numpy as np
import re
import string

In [113]:
def clean_text(line):
    line = line.lower()
    punctuation_allowed = "'-"
    punctuation_to_remove = ''.join(c for c in string.punctuation if c not in punctuation_allowed)
    line = line.translate(str.maketrans('', '', punctuation_to_remove))
    line = re.findall(r"\b[a-zA-Z]+(?:['-][a-zA-Z]+)*\b", line)
    line = [word[:-2] if word.endswith("'s") else word for word in line]
    return ' '.join(line)


def clean_text_count(file_path, stopwords_path, name_path):
    with open(stopwords_path, 'r', encoding='utf-8') as f:
        stopwords = set(word.strip().lower() for word in f.readlines())

    with open(name_path, 'r', encoding='utf-8') as f:
        names = set(word.strip().lower() for word in f.readlines())

    word_counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            cleaned_line = clean_text(line)
            words = cleaned_line.split()
            filtered = [word for word in words if word not in stopwords and word not in names]
            filtered = [word for word in words if word not in stopwords]
            word_counter.update(filtered)
    return word_counter


def build_matrix(folder_path, stopwords_path, name_path):
    matrix = {}
    vocabulary = set()

    for file_path in Path(folder_path).glob("*.txt"):
        word_counts = clean_text_count(file_path, stopwords_path, name_path)
        matrix[file_path.name] = word_counts
        vocabulary.update(word_counts.keys())

    matrix = {k: v for k, v in sorted(matrix.items())}
    vocabulary = sorted(vocabulary)
    vocab_index = {word: idx for idx, word in enumerate(vocabulary)}
    doc_names = list(matrix.keys())

    term_doc_matrix = np.zeros((len(doc_names), len(vocabulary)), dtype=int)

    for i, doc in enumerate(doc_names):
        for word, count in matrix[doc].items():
            j = vocab_index[word]
            term_doc_matrix[i][j] = count

    return doc_names, vocabulary, term_doc_matrix

folder_path = "data/documents"
stopwords_path = "data/stop_words/stopwords.txt"
name_path = "data/stop_words/names.txt"

doc_names, vocabulary, term_doc_matrix = build_matrix(folder_path, stopwords_path, name_path)
print(doc_names)

['1-LOTR.txt', '1-game-of-thrones.txt', '1-harry-potter-and-the-chamber-of-secrets.txt', '1-harry-potter-and-the-sorceres-stone.txt', '1-the-lion-the-witch-and-the-wardrobe.txt', '2-a-study-in-scarlet.txt', '2-and-then-there-were-none.txt', '2-murder-on-the-orient-express.txt', '2-the-girl-with-the-dragon-tattoo.txt', '2-the-hound-of-baskervilles.txt', '3-451-farenheit.txt', '3-brave-new-world.txt', '3-divergent.txt', '3-dune.txt', '3-enders-game.txt', '3-the-martian.txt', '4-carrie-stephen-king.txt', '4-dracula.txt', '4-frankenstein.txt', '4-it-by-stephen-king.txt', '4-the-shining.txt', '5-persuasion.txt', '5-pride-and-prejudice.txt', '5-romeo-and-juliet.txt', '5-twilight-new-moon.txt', '5-twilight.txt', '6-moby-dick.txt', '6-the-count-of-monte-cristo.txt', '6-the-three-musketeers.txt', '6-tom-sawyer.txt', '6-treasure-island.txt', '7-a-tale-of-two-cities.txt', '7-memoirs-of-a-geisha.txt', '7-the-book-thief.txt', '7-the-nightingale.txt', '7-the-song-of-achilles.txt', '8-atomic-habits.t

In [114]:
def svd(matrix):
    transpose = np.transpose(matrix)
    work_matrix = np.dot(matrix, transpose)

    eigenvalues, left_sing_matrix_U = np.linalg.eigh(work_matrix)

    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    left_sing_matrix_U = left_sing_matrix_U[:, idx]

    singular_values = np.sqrt(np.maximum(eigenvalues, 0))
    sigma_E = np.diag(singular_values)

    right_sing_matrix_V = []
    for i in range(len(singular_values)):
        if singular_values[i] > 1e-10:
            v_i = np.dot(transpose, left_sing_matrix_U[:, i]) / singular_values[i]
        else:
            v_i = np.zeros(matrix.shape[0])
        right_sing_matrix_V.append(v_i)
    right_sing_matrix_V = np.column_stack(right_sing_matrix_V)

    return left_sing_matrix_U, sigma_E, right_sing_matrix_V

U, Sigma, V = svd(term_doc_matrix)
print(V)

[[ 1.45151438e-05  9.01901583e-06  3.07769626e-06 ... -3.01082765e-05
   3.16908261e-05 -9.08617956e-05]
 [ 1.45151438e-05  9.01901583e-06  3.07769626e-06 ... -3.01082765e-05
   3.16908261e-05 -9.08617956e-05]
 [ 1.45151438e-05  9.01901583e-06  3.07769626e-06 ... -3.01082765e-05
   3.16908261e-05 -9.08617956e-05]
 ...
 [ 3.92165012e-06  4.00707967e-06 -1.49647252e-06 ... -2.58617862e-05
   2.19318027e-05 -2.74879306e-05]
 [ 4.61611087e-05  1.33722087e-04  1.47134539e-04 ...  2.84680535e-08
   1.81082352e-05 -5.18666165e-06]
 [ 6.78060790e-06  3.23927840e-06 -4.04333191e-06 ... -1.98619278e-05
  -1.89520854e-06 -9.94818886e-06]]


In [115]:
k = 10000
def reduce(V, k):
    Vt = V.T
    reduced = Vt[:, :k]
    return reduced
reduced = reduce(V, k)

In [116]:
def softmax(logits):
    logits_shifted = logits - np.max(logits, axis=-1, keepdims=True)
    exps = np.exp(logits_shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def predict(X, weights, bias, book_titles, index_to_label):
    predictions = []
    for xi, title in zip(X, book_titles):
        z = np.dot(xi, weights) + bias
        probs = softmax(z)
        pred_idx = np.argmax(probs)
        predicted_label = index_to_label[pred_idx]
        predicted_prob = probs[pred_idx]
        predictions.append((title, predicted_label, predicted_prob))

    return predictions


def gradient_descent(X, y_onehot, W, b, lr, epochs):
    for epoch in range(epochs):
        total_loss = 0
        for xi, yi in zip(X, y_onehot):

            z = np.dot(xi, W) + b
            probs = softmax(z)

            loss = -np.sum(yi * np.log(probs + 1e-8))
            total_loss += loss

            dz = probs - yi
            dW = np.outer(xi, dz)
            db = dz

            W -= lr * dW
            b -= lr * db

        if epoch % 100 == 0:
            print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

    return W, b


In [117]:
doc_labels = ["fantasy"] * 5 + ["mystery"] * 5 + ["science fiction"] * 6 + ["horror"] * 5 + ["romance"] * 5 + ["adventure"] * 5 + ["historical fiction"] * 5 + ["self help"] * 5 + ["classics"] * 5

unique_labels = sorted(set(doc_labels))
label_to_index = {l: i for i, l in enumerate(unique_labels)}
index_to_label = {i: l for l, i in label_to_index.items()}
print(index_to_label)
y = np.array([label_to_index[l] for l in doc_labels])
y_onehot = np.eye(len(unique_labels))[y]

{0: 'adventure', 1: 'classics', 2: 'fantasy', 3: 'historical fiction', 4: 'horror', 5: 'mystery', 6: 'romance', 7: 'science fiction', 8: 'self help'}


In [118]:
n_features = reduced.shape[1]
n_classes = len(unique_labels)

weights = np.random.randn(n_features, n_classes) * 0.01
bias = np.zeros(n_classes)
learning_rate = 0.2
epochs = 10000
weights, bias = gradient_descent(reduced, y_onehot, weights, bias, learning_rate, epochs)

Epoch 0, Loss: 99.9491
Epoch 100, Loss: 44.5858
Epoch 200, Loss: 26.1169
Epoch 300, Loss: 17.9081
Epoch 400, Loss: 13.5519
Epoch 500, Loss: 10.8918
Epoch 600, Loss: 9.0947
Epoch 700, Loss: 7.7943
Epoch 800, Loss: 6.8077
Epoch 900, Loss: 6.0332
Epoch 1000, Loss: 5.4093
Epoch 1100, Loss: 4.8965
Epoch 1200, Loss: 4.4679
Epoch 1300, Loss: 4.1047
Epoch 1400, Loss: 3.7933
Epoch 1500, Loss: 3.5236
Epoch 1600, Loss: 3.2879
Epoch 1700, Loss: 3.0803
Epoch 1800, Loss: 2.8961
Epoch 1900, Loss: 2.7318
Epoch 2000, Loss: 2.5843
Epoch 2100, Loss: 2.4513
Epoch 2200, Loss: 2.3307
Epoch 2300, Loss: 2.2210
Epoch 2400, Loss: 2.1207
Epoch 2500, Loss: 2.0288
Epoch 2600, Loss: 1.9442
Epoch 2700, Loss: 1.8661
Epoch 2800, Loss: 1.7938
Epoch 2900, Loss: 1.7268
Epoch 3000, Loss: 1.6644
Epoch 3100, Loss: 1.6062
Epoch 3200, Loss: 1.5518
Epoch 3300, Loss: 1.5009
Epoch 3400, Loss: 1.4531
Epoch 3500, Loss: 1.4082
Epoch 3600, Loss: 1.3658
Epoch 3700, Loss: 1.3259
Epoch 3800, Loss: 1.2882
Epoch 3900, Loss: 1.2525
Epoch 

In [119]:
print("\n== Prediction on training data ==")
predictions = predict(reduced, weights, bias, doc_names, index_to_label)
for title, predicted_label, predicted_prob in predictions:
    print(f"Book: {title}, Predicted Genre: {predicted_label}, Probability: {predicted_prob:.4f}")


== Prediction on training data ==
Book: 1-LOTR.txt, Predicted Genre: fantasy, Probability: 0.9976
Book: 1-game-of-thrones.txt, Predicted Genre: fantasy, Probability: 0.9989
Book: 1-harry-potter-and-the-chamber-of-secrets.txt, Predicted Genre: fantasy, Probability: 0.9989
Book: 1-harry-potter-and-the-sorceres-stone.txt, Predicted Genre: fantasy, Probability: 0.9589
Book: 1-the-lion-the-witch-and-the-wardrobe.txt, Predicted Genre: fantasy, Probability: 0.9170
Book: 2-a-study-in-scarlet.txt, Predicted Genre: mystery, Probability: 0.9999
Book: 2-and-then-there-were-none.txt, Predicted Genre: mystery, Probability: 1.0000
Book: 2-murder-on-the-orient-express.txt, Predicted Genre: mystery, Probability: 0.9419
Book: 2-the-girl-with-the-dragon-tattoo.txt, Predicted Genre: mystery, Probability: 0.9792
Book: 2-the-hound-of-baskervilles.txt, Predicted Genre: mystery, Probability: 0.9900
Book: 3-451-farenheit.txt, Predicted Genre: science fiction, Probability: 0.9935
Book: 3-brave-new-world.txt, P