## Language Identification with Naive Bayes

### 1. Estimate the prior probabilities

In [None]:
import pandas as pd
import os

# Training documents: files 0-9 of each language (e = English, j = Japanese, s = Spanish).
english_files = [f"./languageID/e{i}.txt" for i in range(10)]
japanese_files = [f"./languageID/j{i}.txt" for i in range(10)]
spanish_files = [f"./languageID/s{i}.txt" for i in range(10)]

# One (filename, label) row per training document, in e, j, s order.
labelled_paths = [
    (path, label)
    for label, paths in (('e', english_files), ('j', japanese_files), ('s', spanish_files))
    for path in paths
]
df = pd.DataFrame(labelled_paths, columns=['filename', 'label'])

In [None]:
# Additive smoothing constant used for the prior (and later the likelihood) estimates.
alpha = 0.5

total_docs = len(df)

# Compute prior probabilities:
#   (documents with the label + alpha) / (all documents + 3 * alpha), 3 being the number of classes.
label_counts = {label: int((df['label'] == label).sum()) for label in ('e', 'j', 's')}
smoothed_total = total_docs + 3 * alpha

prior_english = (label_counts['e'] + alpha) / smoothed_total
prior_japanese = (label_counts['j'] + alpha) / smoothed_total
prior_spanish = (label_counts['s'] + alpha) / smoothed_total

print(f"Prior probability for English: {prior_english}")
print(f"Prior probability for Japanese: {prior_japanese}")
print(f"Prior probability for Spanish: {prior_spanish}")

### 2. Estimate the class conditional probability for English

In [None]:
# Model alphabet: the 26 lowercase ASCII letters followed by the space character.
chars = [chr(i) for i in range(97, 123)] + [' ']

char_counts = {char: 0 for char in chars}

# Counts of each character in English documents
for file in english_files:
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read().lower()
    for char in chars:
        char_counts[char] += text.count(char)

total_chars = sum(char_counts.values())

# Additive smoothing: (count + alpha) / (total + |alphabet| * alpha).
# The parameters are kept at full precision because they are later raised to
# character counts (see compute_px_given_y); rounding them here could turn a
# small probability into exactly 0.0 and would stop the values summing to 1.
theta_e = {
    char: (count + alpha) / (total_chars + len(chars) * alpha)
    for char, count in char_counts.items()
}

# Displayed rounded to three decimals; theta_e itself stays unrounded.
{char: round(p, 3) for char, p in theta_e.items()}.values()

### 3. Estimate the class conditional probabilities for Spanish and Japanese

In [None]:
def calculate_theta(files, chars, alpha):
    """Estimate additively smoothed character probabilities from text files.

    Every file is read as UTF-8 (undecodable bytes are ignored) and lowercased,
    then the occurrences of each character in *chars* are counted across all
    files.

    Args:
        files: Iterable of paths to the text files to count.
        chars: Sequence of single characters that make up the alphabet.
        alpha: Additive smoothing constant added to every count.

    Returns:
        A dict mapping each character in *chars* to
        ``(count + alpha) / (total + len(chars) * alpha)``. Values are returned
        at full precision (round them only when displaying) so that they sum
        to 1 and small probabilities do not collapse to 0.0.
    """
    char_counts = {char: 0 for char in chars}

    for file in files:
        with open(file, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read().lower()
        for char in chars:
            char_counts[char] += text.count(char)

    total_chars = sum(char_counts.values())
    # The smoothing mass is alpha per alphabet symbol, so it scales with the
    # alphabet actually passed in rather than a fixed size.
    denominator = total_chars + len(chars) * alpha

    return {char: (count + alpha) / denominator for char, count in char_counts.items()}

# Fit the Japanese and Spanish character models with the same alphabet and smoothing constant.
theta_j = calculate_theta(files=japanese_files, chars=chars, alpha=alpha)
theta_s = calculate_theta(files=spanish_files, chars=chars, alpha=alpha)

# Print the parameters at three decimals.
print({char: round(p, 3) for char, p in theta_j.items()}.values())
print({char: round(p, 3) for char, p in theta_s.items()}.values())

### 4. Bag-of-words vector x

In [None]:
# Character counts of the document e10.txt, restricted to the model alphabet.
test_file_path = "./languageID/e10.txt"
bow_vector = dict.fromkeys(chars, 0)

with open(test_file_path, mode='r', encoding='utf-8', errors='ignore') as f:
    text = f.read().lower()

# Fill in the count of every alphabet character found in the lowercased text.
bow_vector.update((char, text.count(char)) for char in chars)

bow_series = pd.Series(bow_vector)
print(bow_vector)

### 5. Compute p(x | y)

In [None]:
import numpy as np

def compute_px_given_y(bow_vector, theta):
    probabilities = [theta[char]**count for char, count in bow_vector.items()]
    return np.prod(probabilities)

# Likelihood of the test document under each language model.
p_x_given_e = compute_px_given_y(bow_vector, theta_e)
p_x_given_j = compute_px_given_y(bow_vector, theta_j)
p_x_given_s = compute_px_given_y(bow_vector, theta_s)

# The products above multiply one probability per character occurrence and can
# underflow to 0.0 for a long document, so the log-likelihoods are reported as
# well. Characters with a zero count are skipped; a zero probability for a
# character that does occur gives -inf.
log_p_x_given_y = {
    label: sum(
        count * (np.log(theta[char]) if theta[char] > 0 else -np.inf)
        for char, count in bow_vector.items()
        if count
    )
    for label, theta in (("e", theta_e), ("j", theta_j), ("s", theta_s))
}

print(f"p(x|y=e): {p_x_given_e}")
print(f"p(x|y=j): {p_x_given_j}")
print(f"p(x|y=s): {p_x_given_s}")
for label, value in log_p_x_given_y.items():
    print(f"log p(x|y={label}): {value}")

### 6. Use Bayes rule and your estimated prior and likelihood

In [None]:
# Posterior p(y|x) for each class via Bayes' rule.
# Use the priors estimated from the training labels instead of repeating their
# value as a literal; classify_document reads prior_e / prior_j / prior_s too.
prior_e = prior_english
prior_j = prior_japanese
prior_s = prior_spanish

# log p(x|y) + log p(y) per class. Working in log space avoids the underflow of
# the raw likelihood products, which would otherwise make the normalisation 0/0.
log_joint = {
    label: np.log(prior) + sum(
        count * (np.log(theta[char]) if theta[char] > 0 else -np.inf)
        for char, count in bow_vector.items()
        if count
    )
    for label, prior, theta in (
        ("e", prior_e, theta_e),
        ("j", prior_j, theta_j),
        ("s", prior_s, theta_s),
    )
}

# Evidence p(x) = sum_y p(x|y) p(y), computed with the log-sum-exp trick.
log_joint_max = max(log_joint.values())
log_p_x = log_joint_max + np.log(sum(np.exp(v - log_joint_max) for v in log_joint.values()))
p_x = np.exp(log_p_x)  # may itself underflow to 0.0; it is not used for division

# Normalised posteriors: exp(log joint - log evidence).
proportional_posterior_e = float(np.exp(log_joint["e"] - log_p_x))
proportional_posterior_j = float(np.exp(log_joint["j"] - log_p_x))
proportional_posterior_s = float(np.exp(log_joint["s"] - log_p_x))

p_y_given_x_e = proportional_posterior_e
p_y_given_x_j = proportional_posterior_j
p_y_given_x_s = proportional_posterior_s
predicted_class = max([(p_y_given_x_e, "e"), (p_y_given_x_j, "j"), (p_y_given_x_s, "s")], key=lambda x: x[0])[1]

print(f"p(y=e|x): {p_y_given_x_e}")
print(f"p(y=j|x): {p_y_given_x_j}")
print(f"p(y=s|x): {p_y_given_x_s}")
print(f"Predicted class label of x: {predicted_class}")

### 7. The performance of your classifier

In [None]:
# Class labels, and a 3x3 integer table of zeros indexed by them on both axes.
languages = list("ejs")
confusion_matrix = pd.DataFrame(data=0, index=languages, columns=languages)

def classify_document(file_path):
    """Predict the language label of one text file with the Naive Bayes model.

    The file is read as UTF-8 (undecodable bytes ignored), lowercased and
    reduced to counts of the characters in the module-level ``chars``. Each
    class is scored with ``log p(y) + sum_c count_c * log theta_y[c]`` using
    the module-level ``theta_e`` / ``theta_j`` / ``theta_s`` and
    ``prior_e`` / ``prior_j`` / ``prior_s``.

    Scores are compared in log space: the raw product of per-character
    probabilities can underflow to 0.0 for every class, in which case the
    comparison would be a three-way tie that always resolves to "e".

    Args:
        file_path: Path of the document to classify.

    Returns:
        "e", "j" or "s" - the label with the highest score (ties resolve in
        that order).
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read().lower()
    bow_vector = {char: text.count(char) for char in chars}

    def log_joint(theta, prior):
        # log prior plus the log-likelihood of the counted characters.
        total = np.log(prior)
        for char, count in bow_vector.items():
            if count == 0:
                continue  # p ** 0 contributes nothing; also avoids 0 * log(0)
            if theta[char] > 0:
                total += count * np.log(theta[char])
            else:
                return -np.inf  # an observed character has zero probability
        return total

    scores = [
        (log_joint(theta_e, prior_e), "e"),
        (log_joint(theta_j, prior_j), "j"),
        (log_joint(theta_s, prior_s), "s"),
    ]
    return max(scores, key=lambda x: x[0])[1]

# Classify documents 10-19 of every language and tally the outcomes.
# Rows of confusion_matrix are the predicted label, columns the true label.
for true_language in languages:
    test_paths = (f"./languageID/{true_language}{i}.txt" for i in range(10, 20))
    for file_path in test_paths:
        predicted_language = classify_document(file_path)
        confusion_matrix.at[predicted_language, true_language] += 1

# Display confusion matrix
print(confusion_matrix)

## Simple Feed-Forward Network

In [None]:
import numpy as np
import torch
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Preprocessing: convert each image to a tensor, then Normalize((0.5,), (0.5,)).
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.5,), (0.5,))
transform = transforms.Compose([to_tensor, normalize])

# MNIST train and test splits stored under ./data (only the train split requests a download).
mnist_root = './data'
train_dataset = datasets.MNIST(root=mnist_root, train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root=mnist_root, train=False, transform=transform)

# Mini-batches of 32; only the training data is shuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def sigmoid_derivative(x):
    """Return x * (1 - x).

    This is the derivative of the sigmoid expressed in terms of its output, so
    *x* is expected to already be a sigmoid activation (backward_propagation
    passes a1 and a2), not a pre-activation.
    """
    complement = 1 - x
    return x * complement

def softmax(x):
    """Row-wise softmax of a 2-D array.

    Args:
        x: Array of shape (rows, classes) holding the scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    # Shift every row by its own maximum. Softmax is unchanged by a per-row
    # shift, and this guarantees each row contains exp(0) == 1, so the row sum
    # can never underflow to 0 (a single global maximum does not guarantee that
    # for rows far below it).
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(shifted)
    return exp_x / exp_x.sum(axis=1, keepdims=True)

def forward_propagation(x, W1, W2, W3):
    """Run the three-layer network forward (no bias terms).

    Each layer multiplies its input by the transposed weight matrix; the two
    hidden layers use sigmoid and the output layer uses softmax.

    Returns:
        Tuple ``(z1, a1, z2, a2, y_hat)``: the pre-activation and activation of
        both hidden layers, followed by the softmax output.
    """
    hidden1_in = np.dot(x, W1.T)
    hidden1_out = sigmoid(hidden1_in)

    hidden2_in = np.dot(hidden1_out, W2.T)
    hidden2_out = sigmoid(hidden2_in)

    probs = softmax(np.dot(hidden2_out, W3.T))
    return hidden1_in, hidden1_out, hidden2_in, hidden2_out, probs

def backward_propagation(x, y, W1, W2, W3, z1, a1, z2, a2, y_hat, alpha):
    """Backpropagate one mini-batch and take a gradient step on all weights.

    Args:
        x: Input batch; its first dimension is the batch size.
        y: Targets, subtracted from y_hat to form the output error.
        W1, W2, W3: Weight matrices, updated with ``-=`` (so in place when they
            are NumPy arrays) and also returned.
        z1, z2: Accepted but not used by this function.
        a1, a2: Hidden activations from the forward pass.
        y_hat: Network output from the forward pass.
        alpha: Step size multiplying each gradient.

    Returns:
        Tuple ``(W1, W2, W3)`` after the update.
    """
    batch_scale = 1 / x.shape[0]

    # Output layer error and gradient.
    delta3 = y_hat - y
    grad_W3 = batch_scale * np.dot(delta3.T, a2)

    # Second hidden layer: propagate through W3 (before it is updated).
    delta2 = np.dot(delta3, W3) * sigmoid_derivative(a2)
    grad_W2 = batch_scale * np.dot(delta2.T, a1)

    # First hidden layer: propagate through W2 (before it is updated).
    delta1 = np.dot(delta2, W2) * sigmoid_derivative(a1)
    grad_W1 = batch_scale * np.dot(delta1.T, x)

    W1 -= alpha * grad_W1
    W2 -= alpha * grad_W2
    W3 -= alpha * grad_W3
    return W1, W2, W3

# Layer sizes: flattened 28x28 input, two hidden layers, 10 outputs.
d, d1, d2, k = 28 * 28, 300, 200, 10

# alpha is the step size handed to backward_propagation. NOTE: this rebinds the
# name `alpha` that the Naive Bayes cells use as their smoothing constant.
# batch_size is recorded here; the data loaders above were built with a literal 32.
alpha, epochs, batch_size = 0.1, 10, 32

# Weight matrices of shape (outputs, inputs): standard-normal draws scaled by 0.01.
W1 = 0.01 * np.random.randn(d1, d)
W2 = 0.01 * np.random.randn(d2, d1)
W3 = 0.01 * np.random.randn(k, d2)

# Mean training loss of each epoch, for the learning curve.
losses = []

for epoch in range(epochs):
    total_loss = 0
    for images, labels in train_loader:
        # Flatten each image to a 784-vector and one-hot encode the labels.
        x = images.view(-1, 28*28).numpy()
        y = np.eye(10)[labels.numpy()]

        z1, a1, z2, a2, y_hat = forward_propagation(x, W1, W2, W3)

        # Cross-entropy: sum over classes, mean over the batch. Averaging over
        # the whole (batch, classes) matrix would report a value divided by the
        # number of classes. The small constant guards against log(0).
        total_loss += -np.sum(y * np.log(y_hat + 1e-8)) / x.shape[0]

        W1, W2, W3 = backward_propagation(x, y, W1, W2, W3, z1, a1, z2, a2, y_hat, alpha)
    avg_loss = total_loss / len(train_loader)
    losses.append(avg_loss)
    print(f'Epoch {epoch+1}, Loss: {avg_loss}')

# the learning curve
plt.plot(losses)
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

# Count test images whose highest-scoring output matches the label.
corrects = 0
for images, labels in test_loader:
    x = images.view(-1, 28*28).numpy()
    targets = labels.numpy()
    y = np.eye(10)[targets]  # one-hot labels; not needed for the accuracy count
    y_hat = forward_propagation(x, W1, W2, W3)[-1]
    corrects += np.sum(y_hat.argmax(axis=1) == targets)

accuracy = corrects / len(test_dataset)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

class NeuralNetwork(nn.Module):
    """Fully connected network with two sigmoid hidden layers.

    ``forward`` returns raw class scores (logits). The training code in this
    file feeds the output to ``nn.CrossEntropyLoss``, which applies
    log-softmax itself, so applying softmax inside ``forward`` would normalise
    twice. Use ``predict_proba`` when probabilities are needed.

    Args:
        input_dim: Number of input features.
        hidden_dim1: Width of the first hidden layer.
        hidden_dim2: Width of the second hidden layer.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, output_dim)
        self.sigmoid = nn.Sigmoid()
        # Kept as an attribute; used by predict_proba rather than forward.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return logits of shape (batch, output_dim) for a (batch, input_dim) input."""
        x = self.sigmoid(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return self.fc3(x)

    def predict_proba(self, x):
        """Return class probabilities: softmax over the logits of each row."""
        return self.softmax(self(x))

# Preprocessing pipeline: ToTensor followed by Normalize((0.5,), (0.5,)).
preprocessing_steps = [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
transform = transforms.Compose(preprocessing_steps)

# MNIST splits under ./data; the training split is downloaded if needed.
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, transform=transform)

# Batches of 32 images; shuffle the training data only.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Network with PyTorch's default initialisation, trained with plain SGD.
model = NeuralNetwork(input_dim=28*28, hidden_dim1=300, hidden_dim2=200, output_dim=10)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train for 10 epochs, recording the mean batch loss of every epoch.
losses = []
for epoch in range(10):
    total_loss = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28*28)  # flatten each image to a 784-vector

        outputs = model(images)
        loss = criterion(outputs, labels)

        # Clear old gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    losses.append(avg_loss)
    print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')

# Learning curve.
plt.plot(losses)
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

# Test accuracy: fraction of images whose highest-scoring class equals the label.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(-1, 28*28)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

accuracy = 100.0 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


In [None]:
def weight_init_zero(m):
    """Set the weight and bias of an ``nn.Linear`` module to 0; ignore other modules.

    Intended to be passed to ``Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    for parameter in (m.weight, m.bias):
        nn.init.constant_(parameter, 0)

# Same architecture, with every Linear layer's weights and biases set to zero.
model_zero = NeuralNetwork(input_dim=28*28, hidden_dim1=300, hidden_dim2=200, output_dim=10)
model_zero.apply(weight_init_zero)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_zero.parameters(), lr=0.1)

# Train for 10 epochs, recording the mean batch loss of every epoch.
losses = []
for epoch in range(10):
    total_loss = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28*28)  # flatten each image to a 784-vector

        outputs = model_zero(images)
        loss = criterion(outputs, labels)

        # Clear old gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    losses.append(avg_loss)
    print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')

# Learning curve.
plt.plot(losses)
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

# Test accuracy: fraction of images whose highest-scoring class equals the label.
model_zero.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(-1, 28*28)
        outputs = model_zero(images)
        predicted = outputs.argmax(dim=1)
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

accuracy = 100.0 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')


In [None]:
def weight_init_random(m):
    """Re-draw the weight and bias of an ``nn.Linear`` module uniformly between -1 and 1.

    Modules of any other type are left untouched. Intended to be passed to
    ``Module.apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    # Weight first, then bias.
    nn.init.uniform_(m.weight, a=-1, b=1)
    nn.init.uniform_(m.bias, a=-1, b=1)

# Same architecture, with every Linear layer re-initialised by weight_init_random.
model_random = NeuralNetwork(input_dim=28*28, hidden_dim1=300, hidden_dim2=200, output_dim=10)
model_random.apply(weight_init_random)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_random.parameters(), lr=0.1)

# Train for 10 epochs, recording the mean batch loss of every epoch.
losses = []
for epoch in range(10):
    total_loss = 0
    for batch_idx, (images, labels) in enumerate(train_loader):
        images = images.view(-1, 28*28)  # flatten each image to a 784-vector

        outputs = model_random(images)
        loss = criterion(outputs, labels)

        # Clear old gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    losses.append(avg_loss)
    print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}')

# Learning curve.
plt.plot(losses)
plt.title('Learning Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

# Test accuracy: fraction of images whose highest-scoring class equals the label.
model_random.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.view(-1, 28*28)
        outputs = model_random(images)
        predicted = outputs.argmax(dim=1)
        total += labels.shape[0]
        correct += (predicted == labels).sum().item()

accuracy = 100.0 * correct / total
print(f'Test Accuracy: {accuracy:.2f}%')
