In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
newsgroups_train = load_files('20news-bydate/20news-bydate-train', encoding='ISO-8859-1')
newsgroups_test = load_files('20news-bydate/20news-bydate-test', encoding='ISO-8859-1')

X_train_raw, y_train = newsgroups_train.data, newsgroups_train.target
X_test_raw, y_test = newsgroups_test.data, newsgroups_test.target

print('Training Dataset Size: ', len(X_train_raw))
print('Testing Dataset Size: ', len(X_test_raw))
print('Number of Classes: ', len(newsgroups_train.target_names))

vectorizer = TfidfVectorizer(max_features=5000) 
X_train_full = vectorizer.fit_transform(X_train_raw).toarray()
X_test_full = vectorizer.transform(X_test_raw).toarray()

X_combined = np.vstack((X_train_full, X_test_full))
y_combined = np.hstack((y_train, y_test))

X_train, X_remaining, y_train, y_remaining = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42, stratify=y_combined
)

X_val, X_test, y_val, y_test = train_test_split(
    X_remaining, y_remaining, test_size=0.5, random_state=42, stratify=y_remaining
)

print(f"Training Data Shape: {X_train.shape}, Training Labels Shape: {y_train.shape}")
print(f"Validation Data Shape: {X_val.shape}, Validation Labels Shape: {y_val.shape}")
print(f"Test Data Shape: {X_test.shape}, Test Labels Shape: {y_test.shape}")


Training Dataset Size:  11314
Testing Dataset Size:  7532
Number of Classes:  20
Training Data Shape: (15076, 5000), Training Labels Shape: (15076,)
Validation Data Shape: (1885, 5000), Validation Labels Shape: (1885,)
Test Data Shape: (1885, 5000), Test Labels Shape: (1885,)


In [None]:
import numpy as np

def distance_calc(X_train, X_test):

    X_train_normalized = X_train / np.linalg.norm(X_train, axis=1, keepdims=True)
    X_test_normalized = X_test / np.linalg.norm(X_test, axis=1, keepdims=True)

    cosine_similarity = X_test_normalized @ X_train_normalized.T
    cosine_distance = 1 - cosine_similarity

    return cosine_distance


In [None]:
import numpy as np

def edit_distance(s1, s2):

    m, n = len(s1), len(s2)

    dp = np.zeros((m + 1, n + 1), dtype=int)

    for i in range(m + 1):
        dp[i][0] = i 
    for j in range(n + 1):
        dp[0][j] = j 

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]: 
                dp[i][j] = dp[i - 1][j - 1]
            else:
                
                dp[i][j] = min(
                    dp[i - 1][j] + 1, 
                    dp[i][j - 1] + 1,
                    dp[i - 1][j - 1] + 1
                )
    return dp[m][n]


In [17]:
def edit_distance_matrix(X_train_raw, X_test_raw):
    distances = np.zeros((len(X_test_raw), len(X_train_raw)))

    for i, test_doc in enumerate(X_test_raw):
        for j, train_doc in enumerate(X_train_raw):
            distances[i, j] = edit_distance(test_doc, train_doc)

    return distances



In [8]:
class KNNCustom:
    def __init__(self,k):
        self.k = k
    
    def predict(self,distance_matrix,y_train_labels):

        predictions = []

        for distances in distance_matrix:
            nearest_indices = np.argsort(distances)[:self.k]
            nearest_labels = y_train_labels[nearest_indices]
            frequency_labels = Counter(nearest_labels)
            winner = frequency_labels.most_common(1)[0][0]
            predictions.append(winner)

        return np.array(predictions)

In [10]:
distance_matrix_val = distance_calc(X_train, X_val)
distance_matrix_test = distance_calc(X_train, X_test)

knn = KNNCustom(k=5)

y_val_pred = knn.predict(distance_matrix_val, y_train)
y_test_pred = knn.predict(distance_matrix_test, y_train)

accuracy_val = np.mean(y_val_pred == y_val)
accuracy_test = np.mean(y_test_pred == y_test)

print(f"Validation Accuracy: {accuracy_val * 100:.2f}%")
print(f"Test Accuracy: {accuracy_test * 100:.2f}%")


Validation Accuracy: 78.14%
Test Accuracy: 77.67%


In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=5, metric='precomputed')
knn.fit(edit_distance_matrix(X_train_raw, X_test_raw), y_train)

edit_distances_test = edit_distance_matrix(X_train_raw, X_test_raw)
y_pred = knn.predict(edit_distances_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy using Edit Distance: {accuracy}")


KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(X_train, y_train)
y_val_pred = knn.predict(X_val)
y_test_pred = knn.predict(X_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Validation Accuracy: 75.92%
Test Accuracy: 74.80%
