In [5]:
import math
import os
import re
from collections import Counter, defaultdict

def read_20_newsgroups(directory, num_stop_words=300, vocab_size=500):
    """Load a 20-Newsgroups-style corpus and compute TF, DF and TF-IDF per document.

    The directory tree is walked with ``os.walk``; every sub-directory becomes a
    class named after its basename and every file inside it becomes one document.
    Lines starting with ``Newsgroup``, ``document_id``, ``From`` or ``Subject``
    are skipped, the rest is lower-cased and tokenised with ``\\b\\w+\\b``.

    The ``num_stop_words`` most frequent tokens of the whole corpus are treated
    as stop words and dropped; only the next ``vocab_size`` most frequent tokens
    are kept in the documents.

    Args:
        directory: Root folder of the corpus. Its own name is never used as a
            class, whatever it is called and whether or not it ends with a
            path separator.
        num_stop_words: How many of the most common tokens to discard.
        vocab_size: How many of the following most common tokens to keep.

    Returns:
        A tuple ``(C, tf, df, tf_idf, dataset)`` where
        ``C`` is the list of class names,
        ``tf[c]`` is a list (one entry per document of class ``c``) of
        ``{word: count / document_length}`` dicts,
        ``df[word]`` is the number of documents containing ``word`` (counted
        before the vocabulary filtering),
        ``tf_idf[c]`` mirrors ``tf[c]`` with
        ``log10(1 + tf) * log10(N / df)`` values, and
        ``dataset[c]`` is the list of filtered token lists of class ``c``.

    Raises:
        NotADirectoryError: If ``directory`` is not an existing directory.
    """
    if not os.path.isdir(directory):
        raise NotADirectoryError(f"Dataset directory not found: {directory}")

    print(f' Reading 20 Newsgroups dataset from {directory}...')
    word_counter = Counter()            # corpus-wide token counts
    dataset = {}                        # class name -> list of token lists
    document_count = defaultdict(int)   # DF: number of documents containing each word
    total_documents = 0                 # N in the IDF formula
    tf = defaultdict(list)              # class name -> list of per-document TF dicts

    header_prefixes = ('Newsgroup', 'document_id', 'From', 'Subject')
    root = os.path.normpath(directory)

    for curr_dir, subdirs, files in os.walk(directory):
        # Sorting makes the traversal (and therefore any positional train/test
        # split and the tie-breaking of equally frequent words) reproducible.
        subdirs.sort()
        normalized_dir = os.path.normpath(curr_dir)
        curr_class = os.path.basename(normalized_dir)
        class_docs = []
        # The root folder is a container, not a class. Files placed directly in
        # it still contribute to the corpus statistics but get no class entry.
        if normalized_dir != root:
            dataset[curr_class] = class_docs

        for file in sorted(files):
            total_documents += 1
            file_path = os.path.join(curr_dir, file)
            read_file = []
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    for line in f:
                        if line.startswith(header_prefixes):
                            continue
                        # Lower-case so that counting is case-insensitive
                        words = re.findall(r'\b\w+\b', line.lower())
                        word_counter.update(words)
                        read_file.extend(words)
            except Exception as e:
                # Best effort: keep whatever was read before the failure
                print(f"Error reading {file_path}: {e}")

            # Update DF from everything that was counted, even for a partially
            # read file, so every word in word_counter has DF >= 1 and the IDF
            # division below can never hit zero.
            for word in set(read_file):
                document_count[word] += 1
            class_docs.append(read_file)

    print(f' Finished reading {total_documents} documents from {len(dataset)} classes.')

    # Rank once: the first `num_stop_words` entries are stop words, the
    # following `vocab_size` entries form the vocabulary. Keeping only the
    # vocabulary also removes the stop words, so one filtering pass suffices.
    ranked = [word for word, _ in word_counter.most_common(num_stop_words + vocab_size)]
    vocabulary = set(ranked[num_stop_words:])
    for files in dataset.values():
        for file in files:
            file[:] = [word for word in file if word in vocabulary]
    print(f'finished sorting out the words')

    # Term frequency: occurrences of the word divided by the document length.
    # An empty document yields an empty dict, so there is no division by zero.
    for c, files in dataset.items():
        for file in files:
            total_words_in_file = len(file)
            tf[c].append({
                word: count / total_words_in_file
                for word, count in Counter(file).items()
            })

    # DF is already tracked in `document_count`
    df = document_count

    # TF-IDF with log-scaled term frequency
    tf_idf = defaultdict(list)
    N = total_documents

    for c in dataset:
        for file_tf in tf[c]:
            tf_idf[c].append({
                word: math.log10(1 + tf_value) * math.log10(N / df[word])
                for word, tf_value in file_tf.items()
            })
    print(f' Finished computing TF-IDF for {N} documents.')

    C = list(dataset.keys())  # List of classes
    return C, tf, df, tf_idf, dataset  # treat tf_idf as w

# Location of the extracted 20 Newsgroups corpus. Set the NEWSGROUPS_DIR
# environment variable to point elsewhere; the default is the original path.
DATA_DIR = os.environ.get("NEWSGROUPS_DIR", "/home/jems/cmsc422/p1/20_newsgroups")

# Call the function
C, tf, df, tf_idf, D = read_20_newsgroups(DATA_DIR)

# Split dataset into train and test: first half of each class's TF-IDF vectors
# for training, second half for testing. Iterating over C (instead of tf_idf)
# guarantees every class has an entry even when it has no documents.
train, test = {}, {}
for c in C:
    vectors = tf_idf.get(c, [])
    midpoint = len(vectors) // 2
    train[c] = vectors[:midpoint]
    test[c] = vectors[midpoint:]
    


 Reading 20 Newsgroups dataset from /home/jems/cmsc422/p1/20_newsgroups...


KeyboardInterrupt: 

In [None]:
# Show the training vectors of the first class only (nothing is printed when C is empty).
for c in C[:1]:
    print(f"{train[c]}")

In [3]:
import numpy as np

# Class-by-class target matrix: +1 on the diagonal, -1 everywhere else.
Y = np.full((len(C), len(C)), -1)
np.fill_diagonal(Y, 1)


# for every class, for every file belonging to that class, attach the tf-idf vector
# tf_idf = w
# further processing may be needed here, since tf_idf may have to be restructured into train and test sets


20


In [None]:
from qpsolvers import solve_qp

def train_svm(X, Y, C):
    """Train a linear soft-margin SVM by solving its dual quadratic program.

    The dual problem solved is

        minimise    1/2 * a^T P a - 1^T a
        subject to  0 <= a_i <= C   and   Y^T a = 0

    with ``P[i, j] = Y[i] * Y[j] * <X[i], X[j]>``.

    Args:
        X: Array-like of shape (samples, features), one row per sample.
        Y: Array-like of length ``samples`` holding the labels (+1 / -1).
        C: Upper bound on every dual variable (soft-margin penalty).

    Returns:
        ``(w, b, support_vectors, support_alphas, support_labels)`` where ``w``
        is the weight vector of length ``features``, ``b`` the bias, and the
        remaining three arrays describe the support vectors used to compute
        ``b``: those with ``0 < alpha < C`` or, if there are none, every
        sample with ``alpha > 0``.

    Raises:
        ValueError: If ``X`` is not 2-D or ``Y`` does not have one label per row.
        RuntimeError: If the QP solver returns no solution.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float).ravel()
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (samples, features), got shape {X.shape}")
    samples = X.shape[0]
    if Y.shape[0] != samples:
        raise ValueError(f"Y must hold one label per sample: got {Y.shape[0]} labels for {samples} samples")

    # Gram matrix between samples: (samples, samples), matching np.outer(Y, Y)
    K = X @ X.T
    P = np.outer(Y, Y) * K
    q = -np.ones(samples)
    # Box constraints 0 <= alpha <= C written as G @ alpha <= h, with h a flat
    # vector of length 2 * samples (one entry per row of G)
    G = np.vstack((-np.eye(samples), np.eye(samples)))
    h = np.hstack((np.zeros(samples), np.full(samples, float(C))))
    # Equality constraint sum_i alpha_i * y_i = 0
    A = Y.reshape(1, -1)
    b_eq = np.array([0.0])

    alphas = solve_qp(P, q, G, h, A, b_eq, solver='osqp')
    if alphas is None:
        raise RuntimeError("QP solver did not return a solution for the SVM dual problem")
    # Numerical solvers can overshoot the box by a tiny amount
    alphas = np.clip(np.asarray(alphas, dtype=float), 0.0, C)

    w = (alphas * Y) @ X

    # Support vectors strictly inside the box (0 < alpha_i < C) lie exactly on
    # the margin, so y_i - w.x_i equals the bias for each of them.
    tol = 1e-5
    on_margin = (alphas > tol) & (alphas < C - tol)
    selected = on_margin if on_margin.any() else (alphas > tol)
    support_vector_indices = np.where(selected)[0]
    support_vectors = X[support_vector_indices]
    support_alphas = alphas[support_vector_indices]
    support_labels = Y[support_vector_indices]

    # Average over the selected support vectors; fall back to 0 when there are
    # none at all instead of returning NaN.
    if support_vector_indices.size:
        b = float(np.mean(support_labels - support_vectors @ w))
    else:
        b = 0.0

    return w, b, support_vectors, support_alphas, support_labels





In [21]:
# train_svm needs a dense (samples, features) matrix and one +1/-1 label per sample, whereas
# `train` maps each class to a list of sparse {word: tf-idf} dicts. Densify the vectors and
# train a single first-class-vs-rest problem as an example.
_vocab = sorted({word for docs in train.values() for doc in docs for word in doc})
_X = np.array([[doc.get(word, 0.0) for word in _vocab] for c in C for doc in train[c]])
_y = np.array([1.0 if c == C[0] else -1.0 for c in C for _ in train[c]])
w, b, s_vectors, s_alphas, s_labels = train_svm(_X, _y, 1) # Do this for every class

In [None]:
# Train one soft-margin SVM per class (one-vs-rest).
# train_svm needs a dense (samples, features) matrix and one +1/-1 label per sample, whereas
# `train` maps each class to a list of sparse {word: tf-idf} dicts, so densify it once here.
vocab = sorted({word for docs in train.values() for doc in docs for word in doc})
doc_classes = [c for c in C for _ in train[c]]  # class of every row of X_train
X_train = np.array([[doc.get(word, 0.0) for word in vocab] for c in C for doc in train[c]])

train_svm_per_class = {}

for i in range(len(C)):
    # +1 for the documents of class C[i], -1 for every other document
    y_i = np.array([1.0 if c == C[i] else -1.0 for c in doc_classes])
    w, b, s_vectors, s_alphas, s_labels = train_svm(X_train, y_i, 1)
    train_svm_per_class[C[i]] = (w, b, s_vectors, s_alphas, s_labels)

In [None]:
# Scratch notes kept as a string literal: nothing inside it is executed.
# It sketches selecting the samples with alpha > 1e-5 and deriving w and b from them.
# NOTE(review): the sketch would not run as code - it refers to `alpha_sv` although the
# name assigned above it is `sv_alphas`, and the `w = ...` line has an unmatched ")".
'''

sv = [alpha if alpha > 1e-5 else 0 for alpha in alphas]
ind = np.arange(len(alphas))[sv]
sv_alphas = alphas[sv]
sv_x = X[sv]
sv_y = Y[sv]

w = np.sum(alpha_sv * sv_y).reshape(-1,1 ) * sv_x, axis=0)
b = np.mean(sv_y - sv_x @ w)

'''