In [45]:
import math
import os
import re
from collections import Counter, defaultdict
import numpy as np
def read_20_newsgroups(directory):
    print(f' Reading 20 Newsgroups dataset from {directory}...')
    word_counter = Counter()
    dataset = {}
    document_count = defaultdict(int)  # Track how many documents contain each word (DF)
    total_documents = 0  # Total number of documents

    # To store TF and DF
    tf = defaultdict(list)  # Term Frequency for each class and document

    for curr_dir, classes, files in os.walk(directory):
        curr_class = curr_dir.rsplit('/', 1)[-1]
        dataset[curr_class] = []
        for file in files:
            total_documents += 1
            file_path = os.path.join(curr_dir, file)
            read_file = []
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    for line in f:
                        bad = ['Newsgroup', 'document_id', 'From', 'Subject']
                        if any(line.startswith(badd) for badd in bad):
                            continue
                        
                        # Process each line
                        words = re.findall(r'\b\w+\b', line.lower())  # Convert to lowercase to count case-insensitively
                        word_counter.update(words)  # Update the counter with words from the line
                        read_file.extend(words)
                        
                    unique_words = set(read_file)  # Get unique words for this document
                    for word in unique_words:
                        document_count[word] += 1  # Increment DF for the word

            except Exception as e:
                print(f"Error reading {file_path}: {e}")
            dataset[curr_class].append(read_file)
    print(f' Finished reading {total_documents} documents from {len(dataset)} classes.')
    # Remove top 300 most common words (stop words)
    stop_words = set([word[0] for word in word_counter.most_common(300)])
    del dataset['20_newsgroups']
    for c, files in dataset.items():
        for file in files:
            file[:] = [word for word in file if word not in stop_words]

    # Use the next 500 most common words
    next_500_words = set(word[0] for word in word_counter.most_common(800)[300:800])
    for c, files in dataset.items():
        for file in files:
            file[:] = [word for word in file if word in next_500_words]
    print(f'finished sorting out the words')
    # Compute and store TF and DF
    for c, files in dataset.items():
        for file in files:
            word_freq_in_file = Counter(file)
            total_words_in_file = len(file)
            
            file_tf = {}  # To store TF for this document
            
            for word in file:
                # Calculate TF for the current word
                tf_value = word_freq_in_file[word] / total_words_in_file
                file_tf[word] = tf_value
            
            # Append the TF for this document to the list for the class
            tf[c].append(file_tf)

    # DF is already tracked in `document_count`, which is the document frequency
    df = document_count

    # Now compute TF-IDF for each word in each document
    tf_idf = defaultdict(list)
    N = total_documents  # Total number of documents

    for c, files in dataset.items():
        for file_tf in tf[c]:
            
            # Initialize a vector of zeros with a length equal to next_500_words
            tfidf_vector = np.zeros(len(next_500_words))
            
            # Create an index mapping for the next_500_words
            word_to_index = {word: idx for idx, word in enumerate(next_500_words)}
            
            # For each word in the document, compute TF-IDF and place it in the correct position
            for word, tf_value in file_tf.items():
                if word in next_500_words:
                    df_value = df[word]  # Get document frequency for the word
                    idf_value = math.log10(N / df_value)  # Calculate IDF
                    tfidf_value = math.log10(1 + tf_value) * idf_value  # Apply the TF-IDF formula
                    
                    # Place the TF-IDF value in the corresponding index
                    tfidf_vector[word_to_index[word]] = tfidf_value
            
            # Append the fixed-length TF-IDF vector to the list for the class
            tf_idf[c].append(tfidf_vector)

    print(f' Finished computing TF-IDF for {N} documents.')
    # print(f'TF-IDF : \n {tf_idf}')
    # print(f'TF : \n {tf}')
    # print(f'DF : \n {df}')
    C = list(dataset.keys())  # List of classes
    return C, tf, df, tf_idf, dataset # treat tf_idf as w

# Call the function
Classes, tf, df, tf_idf, D = read_20_newsgroups("/home/jems/cmsc422/p1/20_newsgroups")


 Reading 20 Newsgroups dataset from /home/jems/cmsc422/p1/20_newsgroups...
 Finished reading 19997 documents from 21 classes.
finished sorting out the words
 Finished computing TF-IDF for 19997 documents.


In [46]:
for c in tf_idf:
    print(f'{c} : {len(tf_idf[c])}')

# soc.religion.christian has 3 less documents than the rest so we append 3 filler arrays to make it 500
filler = np.zeros(500)
for i in range(3):
    tf_idf['soc.religion.christian'].append(filler)

for c in tf_idf:
    for file in tf_idf[c]:
        if len(file) != 500:
            print(f'{c} : {len(file)}')

sci.crypt : 1000
sci.space : 1000
comp.sys.mac.hardware : 1000
soc.religion.christian : 997
talk.religion.misc : 1000
talk.politics.misc : 1000
comp.os.ms-windows.misc : 1000
comp.windows.x : 1000
misc.forsale : 1000
comp.sys.ibm.pc.hardware : 1000
rec.sport.hockey : 1000
rec.motorcycles : 1000
comp.graphics : 1000
rec.sport.baseball : 1000
sci.electronics : 1000
sci.med : 1000
talk.politics.guns : 1000
talk.politics.mideast : 1000
alt.atheism : 1000
rec.autos : 1000


In [59]:
print(f'{len(Classes)}')

print(f'{len([np.array(tf_idf[c]) for c in Classes])}')

20
20


In [68]:
class_dict = {c: i for i, c in enumerate(Classes)}

temp = np.array([np.array(tf_idf[c]) for c in Classes])

print({len(temp[i]) for i in range(len(temp))})
X = []
for i in range(len(temp)):
    dot_product = []
    for j in range(len(temp[i])):
        dot_product.append(np.dot(temp[i][j], temp[i][j]))
    X.append(np.array(dot_product))
X = np.array(X)
print(X.shape)
train, test = [], []
for i in range(len(X)):
    train.append(X[i][len(X[i])//2:])
    test.append(X[i][:len(X[i])//2])

train = np.array(train)
test = np.array(test)

Y = np.full((len(Classes), len(Classes)), -1)

for i in range(len(Classes)):
    Y[i][i] = 1


{1000}
(20, 1000)


In [95]:
from qpsolvers import solve_qp
import osqp
from scipy import sparse
def train_svm(X, Y, C):
    samples, features = X.shape
    Y = Y.astype(float)
    K = np.dot(X, X.T)
    print(K.shape)
    P = np.outer(Y, Y) * K 
    epsilon = 1e-6
    P += np.eye(samples) * epsilon
    P_sparse = sparse.csr_matrix(P)
    G = np.vstack((np.eye(samples) * -1, np.eye(samples)))
    G_sparse = sparse.csr_matrix(G)
    H = np.vstack((np.zeros(samples), np.full(samples, C))).flatten()
    Q = np.full(samples, -1)
    A = Y.reshape(1, -1)
    A_sparse = sparse.csr_matrix(A)
    b = np.array([0.0])
    
    prob = osqp.OSQP()
    prob.setup(P=P_sparse, q=Q, A=A_sparse, l=b, u=b)
    res = prob.solve()
    alphas = res.x

    

    # alphas = solve_qp(P, Q, G, H, A, b, solver='osqp')
    #alphas = [a if a > 1e-5 else 0 for a in alphas]
    w = np.sum(alphas[:, None] * Y[:, None] * X, axis=0)

    # Find support vectors (where 0 < alpha_i < C)
    support_vector_indices = np.where((alphas > 1e-5) & (alphas < C))[0]
    support_vectors = X[support_vector_indices]
    support_alphas = alphas[support_vector_indices]
    support_labels = Y[support_vector_indices]

    # Compute the bias term b using support vectors with meaningful alphas
    epsilon = 1e-5
    margin_sv = support_alphas > epsilon
    b = np.mean([y_i - np.dot(w, x_i) for (y_i, x_i) in zip(support_labels[margin_sv], support_vectors[margin_sv])])

    return w, b, support_vectors, support_alphas, support_labels





20
(20, 1000, 500)


In [99]:
c = float(100)
train_svm_per_class = {}
for i, clas in enumerate(Classes):
    w, b, s_vectors, s_alphas, s_labels = train_svm(train, Y[i], c)
    train_svm_per_class[clas] = {'w': w, 'b': b, 's_vectors': s_vectors, 's_alphas': s_alphas, 's_labels': s_labels}

#print(train_svm_per_class)


(20, 20)
-----------------------------------------------------------------
           OSQP v0.6.3  -  Operator Splitting QP Solver
              (c) Bartolomeo Stellato,  Goran Banjac
        University of Oxford  -  Stanford University 2021
-----------------------------------------------------------------
problem:  variables n = 20, constraints m = 1
          nnz(P) + nnz(A) = 230
settings: linear system solver = qdldl,
          eps_abs = 1.0e-03, eps_rel = 1.0e-03,
          eps_prim_inf = 1.0e-04, eps_dual_inf = 1.0e-04,
          rho = 1.00e-01 (adaptive),
          sigma = 1.00e-06, alpha = 1.60, max_iter = 4000
          check_termination: on (interval 25),
          scaling: on, scaled_termination: off
          warm start: on, polish: off, time_limit: off

iter   objective    pri res    dua res    rho        time
   1  -3.0190e+01   8.96e-03   6.00e-01   1.00e-01   1.56e-04s
  25  -4.7160e+01   1.04e-06   2.84e-06   1.00e-01   1.79e-04s

status:               solved
number of