James Quirk (james.f.quirk.25@dartmouth.edu) / CS 72 Final Project / 03/01/2024

In [2]:
# Necessary packages
import pandas as pd
import autograd.numpy as np
import statistics
from sklearn.model_selection import train_test_split

In [3]:
# Load the VAD lexicon into a lookup table keyed by word
def create_dict(filename="NRC-VAD-Lexicon.txt"):
    """Read a tab-separated VAD lexicon file into a dictionary.

    Every usable line holds at least four tab-separated fields: the word
    followed by three numeric scores. Later cells read those scores as
    valence, arousal and dominance, in that order.

    Args:
        filename: Path of the lexicon file. Defaults to the file name that
            used to be hard-coded, so existing ``create_dict()`` calls
            behave as before.

    Returns:
        dict mapping each word to ``[index, score1, score2, score3]``.
        ``index`` is the 0-based position of the word among the distinct
        words read so far; the bag-of-words cell uses it as a column index.

    Raises:
        ValueError: if a score field cannot be converted to float.
    """
    VAD = {}
    with open(filename, 'r', encoding='utf-8') as file:
        # Iterate through every line
        for line in file:
            info = line.strip().split('\t')

            # Skip blank or truncated lines instead of failing with IndexError
            if len(info) < 4:
                continue

            word = info[0]

            # Keep only the first occurrence of a word; later duplicates are ignored
            if word not in VAD:
                # len(VAD) is the next free index because one entry is added per new word
                VAD[word] = [len(VAD), float(info[1]), float(info[2]), float(info[3])]

    return VAD

# Build the lexicon table once; later cells read this module-level VAD dictionary
VAD = create_dict()

# Ensure that information loaded correctly
print(VAD["saturate"])

[15347, 0.281, 0.68, 0.467]


In [4]:
# Map every top-level news category name to its integer class id (1 through 17)
labels = {
    "crime, law and justice": 1,
    "arts, culture, entertainment and media": 2,
    "economy, business and finance": 3,
    "disaster, accident and emergency incident": 4,
    "environment": 5,
    "education": 6,
    "health": 7,
    "human interest": 8,
    "lifestyle and leisure": 9,
    "politics": 10,
    "labour": 11,
    "religion and belief": 12,
    "science and technology": 13,
    "society": 14,
    "sport": 15,
    "conflict, war and peace": 16,
    "weather": 17,
}

In [6]:
data = pd.read_csv("MN-DS-news-classification.csv")

# Extract all documents and their associated classes (news topics)
nX = data["content"]
ny = data["category_level_1"]

# Bag of words: X[i, j] counts how often lexicon word j occurs in document i.
# NOTE: X is a dense (number of documents x len(VAD)) float matrix, so it can be large.
X = np.zeros((len(nX), len(VAD)))
y = np.zeros(len(ny))
for i, doc in enumerate(nX):
    y[i] = labels[ny[i]]

    # Tokens are produced by whitespace splitting only, so a token with attached
    # punctuation (e.g. "crime,") matches only if that exact string is a lexicon key
    words = doc.lower().strip().split()

    for word in words:
        if word in VAD:
            # VAD[word][0] is the column index assigned to this word by create_dict
            X[i, VAD[word][0]] += 1

In [13]:
# Split into training and testing; the fixed seed makes the split (and every
# accuracy computed from it) reproducible across kernel restarts
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<font size="18">**k-NN without embeddings**</font>

In [14]:
# Expanded from my own implementation of k-NN from CS74 (written from scratch)

# Pass in X_train and X_test; every test point is compared against every training point
def knn_classifier(X_train, X_test):
    """Rank all training points by Euclidean distance for each test point.

    Despite its name this function does not predict classes. It only
    produces the distance rankings; ``with_k`` later looks up the labels of
    the first k entries.

    Args:
        X_train: 2-D array-like of training feature vectors, one per row.
        X_test: iterable of test feature vectors of the same width.

    Returns:
        A list with one entry per test point. Each entry is a list of
        ``(training_index, distance)`` tuples covering every training
        point, sorted by ascending distance. The sort is stable, so
        training points at equal distance keep their original order.
    """
    train = np.asarray(X_train)
    close_array = []  # One sorted (index, distance) list per test point

    for datapoint in X_test:
        # Distances to all training rows at once instead of one Python-level
        # iteration per row. This allocates one temporary the size of X_train.
        diff = train - datapoint
        diff *= diff  # square in place so no second temporary is needed
        distances = np.sqrt(diff.sum(axis=1))

        # Keep the training index next to each distance so the matching
        # training label can be looked up after sorting
        close_array.append(sorted(enumerate(distances), key=lambda pair: pair[1]))

    return close_array

# NOTE: despite its name, y_pred holds the per-test-point neighbour rankings returned
# by knn_classifier, not class predictions; with_k turns them into predictions
y_pred = knn_classifier(X_train, X_test)

In [14]:
# Expanded from my own implementation of k-NN from CS74 (written from scratch)

# Helper function that allows us to change the k value without having to re-run comparison between train and test
def with_k(dist_array, k, train_labels=None):
    """Turn neighbour rankings into one predicted class per test point.

    Args:
        dist_array: list with one entry per test point; each entry is a
            sequence of ``(training_index, distance)`` tuples sorted by
            ascending distance.
        k: number of nearest neighbours that vote.
        train_labels: labels indexed by ``training_index``. When omitted,
            the module-level ``y_train`` is used, as before. Pass it
            explicitly to avoid depending on whichever cell last rebound
            ``y_train``; other cells in this notebook reassign it.

    Returns:
        list of predicted classes, one per entry of ``dist_array``.

    Raises:
        IndexError: if an entry has fewer than ``k`` neighbours.
        statistics.StatisticsError: if ``k`` is 0 (there are no votes).
    """
    if train_labels is None:
        # Backward-compatible fallback to the notebook's global training labels
        train_labels = y_train

    pred_class = []
    for ele in dist_array:
        # Classes of the k closest training points, nearest first
        pred = [train_labels[ele[rank][0]] for rank in range(k)]

        # Majority vote. On Python 3.8+ statistics.mode resolves ties by returning
        # the first tied value encountered, i.e. the class of the nearer neighbour.
        pred_class.append(statistics.mode(pred))

    return pred_class

# Compute the predicted classes for k = 1 through 6 from the same neighbour rankings
pred_1, pred_2, pred_3, pred_4, pred_5, pred_6 = (
    with_k(y_pred, k) for k in range(1, 7)
)

In [11]:
# Extended from CS 74, I coded this from scratch

# Metric to evaluate the success of the multi-class classifier
def evaluate(y_actual, y_pred):
    """Return the accuracy: the fraction of predictions equal to the true label.

    Labels are compared after conversion with ``int``, so ``3.0`` and ``3``
    count as the same class.

    Args:
        y_actual: sequence of true class labels.
        y_pred: sequence of predicted class labels, in the same order.

    Returns:
        float between 0 and 1: correct predictions divided by total predictions.

    Raises:
        ValueError: if ``y_pred`` is empty (accuracy is undefined) or if the
            two sequences differ in length. Previously an empty input raised
            ZeroDivisionError, and surplus true labels were silently ignored.
    """
    total = len(y_pred)
    if total == 0:
        raise ValueError("evaluate() needs at least one prediction")
    if len(y_actual) != total:
        raise ValueError(
            "y_actual and y_pred must have the same length "
            f"(got {len(y_actual)} and {total})"
        )

    # Count the positions where the integer class ids agree
    success = sum(
        1 for actual, predicted in zip(y_actual, y_pred)
        if int(actual) == int(predicted)
    )

    return success / total  # Accuracy = success/total

In [148]:
# Score the predictions of every k against the held-out labels
accuracy, accuracy2, accuracy3, accuracy4, accuracy5, accuracy6 = (
    evaluate(y_test, predictions)
    for predictions in (pred_1, pred_2, pred_3, pred_4, pred_5, pred_6)
)

# Report one line per k
print("k = 1 accuracy:", accuracy)
print("k = 2 accuracy:", accuracy2)
print("k = 3 accuracy:", accuracy3)
print("k = 4 accuracy:", accuracy4)
print("k = 5 accuracy:", accuracy5)
print("k = 6 accuracy:", accuracy6)

k = 1 accuracy: 0.43223443223443225
k = 2 accuracy: 0.43223443223443225
k = 3 accuracy: 0.4409340659340659
k = 4 accuracy: 0.440018315018315
k = 5 accuracy: 0.4409340659340659
k = 6 accuracy: 0.43864468864468864


In [150]:
# Import and create confusion matrix
from sklearn.metrics import confusion_matrix
# Confusion matrix of the k = 3 predictions (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, pred_3)

# Print the heading and the matrix on separate lines
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[ 38   2   2   9   0   2   0   9   1   9   9   2   2  10   2   6   0]
 [  1  24   1   1   0   2   1   5   2   3   3   2   0   4   4   2   0]
 [  1   0  25   3   0   2   4   8   0  14   5   0   2   5   3   2   0]
 [  1   0   1  44   4   4   2  13   1   4   1   1   1   5   2   3   9]
 [  0   0   4  12  46   0   4  18   3  13   0   1   8   4   3   5   2]
 [  0   0   1   3   0  66   8   6   0   5   3  10  10   4   5   3   1]
 [  2   1   0   8   3   4  71  16   1  15   5   1   6  12   3   1   3]
 [  0   2   3  13   4   3   5  42   1  15   4   2   5   9   7   2   0]
 [  0   2   0   4   0   0   0   7  23   0   2   1   5   3   4   0   2]
 [  8   4   4  11   2   2   5  20   1  84  10   7   6   8   3  10   0]
 [  1   1   4   9   0   5   1  12   0   6  78   4   2   6   4   2   3]
 [  3   3   1  10   1   5   1  23   2  18   1  65   3  10   4   6   3]
 [  0   2   2   6   5   4  15  15   2  20   8   4  49   9   4   5   1]
 [  5   3   5  29   2   3  16  25   4  12  17   9  11  73  

<font size="18">**k-NN with embeddings**</font>

In [7]:
# Start from all-zero rows: one (valence, arousal, dominance) triple per document.
# NOTE: this rebinds X and y, replacing the bag-of-words features built earlier.
X = np.zeros((len(nX), 3))
y = np.zeros(len(ny))

# Fill each row with the document's mean V, A and D scores
for i, doc in enumerate(nX):
    y[i] = labels[ny[i]]
    words = doc.lower().strip().split()

    # Running sums over the words of this document that appear in the lexicon
    valence = arousal = dominance = 0
    count = 0
    for word in words:
        if word not in VAD:
            continue
        entry = VAD[word]
        valence += entry[1]
        arousal += entry[2]
        dominance += entry[3]
        count += 1

    # Convert sums to means; a document without any lexicon word keeps (0, 0, 0)
    if count:
        valence, arousal, dominance = valence / count, arousal / count, dominance / count

    X[i] = (valence, arousal, dominance)

In [8]:
# Similar function to the one found above

# Pass in X_train and X_test; every test point is compared against every training point
def knn_classifier_with_embeddings(X_train, X_test):
    """Rank all training points by Euclidean distance for each test point.

    The ranking does not depend on how the feature vectors were built, so
    this reuses ``knn_classifier`` instead of keeping a second copy of the
    same loop. ``knn_classifier`` is defined in the "k-NN without
    embeddings" section, whose cell must have been run first.

    Args:
        X_train: training feature vectors, one per row.
        X_test: test feature vectors of the same width.

    Returns:
        A list with one entry per test point; each entry is a list of
        ``(training_index, distance)`` tuples sorted by ascending distance.
    """
    return knn_classifier(X_train, X_test)

# Split into training and testing; the fixed seed makes the split (and every
# accuracy computed from it) reproducible across kernel restarts
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: y_pred receives the neighbour rankings for each test point, not class predictions
y_pred = knn_classifier_with_embeddings(X_train, X_test)

In [15]:
# Predicted classes for k = 1 through 5 and for k = 100, all from the same rankings
pred_1, pred_2, pred_3, pred_4, pred_5, pred_100 = (
    with_k(y_pred, k) for k in (1, 2, 3, 4, 5, 100)
)

In [16]:
# Score the predictions of every k against the held-out labels
accuracy, accuracy2, accuracy3, accuracy4, accuracy5, accuracy100 = (
    evaluate(y_test, predictions)
    for predictions in (pred_1, pred_2, pred_3, pred_4, pred_5, pred_100)
)

# Report one line per k
print("k = 1 accuracy:", accuracy)
print("k = 2 accuracy:", accuracy2)
print("k = 3 accuracy:", accuracy3)
print("k = 4 accuracy:", accuracy4)
print("k = 5 accuracy:", accuracy5)
print("k = 100 accuracy:", accuracy100)

k = 1 accuracy: 0.16758241758241757
k = 2 accuracy: 0.16758241758241757
k = 3 accuracy: 0.17261904761904762
k = 4 accuracy: 0.17261904761904762
k = 5 accuracy: 0.184981684981685
k = 100 accuracy: 0.2174908424908425


In [190]:
# Confusion matrix of the k = 100 predictions (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, pred_100)

# Print the heading and the matrix on separate lines
print("Confusion Matrix:", conf_matrix, sep="\n")

Confusion Matrix:
[[ 4  0  0  5  3  0  3  0  0 24  1  1  6 10  3 32  2]
 [ 0  0  0  1  1 10  1  0  4  3  2  1  2 17 13  2  2]
 [ 0  0  0  2  6  4  6  1  0 20  1  4 11  1  6  9  5]
 [ 3  0  0 15  2  2 10  0  1  4  1  1  4 10 10 16 16]
 [ 1  0  0  7 22  5  6  0  1 19  0  6 17 10  4  6 12]
 [ 0  0  0  0  3 44  1  5  1 22  3 13 25  7 14  4  0]
 [ 5  0  1 17  7 10 12  0  2 15  5  6 16 13 15 16  7]
 [ 2  0  0  3  3 12  0 22 18  3  1  3  2 22 19  2  5]
 [ 0  0  0  0  1  2  0  1 18  0  0  0  0 12 27  0  2]
 [ 0  0  1  5  7  8  8  4  2 70  9 12 21 11  4 38  4]
 [ 2  0  0  6 12 15  5  6  1 18  3  7 19 14 21  5 10]
 [ 9  0  0  2  3 17  4  3  2 37  3 10 20 12  7 20  1]
 [ 1  0  0  0 15 23  9  8  1 26  2  6 29 11 11 10  5]
 [ 4  0  0  7  6 17  8  8  2 22  5  8 16 53 34 18  8]
 [ 3  0  0  5  4 10  5  7  2 10  2  7 11 15 70  9  6]
 [ 3  0  0  4  3  5  8  1  1 42  3  3  9  9  7 55  0]
 [ 3  0  0  9  5  0  7  0  2  0  0  0  2  9  1  3 44]]


<font size="18">**SVM**</font>

In [9]:
# Extended from Lizzy's code on Logistic Regression

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

# Load the raw dataset for the sklearn model
df = pd.read_csv('MN-DS-news-classification.csv')

# Clean: drop every character that is neither alphanumeric nor whitespace, then lowercase
df['content'] = (
    df['content']
    .apply(lambda x: ''.join(char for char in x if char.isalnum() or char.isspace()))
    .map(lambda x: x.lower())
)

# Pull the relevant columns as plain Python lists
articles = df['content'].tolist()
topics = df['category_level_1'].tolist()

# Split to test and train.
# NOTE: this rebinds X_train, X_test, y_train and y_test from the k-NN sections;
# y_train becomes a list of category names, and with_k reads the global y_train.
X_train, X_test, y_train, y_test = train_test_split(articles, topics, test_size=0.2, random_state=42)

# Text to vector representation via bag of words; the vocabulary is fitted on the training split only
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# SVM classifier (the name `lr` is presumably left over from the logistic-regression code this cell extends)
# NOTE(review): max_iter caps the solver's iterations - confirm that training actually converges
lr = SVC(max_iter = 2000)
lr.fit(X_train, y_train)

# Predict the held-out articles (this also rebinds y_pred)
y_pred = lr.predict(X_test)

# Report accuracy and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
cm = confusion_matrix(y_test, y_pred)
print(cm)

Accuracy: 0.5146520146520146
[[  5   3   0   0   0   0   0   0   2   1   0   5   3   3  44   2   0]
 [  0 116   3   2   0   0   0   0   1   0   0  10   3   3  23   0   0]
 [  0   4  42   5   0   1   0   1   1   1   0  12   0   1  33   0   0]
 [  1   6   2  36   1   0   2   0   0   0   0   8   0   1  28   1   7]
 [  0   4   0   0  20   0   1   0   0   6   0  18   0   8  25   2   0]
 [  0   0   1   1   0  62   0   0   0   0   0   2   6   7  32   0   0]
 [  0   2   0   2   1   0  75   1   0   0   1   4   3  11  24   1   0]
 [  0   2   0   2   0   1   0  58   0   1   1   5   0  21  45   0   0]
 [  0   5   0   2   0   1   3   1  20   0   5   0   3   6  65   7   0]
 [  0   2   3   0   0   2   0   1   0  64   1  11   0   7  36  13   0]
 [  0   0   0   0   0   0   0   1   1   0  25   1   2   4  26   5   0]
 [  0  17   1   0   1   0   2   2   0   2   0  86   1  13  53   0   0]
 [  0  16   2   2   0   0   0   0   1   0   1   9  86   4  58   0   0]
 [  0   1   2   1   0   4   3   3   2   0   1   

In [8]:
# Show the first 30 SVM predictions to see how often "society" is predicted
print(y_pred[:30])

['society' 'health' 'society' 'society' 'weather' 'lifestyle and leisure'
 'society' 'education' 'health' 'human interest' 'society' 'society'
 'society' 'society' 'religion and belief' 'society' 'sport' 'labour'
 'society' 'sport' 'politics' 'education' 'society' 'society' 'society'
 'science and technology' 'conflict, war and peace' 'politics'
 'economy, business and finance' 'conflict, war and peace']
