In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import os 

def create_bow(directories=None, vectorizer=None):
    """Load rated review files and turn them into a bag-of-words matrix.

    Each review file is expected to be named ``<id>_<rating>.txt``; the
    integer after the underscore is used as the review's rating. Ratings
    >= 7 are labelled 1, ratings <= 4 are labelled 0, and anything in
    between is dropped entirely.

    Parameters
    ----------
    directories : iterable of str, optional
        Folders to scan for review files. Defaults to
        ``['train/pos', 'train/neg', 'test/pos', 'test/neg']``.
    vectorizer : object with a ``fit_transform(list_of_str)`` method, optional
        Defaults to ``CountVectorizer(stop_words='english', ngram_range=(1, 1))``.

    Returns
    -------
    total_words
        Whatever ``vectorizer.fit_transform(text)`` returns (one row per
        kept review).
    score : list of int
        Binary labels (1 or 0), aligned with ``text``.
    original_score : list of int
        Ratings parsed from the file names, aligned with ``text``.
    text : list of str
        Full contents of every kept review file.
    """
    if directories is None:
        directories = ['train/pos', 'train/neg', 'test/pos', 'test/neg']
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 1))

    text = []
    score = []
    original_score = []

    for directory in directories:
        print(f'Current directory: {directory}')
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded split done later) reproducible.
        train_contents = sorted(os.listdir(directory))

        print(f'Total reviews in {directory}: {len(train_contents)}')

        for file_name in train_contents:
            stem, extension = os.path.splitext(file_name)
            if extension.lower() != '.txt':
                continue  # not a review file (e.g. OS metadata), ignore it

            rating = int(stem.split('_')[1])

            if rating >= 7:
                label = 1
            elif rating <= 4:
                label = 0
            else:
                continue  # don't consider reviews that are neutral rated

            text_file_path = os.path.join(directory, file_name)
            with open(text_file_path, 'r', encoding='utf-8') as f:
                # read() rather than readline(): keep the whole review even
                # if the file spans several lines.
                text_review = f.read()

            # Append to all three lists together, only for kept reviews, so
            # they always stay the same length and index-aligned.
            text.append(text_review)
            score.append(label)
            original_score.append(rating)

        print(f'Finished {directory}\n')

    total_words = vectorizer.fit_transform(text)

    print(f'Total data shape: {total_words.shape}')
    print('Finished running.')

    return total_words, score, original_score, text

X, y, score, text = create_bow() 

# X     : document-term count matrix built by CountVectorizer
#         (English stop words removed, unigrams only)
# y     : binary sentiment labels derived from the ratings
#         (1 for ratings >= 7, 0 for ratings <= 4)
# score : original integer ratings parsed from the review file names
# text  : list of the review texts that were fed to the vectorizer

# Expected dataset layout (not checked by this cell — TODO confirm against the files on disk):
# TRAIN DATA : total 25_000 -> 12_500 pos and 12_500 neg
# TEST DATA : total 25_000 -> 12_500 pos and 12_500 neg 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

seed = 42
tfidf_vectorizer = TfidfVectorizer()
tf_X = tfidf_vectorizer.fit_transform(text)

# score is the original movie review scores; it needs one entry per row of
# tf_X for the scatter plot below to work.
# Slice the first feature column BEFORE densifying: calling .toarray() on the
# whole sparse matrix would allocate a dense (n_reviews x vocabulary) array
# just to read a single column.
first_feature_tfidf = tf_X[:, 0].toarray().ravel()

plt.figure(figsize=(8, 6))
plt.scatter(score, first_feature_tfidf, color='blue')
plt.title('Review Score vs TF-IDF')
plt.xlabel('Review Score')
plt.ylabel('TF-IDF')
plt.grid(True)
plt.show()

# tf_X pools every review that was loaded, so the splits below replace the
# on-disk train/test folders: 20% is held out as the final test set, then 25%
# of the remaining 80% becomes the validation set -> 60/20/20 overall.
# NOTE(review): tf_X comes from a vectorizer fitted on the full corpus before
# splitting, so its IDF weights have seen validation/test documents — consider
# fitting the vectorizer on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(
    tf_X, y, test_size=0.2, random_state=seed
)
X_tr, X_valid, y_tr, y_valid = train_test_split(
    X_train, y_train, test_size=0.25, random_state=seed
)

# Candidate neighbour counts and one accuracy slot per candidate.
testing_k = [1, 5, 10, 25, 50, 75, 100]
acc_tr = np.zeros(len(testing_k))
acc_valid = np.zeros(len(testing_k))

for i, k in enumerate(testing_k):
    print("Starting kNN Classifier for", k, "neighbors")
    knn = KNeighborsClassifier(n_neighbors=k, n_jobs=-1).fit(X_tr, y_tr)

    # Accuracy on the held-out validation split.
    valid_predictions = knn.predict(X_valid)
    valid_accuracy = accuracy_score(y_valid, valid_predictions)
    acc_valid[i] = valid_accuracy
    print("Validation Accuracy (k = " + str(k) + "):", valid_accuracy)

    # Accuracy on the data the model was fitted on, for comparison.
    tr_predictions = knn.predict(X_tr)
    tr_accuracy = accuracy_score(y_tr, tr_predictions)
    acc_tr[i] = tr_accuracy
    print("Training Accuracy (k = " + str(k) + "):", tr_accuracy, "\n")

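# Note: computing the accuracies takes a while.
# Testing accuracy was 0.2756 when the original movie review scores (rather than the 0/1 labels) were used as targets
#   -> the original scores look better on the plot, but the 0/1 labels may be the better choice for measuring accuracy.
# Took about 15 minutes each.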

In [None]:
# Validation vs. training accuracy for every k that was tried.
figure, axes = plt.subplots(1, figsize=(6, 6))
for accuracies, colour, series_label in (
    (acc_valid, 'green', 'Validation'),
    (acc_tr, 'red', 'Training'),
):
    axes.plot(testing_k, accuracies, marker='o', c=colour, label=series_label)
axes.set(xlabel='k Neighbors', ylabel='Accuracy Rate')

# Assigning the result keeps the Legend repr out of the cell output.
legend = axes.legend()

In [None]:
# Pick the k with the highest validation accuracy. np.argmax returns the first
# maximum, so on ties the earliest entry of testing_k wins — the same result a
# strict "<" scan gives, but without hard-coding how many k values were tried.
best_index = int(np.argmax(acc_valid))
optimal_k = testing_k[best_index]
optimal_acc = acc_valid[best_index]

# Refit with the chosen k and score once on the untouched test split.
print("Running test data with optimal k value of", optimal_k)
knn = KNeighborsClassifier(n_neighbors=optimal_k, n_jobs=-1)
knn.fit(X_tr, y_tr)
test_predictions = knn.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Testing Accuracy (k = " + str(optimal_k) + "):", test_accuracy)