In [39]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import accuracy_score

# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# reused as `random_state` when the classifier is constructed further down.
seed = 1234
np.random.seed(seed)

In [28]:
def load_data_set(path):
    """Load labelled reviews from the ``pos`` and ``neg`` sub-folders of ``path``.

    Every ``*.txt`` file in ``<path>/pos`` is read as a review with label 1,
    followed by every ``*.txt`` file in ``<path>/neg`` with label 0. Files
    with any other extension are ignored. After loading, the reviews and
    labels are handed to ``store_results`` to be written to
    ``<path>_X.txt`` and ``<path>_y.txt``.

    Args:
        path: Directory that contains the ``pos`` and ``neg`` sub-folders.

    Returns:
        A tuple ``(imdb_X, imdb_y)``: the list of review texts and the list
        of integer labels, in matching order.

    Raises:
        FileNotFoundError: If ``<path>/pos`` or ``<path>/neg`` does not exist
            (propagated from ``os.listdir``).
    """
    imdb_X = []
    imdb_y = []

    # One loop for both classes; positives are loaded first, then negatives.
    for folder_name, label in (('pos', 1), ('neg', 0)):
        folder = os.path.join(path, folder_name)
        # os.listdir returns entries in arbitrary order, so sort the names to
        # make the sample order identical across machines and runs.
        for filename in sorted(os.listdir(folder)):
            if filename.endswith(".txt"):
                with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                    imdb_X.append(file.read())
                imdb_y.append(label)

    store_results(f'{path}_X.txt', imdb_X)
    store_results(f'{path}_y.txt', imdb_y)

    return imdb_X, imdb_y

def store_results(filename, data_list):
    """Write the items of ``data_list`` to a new text file, one item per line.

    The file is opened in exclusive-creation mode (``'x'``), so an existing
    file is never overwritten. Outcomes, including failures, are reported on
    stdout instead of being raised to the caller.

    Args:
        filename: Path of the file to create.
        data_list: Iterable of items to write. Each item is converted with
            ``str()`` and followed by a newline. An item that itself contains
            newlines will therefore span several lines of the file.

    Returns:
        None.
    """
    try:
        with open(filename, 'x', encoding='utf-8') as f:
            # str() yields the same text as before for str and int items, and
            # also handles an empty list and other element types, which
            # previously raised IndexError or silently wrote nothing.
            f.writelines(f"{item}\n" for item in data_list)
        print(f"File '{filename}' created and content written successfully.")
    except FileExistsError:
        print(f"File '{filename}' already exists. No new content was written.")
    except Exception as e:
        # Best-effort export: report the problem but do not abort the caller.
        print(f"An error occurred: {e}")

# Read the train and test splits from disk. Each call also asks store_results
# to write `<path>_X.txt` / `<path>_y.txt`, which is skipped if they exist.
train_reviews, train_labels = load_data_set('aclImdb/train')
test_reviews, test_labels = load_data_set('aclImdb/test')

# Sanity check: number of reviews loaded per split.
print(f"Training reviews: {len(train_reviews)}")
print(f"Test reviews: {len(test_reviews)}")

File 'aclImdb/train_X.txt' already exists. No new content was written.
File 'aclImdb/train_y.txt' already exists. No new content was written.
File 'aclImdb/test_X.txt' already exists. No new content was written.
File 'aclImdb/test_y.txt' already exists. No new content was written.
Training reviews: 25000
Test reviews: 25000


In [44]:
def preprocess(text):
    """Normalise a raw review to lowercase ASCII letters and whitespace.

    Steps, in order:
      1. Lowercase the text.
      2. Replace HTML line-break tags (``<br />``, ``<br/>``, ``<br>``) with a
         single space so the words on either side stay separated.
      3. Delete every remaining character that is not ``a``-``z`` or
         whitespace. Characters are removed, not replaced, so ``"don't"``
         becomes ``"dont"`` and ``"well-made"`` becomes ``"wellmade"``.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Runs of whitespace are not collapsed.
    """
    text = text.lower()
    # Accept every common spelling of the tag; matching only '<br />' would
    # let '<br>' or '<br/>' fall through and leave a stray "br" in the text.
    text = re.sub(r'<br\s*/?>', ' ', text)
    # After lower() no ASCII uppercase letter can remain, so a-z is enough.
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Run the text normaliser over every review of both splits. The cleaned lists
# replace the raw ones under the same names, which later cells read.
train_reviews = list(map(preprocess, train_reviews))
test_reviews = list(map(preprocess, test_reviews))

# Cleaning must not drop or add reviews, so the counts should be unchanged.
print(f"Training reviews: {len(train_reviews)}")
print(f"Test reviews: {len(test_reviews)}")

Training reviews: 25000
Test reviews: 25000


In [45]:
# TF-IDF features over unigrams and bigrams, with English stop words removed
# and the vocabulary capped at 10000 entries.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# The vocabulary and IDF weights are fitted on the training split only; the
# test split is merely transformed with them.
X_tr = vectorizer.fit_transform(train_reviews)
X_te = vectorizer.transform(test_reviews)

print(f"Training reviews: {X_tr.shape}")
print(f"Test reviews: {X_te.shape}")

Training reviews: (25000, 10000)
Test reviews: (25000, 10000)


In [46]:
# Bind the label arrays in this cell: no visible cell assigns y_tr / y_te, so
# on a fresh kernel (Restart & Run All) they would raise NameError. The labels
# come from the loader cell and are in the same order as the rows of X_tr / X_te.
y_tr = np.asarray(train_labels)
y_te = np.asarray(test_labels)

# Linear SVM on the TF-IDF features; random_state reuses the notebook seed.
learner = LinearSVC(C=1.0, class_weight='balanced', random_state=seed, max_iter=2000)
learner.fit(X_tr, y_tr)

y_pred = learner.predict(X_te)

# Fraction of test reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_te, y_pred)
print(f"Linear SVM Test Accuracy: {accuracy}")

Linear SVM Test Accuracy: 0.86588
