# CS345 Project

For our dataset, we will be using the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/). Because of its size, the dataset could not be uploaded to GitHub. Please download it yourself, extract it, and move the extracted `aclImdb` folder into the "data" directory, so that the reviews are found under `data/aclImdb/train` and `data/aclImdb/test`.

### Imports

In [None]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from tqdm import tqdm

### Load the data

In [None]:
def load_data(data_dir):
    """Load labelled reviews from ``data_dir`` into a DataFrame.

    Every ``.txt`` file in the ``pos`` and ``neg`` subdirectories of
    ``data_dir`` is read as UTF-8 text.

    Args:
        data_dir: Directory that contains ``pos`` and ``neg`` subdirectories.

    Returns:
        A pandas DataFrame with one row per review and two columns:
        ``review`` (the file's text) and ``sentiment`` (1 for ``pos``,
        0 for ``neg``). Rows are ordered ``pos`` first, then ``neg``, and
        sorted by filename within each group.
    """
    data = []
    for sentiment in ['pos', 'neg']:
        sentiment_dir = os.path.join(data_dir, sentiment)
        print(f"Processing '{sentiment}' reviews...")
        # os.listdir returns entries in arbitrary order, so sort to make the
        # row order reproducible between runs and machines. Filtering before
        # the loop keeps the progress bar total equal to the files really read.
        file_list = sorted(
            name for name in os.listdir(sentiment_dir) if name.endswith('.txt')
        )
        label = 1 if sentiment == 'pos' else 0
        # Use tqdm to create a progress bar as loading can take a while, we want to make sure it isn't hanging
        for filename in tqdm(file_list, desc=f"Loading {sentiment} files"):
            filepath = os.path.join(sentiment_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                review = f.read()
            data.append({
                'review': review,
                'sentiment': label,
            })

    # Naming the columns keeps them present even when no files were found.
    df = pd.DataFrame(data, columns=['review', 'sentiment'])
    print(f"Loaded {len(df)} reviews from '{data_dir}'")
    return df

# Load the train split first, then the test split, from the extracted dataset.
train_data, test_data = (
    load_data('data/aclImdb/train'),
    load_data('data/aclImdb/test'),
)

### Let's check out some of the data

In [None]:
# Show the first five raw training rows.
print(train_data.head(n=5))

### Data Preprocessing

In [None]:
# Fetch the NLTK corpora used below: the stopword list and WordNet.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

def preprocess_text(text, stop_words, lemmatizer):
    """Normalise one review into a space-separated string of tokens.

    Steps, in order: replace HTML tags with a space, drop every character
    that is not an ASCII letter or whitespace, lowercase, split on
    whitespace, remove stop words, and lemmatize each remaining token.

    Args:
        text: Raw review text.
        stop_words: Container of lowercase words to discard.
        lemmatizer: Object with a ``lemmatize(word)`` method.

    Returns:
        The processed tokens joined by single spaces (``''`` if none remain).
    """
    # Replace HTML tags with a space rather than nothing, so that words on
    # either side of a tag (e.g. "great.<br /><br />The") are not fused into
    # one token once punctuation is stripped.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize; split() with no argument also collapses repeated whitespace
    tokens = text.split()
    # Remove stopwords, then lemmatize what is left
    tokens = [
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    ]
    return ' '.join(tokens)

def preprocess_with_nltk(data):
    """Clean the ``review`` column of ``data`` using ``preprocess_text``.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    english_stop_words = set(stopwords.words('english'))
    wordnet_lemmatizer = WordNetLemmatizer()

    def clean(raw_review):
        # Bind the shared stop-word set and lemmatizer for each review.
        return preprocess_text(raw_review, english_stop_words, wordnet_lemmatizer)

    # Enables ``progress_apply`` so the long-running apply shows a progress bar.
    tqdm.pandas(desc="Preprocessing text")
    cleaned_reviews = data['review'].progress_apply(clean)
    data['review'] = cleaned_reviews
    return data

# Preprocess the train split first, then the test split.
train_data, test_data = (
    preprocess_with_nltk(train_data),
    preprocess_with_nltk(test_data),
)

Let's compare the data to what we had before preprocessing.

In [None]:
# Show the first five training rows after preprocessing.
print(train_data.head(n=5))

### We will use both TF-IDF and Bag of Words for feature extraction and compare the performance of the two.

In [None]:
def extract_tfidf_features(train_data, test_data, max_features=5000):
    """Build TF-IDF features from the ``review`` column of both splits.

    The vectorizer is fitted on the training reviews only and then applied
    to the test reviews.

    Args:
        train_data: DataFrame with a ``review`` column used for fitting.
        test_data: DataFrame with a ``review`` column to transform.
        max_features: Passed to ``TfidfVectorizer`` as ``max_features``.

    Returns:
        A tuple ``(train_matrix, test_matrix, vectorizer)``.
    """
    print("Extracting TF-IDF features...")
    vectorizer = TfidfVectorizer(max_features=max_features)
    train_matrix = vectorizer.fit_transform(train_data['review'])
    test_matrix = vectorizer.transform(test_data['review'])
    return train_matrix, test_matrix, vectorizer

def extract_bow_features(train_data, test_data, max_features=5000):
    """Build Bag of Words count features from the ``review`` column.

    The vectorizer is fitted on the training reviews only and then applied
    to the test reviews.

    Args:
        train_data: DataFrame with a ``review`` column used for fitting.
        test_data: DataFrame with a ``review`` column to transform.
        max_features: Passed to ``CountVectorizer`` as ``max_features``.

    Returns:
        A tuple ``(train_matrix, test_matrix, vectorizer)``.
    """
    print("Extracting Bag of Words features...")
    vectorizer = CountVectorizer(max_features=max_features)
    train_matrix = vectorizer.fit_transform(train_data['review'])
    test_matrix = vectorizer.transform(test_data['review'])
    return train_matrix, test_matrix, vectorizer

# Build both feature sets: Bag of Words first, then TF-IDF.
X_train_bow, X_test_bow, bow_vectorizer = extract_bow_features(
    train_data=train_data, test_data=test_data
)
X_train_tfidf, X_test_tfidf, tfidf_vectorizer = extract_tfidf_features(
    train_data=train_data, test_data=test_data
)

# Labels (1 = positive, 0 = negative) are shared by both feature sets.
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [None]:
# Compare dimensions and sparsity
def _sparsity_percent(matrix):
    """Return the percentage of cells in a sparse ``matrix`` with no stored value.

    Sparsity is ``1 - density``, where density is ``nnz`` divided by the total
    number of cells. Reporting ``nnz / total`` directly would be the density.
    """
    total_cells = matrix.shape[0] * matrix.shape[1]
    if total_cells == 0:
        # Avoid dividing by zero for a matrix with no rows or no columns.
        return 0.0
    return 100 * (1 - matrix.nnz / total_cells)


print("Bag of Words Feature Matrix:")
print(f"Shape: {X_train_bow.shape}")
print("Sparsity (BoW): {:.2f}%".format(_sparsity_percent(X_train_bow)))

print("\nTF-IDF Feature Matrix:")
print(f"Shape: {X_train_tfidf.shape}")
print("Sparsity (TF-IDF): {:.2f}%".format(_sparsity_percent(X_train_tfidf)))