Dataset Loading and Preprocessing

In [92]:
# Gather every document of the movie_reviews corpus as a (tokens, category) pair
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        reviews.append((movie_reviews.words(fileid), category))

print("Number of reviews:", len(reviews))

# Preprocess: lowercase each token and rebuild the review as one space-joined string
preprocessed_reviews = [
    (' '.join(token.lower() for token in tokens), tag)
    for tokens, tag in reviews
]

# Split texts from targets: positive reviews are class 1, negative reviews are class 0
reviews_text = [text for text, _ in preprocessed_reviews]
labels = [int(tag == 'pos') for _, tag in preprocessed_reviews]


Number of reviews: 2000


Model Training & Predictions

* TF-IDF Vectorizer
* Support Vector Classifier (SVM): Model training, Prediction and Evaluation using Accuracy
* Cross-Validation Implementation: Stratified k-fold




In [93]:
# Cross-validation setup: stratified folds, shuffled with a fixed seed
n_folds = 5
stratified_kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=30)

# Per-fold accuracies, plus (text, true label, predicted label) triples that are
# kept so individual predictions can be printed afterwards
accuracy_scores = []
example_predictions = []

for train_index, test_index in stratified_kf.split(reviews_text, labels):
    # Materialise this fold's texts and labels from the index arrays
    X_train_fold = [reviews_text[i] for i in train_index]
    y_train_fold = [labels[i] for i in train_index]
    X_test_fold = [reviews_text[i] for i in test_index]
    y_test_fold = [labels[i] for i in test_index]

    # Fit TF-IDF on the training texts only; the held-out texts are merely
    # transformed with the vocabulary learned from the training fold
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_tfidf_fold = vectorizer.fit_transform(X_train_fold)
    X_test_tfidf_fold = vectorizer.transform(X_test_fold)

    # Train an SVC with default settings (fit returns the fitted estimator)
    # and predict the held-out fold
    svm_clf = SVC().fit(X_train_tfidf_fold, y_train_fold)
    y_pred_fold = svm_clf.predict(X_test_tfidf_fold)

    # Score this fold
    accuracy_fold = accuracy_score(y_test_fold, y_pred_fold)
    accuracy_scores.append(accuracy_fold)

    # Remember this fold's predictions for the sample printout
    example_predictions += list(zip(X_test_fold, y_test_fold, y_pred_fold))

# Average the per-fold accuracies
mean_accuracy = np.mean(accuracy_scores)
print("Mean Accuracy (Stratified K-Fold with SVM):", mean_accuracy)


Mean Accuracy (Stratified K-Fold with SVM): 0.8290000000000001


Sample Printing of the Model's Predictions

In [95]:
# Print a reproducible random sample of the stored predictions.
# A dedicated, seeded generator makes the same examples appear on every
# Restart & Run All and leaves the global `random` state untouched.
sample_rng = random.Random(30)

# random.sample raises ValueError when asked for more items than exist,
# so cap the request at the number of stored predictions.
n_examples = min(10, len(example_predictions))

print("\nExamples of model's classification:")
random_indices = sample_rng.sample(range(len(example_predictions)), n_examples)
for index in random_indices:
    # Each entry is a (review text, true label, predicted label) triple
    review, true_label, predicted_label = example_predictions[index]
    # Only the first 200 characters of the review are shown
    print("Review:", review[:200]," [...]")
    print("True Label:", true_label)
    print("Predicted Label:", predicted_label)
    print("-----------------------------")


Examples of model's classification:
Review: funny how your expectations can be defeated , and not in good ways . the ghost and the darkness promised ( at least , it seemed to me to promise ) a hemingwayesque showdown between men and nature . wh  [...]
True Label: 0
Predicted Label: 0
-----------------------------
Review: so here is the second of 1999 ' s remakes of classic horror movies . the first was the dumb , pathetic but ok remake of the classic ' the haunting ' . now comes the highly awaited remake of house on h  [...]
True Label: 1
Predicted Label: 1
-----------------------------
Review: anastasia contains something that has been lacking from all of the recent disney releases . . . ( especially hercules ) . . . emotion . all the wacky characters voiced by celebrities and fantastically  [...]
True Label: 1
Predicted Label: 1
-----------------------------
Review: capsule : gal is a 50s - ish london cockney gangster who has retired to spain . his old associates want him for one la