# Movie Reviews Sentiment Analysis Project
### Dataset: https://ai.stanford.edu/~amaas/data/sentiment/

In [5]:
# Revised final code:

# load packages
import os
import re
from collections import defaultdict


base_path = os.path.expanduser('~/Downloads/aclImdb/train')
pos_folder = os.path.join(base_path, 'pos')
neg_folder = os.path.join(base_path, 'neg')

# make lists - I thought of as many words in both categories as I could that would make sense.
pos_words = ["brilliant", "charming", "stunning", "remarkable", "vivid", "breathtaking", "mindblowing", "inspiring", "gripping", "heartfelt", "immersive", "memorable", 
             "masterpiece" "impressive", "superb", "lovable", "clever", "witty", "satisfying", "topnotch", "well done", "mustwatch", "tender", "flawless", 
             "incredible", "soulful", "amazing", "happy", "moving", "great", "astonishing", "astonished", "exciting", "majestic", "well", "fun", "good acting", "quality","hot",
             "interesting", "entertaining", "fantastic", "captivating", "good", "heartwarming", "original", "moving", "touching", "best", "most", 
             "powerful", "engaging", "funny", "hilarious", "insightful", "creative", "delightful", "beautiful", "cinematic", "worth it", "outstanding", 
             "phenomenal", "like", "liked", "enjoyed", "enjoy", "bold", "love", "loved every minute", "kept me hooked", "highly recommend"]
neg_words = ["not exciting", "cringy", "messy", "incoherent", "unbearable", "forgetful", "pissed", "no sense", "out of touch", "uncertain", "forgettable", "annoying", "letdown", "all over the place", "dreadful", "mindnumbing", "overhyped", "tedious", "flat", "lifeless", "uninspired", "hollow",
             "lackluster", "pointless", "mishandled", "cheesy", "overacted", "sluggish", "dismal", "offensive", "downhill", "cringed", "cringe", "confusing", "irritating", "eyerolling", "tonedeaf", "soulless","boring", 
             "bad", "terrible", "lame", "bomb", "fail", "not interesting", "bore", "cliche", "predictable", "wouldnt", "least", "shouldnt", "would not", "should not", "low", "never", "cheap", 
             "waste of time", "worst", "dont", "idiotic", "bother", "mediocre", "shitty", "fucked up", "wasnt good", "unrealistic", "unoriginal", "bland", "stupid", "disgusting", "failure", "upset", "upsetting", "disappointed", 
             "disappointing", "disappointment", "below average", "mid", "unsatisfactory", "awful", "not good", "weak", "lacking", "lacks", "poor", "poorly", "unconvincing", "hated", "disliked", "dislike", 
             "didnt"]
# make function to analyze text, containing the rule for categorizing a word as either positive or negative
def analyze_sentiment(text):
    words = re.findall(r'\b\w+\b', text.lower())
    pos_count = sum(1 for word in words if word in pos_words)
    neg_count = sum(1 for word in words if word in neg_words)
    
    if pos_count > neg_count:
        return 1, pos_count, neg_count    # positive - any text file that has more positive words than negative words is positive.
    else:
        return -1, pos_count, neg_count    # negative - any text file that has less or equal positive words than negative is negative. 
    # The reason why I included equal is because there are some phrases like "not good" or "not fun" that are made up of both a pos. and a neg. word when it is technically a negative phrase.
results = []

# applying the function to the positive reviews folder and appending the results into the list:
for filename in os.listdir(pos_folder):
    if filename.endswith('.txt'):
        filepath = os.path.join(pos_folder, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
            sentiment, pos, neg = analyze_sentiment(content)
            results.append({
                'filename': filename,
                'folder': 'pos',
                'true_label': 1,
                'predicted_label': sentiment,
                'pos_count': pos,
                'neg_count': neg})

# applying the function to the negative reviews folder and appending the results into the list:
for filename in os.listdir(neg_folder):
    if filename.endswith('.txt'):
        filepath = os.path.join(neg_folder, filename)
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
            sentiment, pos, neg = analyze_sentiment(content)
            results.append({
                'filename': filename,
                'folder': 'neg',
                'true_label': -1,
                'predicted_label': sentiment,
                'pos_count': pos,
                'neg_count': neg})
# test
print("Test:")
for entry in results[:3]:
    print(entry)

# save the results list as a dataframe as a csv file 
import pandas as pd
df = pd.DataFrame(results)
df.to_csv('sentiment_results.csv', index=False)

Test:
{'filename': '4715_9.txt', 'folder': 'pos', 'true_label': 1, 'predicted_label': 1, 'pos_count': 2, 'neg_count': 0}
{'filename': '12390_8.txt', 'folder': 'pos', 'true_label': 1, 'predicted_label': 1, 'pos_count': 6, 'neg_count': 0}
{'filename': '8329_7.txt', 'folder': 'pos', 'true_label': 1, 'predicted_label': -1, 'pos_count': 0, 'neg_count': 0}


In [294]:
# count total review accuracy and accuracy by category
import pandas as pd

results = pd.read_csv('sentiment_results.csv')
correct_pos = sum((results['folder'] == 'pos') & (results['predicted_label'] == 1))
correct_neg = sum((results['folder'] == 'neg') & (results['predicted_label'] == -1))
total_pos = sum(results['folder'] == 'pos')
total_neg = sum(results['folder'] == 'neg')
total_files = total_pos + total_neg

accuracy = (correct_pos + correct_neg) / total_files
print(f"Accurate positive reviews: {correct_pos}/{total_pos}")
print(f"Accurate negative reviews: {correct_neg}/{total_neg}")
print(f"Overall accuracy: {accuracy:.1%}")

Accurate positive reviews: 11261/12500
Accurate negative reviews: 5750/12500
Overall accuracy: 68.0%


## Conclusions: 
It is harder to determine if a comment is negative because there are so many ambiguous words that can be used to express a reviewer's dissatisfaction with a film, because they can mention specific scenes or details about the movie that they didn't like, while for positive reviews, usually they mention the same adjectives like "amazing", "fantastic", or "great" to describe the film overall. Additionally, filtering the text in the beginning to remove all capitalization and special symbols help prevent discrepancies and misses from occurring during the process.