### IMDb Movie Reviews

In [2]:
#imports

import pandas as pd
import os
import random
import sys
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:
data_dir = r"C:\Users\bbuser\Downloads\aclImdb_v1 (1)\aclImdb"

def load_imdb_with_rating_v2(base_dir, subset="train", sample_size=5000):
    data_records = []

    categories = {"pos": 1, "neg": 0}  # map labels to 1/0
    
    for label, label_value in categories.items():
        path = os.path.join(base_dir, subset, label)
        
        # list all files in pos/neg folder
        all_files = os.listdir(path)
        
        # adjust if sample_size is larger than available
        if sample_size > len(all_files):
            sample_size = len(all_files)
        
        filenames = random.sample(all_files, sample_size)

        for filename in filenames:
            # filename looks like "12345_7.txt"
            file_id, rating_str = filename.split("_")
            rating = int(rating_str.split(".")[0])  # extract rating number
            
            file_path = os.path.join(path, filename)
            with open(file_path, encoding="utf-8") as f:
                review_text = f.read()
            
            data_records.append({
                "id": int(file_id),
                "rating": rating,
                "txt": review_text,
                "label": label_value
            })

    return pd.DataFrame(data_records)

# we are takaing 5000 sample 
df_subset = load_imdb_with_rating_v2(data_dir, subset="train", sample_size=5000)

print(f"Shape: {df_subset.shape}")
df_subset.head()


Shape: (10000, 4)


Unnamed: 0,id,rating,txt,label
0,7925,7,"Silly movie is really, really funny. Yes, it's...",1
1,6480,10,This was by far the best war documentary ever ...,1
2,12201,8,"Made in 1946 and released in 1948, The Lady an...",1
3,7241,8,Paul Lukas played a Russian intellectual makin...,1
4,10198,8,I'm grateful to Cesar Montano and his crew in ...,1


### data cleaning 

In [14]:
#telling NLTK (Natural Language Toolkit) to download extra language resources that aren’t included by default
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

# Prepare stopwords and lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_review(text):
    """Clean and preprocess a single IMDb review."""
    # 1. Lowercase
    text = text.lower()
    
    # 2. Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # 3. Remove URLs and emails
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    
    # 4. Remove punctuation, numbers, emojis
    text = re.sub(r"[^a-z\s]", "", text)
    
    # 5. Tokenize
    tokens = text.split()
    
    # 6. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # 7. Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # 8. Keep meaningful tokens (>2 chars)
    tokens = [word for word in tokens if len(word) > 2]
    
    return " ".join(tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...


In [17]:
# Take a small sample of your raw reviews
sample_df = df_subset.sample(3, random_state=42)  # 3 random reviews
#we are tring to see an example in real time 
for idx, row in sample_df.iterrows():
    raw_text = row["txt"]
    cleaned_text = clean_review(raw_text)
    
    print("------------------Original Review-------------------:")
    print(raw_text[:300] + "..." if len(raw_text) > 300 else raw_text)  # show first 300 chars
    print("\n------------------------ Cleaned Review----------------------:")
    print(cleaned_text)
    print("="*80)


------------------Original Review-------------------:
I liked it but then I think I might have been ironing at the same time. This reworking of Cyrano de Bergerac/Roxanne is an utterly undemanding, formulaic romcom rescued from straight-to-video ignominy on its release by the sharp turn of Janeane Garofalo. Playing the Frasier of Pets, she finds hersel...

------------------------ Cleaned Review----------------------:
liked think might ironing time reworking cyrano bergeracroxanne utterly undemanding formulaic romcom rescued straighttovideo ignominy release sharp turn janeane garofalo playing frasier pet find caught love trap insecurity lead pas best friend uma thurman caller come acourtinthis interesting film fascinating career ben chaplin average british actor gave hollywood treadmill shot film unremarkable anonymity studio production unsurprising basis although appeared substantial cameo later terence malick film uma thurman ditzy turn autopilot michael lehmann package together comp