In [1]:
import os
import re
import html
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used below (stopword list and WordNet data);
# quiet=True suppresses the downloader's progress output.
for resource_name in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource_name, quiet=True)

# English stopwords held in a set for fast membership tests in clean_review.
stop_words = set(stopwords.words("english"))
# Single shared lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

In [3]:
# --- Configuration ---
# Root folder of the dataset. Set the IMDB_DATA_DIR environment variable to
# point at a different location without editing the notebook; when it is not
# set, the original hard-coded path is used unchanged.
data_dir = os.environ.get("IMDB_DATA_DIR", r"C:\Users\bbuser\Desktop\aclImdb")
# Sub-folder of data_dir to read (passed as `subset` to load_imdb_with_rating).
subset = "train"
# Number of reviews requested per class (passed as `per_class`).
per_class_sample = 5000
# Seed passed to the loader as `seed`.
random_state = 42

In [4]:
# Folder that receives the cleaned CSV. Set the IMDB_OUT_DIR environment
# variable to redirect the output; when it is not set, the original
# hard-coded path is used unchanged.
out_dir = os.environ.get("IMDB_OUT_DIR", r"C:\Users\bbuser\Desktop\aclImdb_cleaned")
# Create the folder (and any missing parents); no error if it already exists.
os.makedirs(out_dir, exist_ok=True)
# The file name embeds the subset so different subsets do not share a file.
out_csv = os.path.join(out_dir, f"imdb_{subset}_cleaned_sample.csv")

In [5]:
# Pre-compiled patterns used by clean_review.
# Case-insensitive: "http://", "https://" or "www." followed by a run of
# non-whitespace characters, delimited by word boundaries.
url_pat = re.compile(r"(?i)\b(?:https?://|www\.)\S+\b")
# Case-insensitive: word/dot/hyphen characters, "@", word/dot/hyphen
# characters, a dot, then word characters.
email_pat = re.compile(r"(?i)\b[\w\.-]+@[\w\.-]+\.\w+\b")
# "<", one or more characters that are not ">", then ">".
html_tag_pat = re.compile(r"<[^>]+>")
# Any run of characters that are neither lowercase a-z nor whitespace.
non_az_pat = re.compile(r"[^a-z\s]+")

In [6]:
def clean_review(text: str) -> str:
    """
    Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order:
    1) lowercase
    2) unescape HTML entities, then replace HTML tags with a space
    3) replace urls & emails with a space
    4) replace everything outside [a-z] and whitespace with a space
    5) drop tokens of length <= 2 and stopwords
    6) lemmatize the remaining tokens
    7) keep only lemmas with len > 2 that are not stopwords

    Missing values (None or a float NaN) produce an empty string; any other
    non-string input is converted with str() before cleaning.
    """
    # NaN is the only float that is not equal to itself. Without this guard a
    # missing value would be stringified to "nan" and survive as a token.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    x = text.lower()
    x = html.unescape(x)
    x = html_tag_pat.sub(" ", x)
    x = url_pat.sub(" ", x)
    x = email_pat.sub(" ", x)
    x = non_az_pat.sub(" ", x)

    kept = []
    for tok in x.split():
        # Filter before lemmatizing so short tokens and stopwords never reach
        # the lemmatizer.
        if len(tok) <= 2 or tok in stop_words:
            continue
        lemma = lemmatizer.lemmatize(tok)
        # Lemmatization can shorten a token or turn it into a stopword, so the
        # same filters are applied again to the lemma.
        if len(lemma) > 2 and lemma not in stop_words:
            kept.append(lemma)
    return " ".join(kept)

In [7]:
def _extract_id_and_rating(filename: str):
    m = re.search(r"(\d+)_(\d+)\.txt$", filename)
    if not m:
        return None, None
    return int(m.group(1)), int(m.group(2))

def load_imdb_with_rating(base_dir: str | Path, subset: str = "train",
                          per_class: int = 5000, seed: int = 42) -> pd.DataFrame:
    """
    Load a random per-class sample of reviews from <base_dir>/<subset>/{pos,neg}.

    Parameters
    ----------
    base_dir : folder that contains the subset folder.
    subset   : name of the sub-folder to read (default "train").
    per_class: maximum number of reviews to draw from each class folder.
    seed     : seed for both the file sampling and the final row shuffle.

    Returns
    -------
    DataFrame with columns id, rating, label (1 = pos, 0 = neg) and text,
    shuffled and with a fresh 0..n-1 index. The columns are present even when
    no review was loaded.

    Raises
    ------
    FileNotFoundError if the pos or neg folder does not exist.
    """
    base = Path(base_dir) / subset
    pos_dir = base / "pos"
    neg_dir = base / "neg"
    if not pos_dir.is_dir() or not neg_dir.is_dir():
        raise FileNotFoundError(
            f"Expected folders:\n  {pos_dir}\n  {neg_dir}\nPlease check the path."
        )

    rng = np.random.RandomState(seed)
    rows = []

    for folder, label_val in [(pos_dir, 1), (neg_dir, 0)]:
        # os.listdir returns entries in arbitrary order, so sort them: otherwise
        # the same seed could select different files on different machines.
        # Names that do not parse are dropped BEFORE sampling so they cannot
        # silently shrink the sample below the requested size.
        candidates = []
        for fname in sorted(os.listdir(folder)):
            if not fname.endswith(".txt"):
                continue
            rid, rating = _extract_id_and_rating(fname)
            if rid is None:
                continue
            candidates.append((fname, rid, rating))

        # Clamp to [0, available] so an oversized or negative request is safe.
        k = max(0, min(per_class, len(candidates)))
        if k == 0:
            continue
        sel_idx = rng.choice(len(candidates), size=k, replace=False)

        for idx in sel_idx:
            fname, rid, rating = candidates[idx]
            with open(folder / fname, encoding="utf-8") as fh:
                txt = fh.read()
            rows.append({
                "id": rid,
                "rating": rating,
                "label": label_val,   # 1 = pos, 0 = neg
                "text": txt
            })

    # Fix the column set so an empty result still exposes the expected columns.
    df = pd.DataFrame(rows, columns=["id", "rating", "label", "text"])
    if df.empty:
        return df
    # Shuffle so the two classes are interleaved rather than stored back to back.
    return df.sample(frac=1.0, random_state=seed).reset_index(drop=True)

In [8]:
# Draw the per-class sample using the settings from the configuration cell.
df_sample = load_imdb_with_rating(
    data_dir,
    subset=subset,
    per_class=per_class_sample,
    seed=random_state,
)

# Sanity check: overall size and number of rows per label.
print("Shape:", df_sample.shape)
print("Class counts:\n", df_sample["label"].value_counts())

Shape: (10000, 4)
Class counts:
 label
0    5000
1    5000
Name: count, dtype: int64


In [9]:
# Clean every raw review element-wise and store the result next to the original text.
df_sample["cleaned_review"] = df_sample["text"].map(clean_review)

In [10]:
# Write the selected columns, in this order, without the index column.
df_sample.to_csv(
    out_csv,
    columns=["id", "rating", "label", "text", "cleaned_review"],
    index=False,
    encoding="utf-8",
)
print(f"[✓] Saved cleaned dataset -> {out_csv}")

[✓] Saved cleaned dataset -> C:\Users\bbuser\Desktop\aclImdb_cleaned\imdb_train_cleaned_sample.csv


In [11]:
# Show the first reviews before and after cleaning.
print("\n=== Before / After (5 examples) ===")
# Bound the loop by the frame length and index by position, so a frame with
# fewer than five rows (or a non-default index) does not raise.
for i in range(min(5, len(df_sample))):
    row = df_sample.iloc[i]
    # Truncate to 250 characters and flatten newlines to keep each preview on one line.
    raw = row["text"][:250].replace("\n", " ")
    cleaned = row["cleaned_review"][:250]
    print(f"\n#{i+1}")
    print("Raw:    ", raw)
    print("Cleaned:", cleaned)
    print("Label:  ", "pos" if row["label"] == 1 else "neg",
          "| Rating:", row["rating"])


=== Before / After (5 examples) ===

#1
Raw:     I saw this movie when it was released, and my distaste for it has stuck with me all these years. <br /><br />Here's why: <br /><br />Greenaway's goal seems to be to take every literary image in the Tempest and make it literal. If a character were to 
Cleaned: saw movie released distaste stuck year greenaway goal seems take every literary image tempest make literal character say heart take flight shown actual human heart pigeon wing attached flapping across screen process make lush tableau ultimately facil
Label:   neg | Rating: 3

#2
Raw:     In a series chock-full of brilliant episodes, this one stands out as one of my very favorites. It's not the most profound episode, there's no great meaning or message. But it's a lot of fun, and there are some fine performances.<br /><br />But what m
Cleaned: series chock full brilliant episode one stand one favorite profound episode great meaning message lot fun fine performance make really stand 

In [12]:
# Bare last expression: the notebook renders the frame as this cell's output.
df_sample

Unnamed: 0,id,rating,label,text,cleaned_review
0,4083,3,0,"I saw this movie when it was released, and my ...",saw movie released distaste stuck year greenaw...
1,4753,10,1,"In a series chock-full of brilliant episodes, ...",series chock full brilliant episode one stand ...
2,12239,10,1,"Excellent farce! Which, of course, is all it i...",excellent farce course intended thankfully nei...
3,8664,8,1,"Steely, powerful gangster supreme Frankie Diom...",steely powerful gangster supreme frankie diome...
4,1470,10,1,There's the danger with the critic/philosopher...,danger critic philosopher slavoj zizek film di...
...,...,...,...,...,...
9995,10811,1,0,This is one of the most god-awful movies ever....,one god awful movie ever shaq better stick bas...
9996,4976,1,0,I watched Grendel the other night and am compe...,watched grendel night compelled put together p...
9997,4640,3,0,Hunters chase what they think is a man through...,hunter chase think man forest though audience ...
9998,10220,7,1,That hilarious line is typical of what these n...,hilarious line typical naughty sister say funn...


In [13]:
# Re-export the full frame to the path derived from out_dir and subset
# (out_csv). Reusing it instead of a second hard-coded literal keeps this
# cell from overwriting the "train" file when a different subset is loaded.
out_path = out_csv
df_sample.to_csv(out_path, index=False, encoding="utf-8")
print(f"Saved to: {out_path}")

Saved to: C:\Users\bbuser\Desktop\aclImdb_cleaned\imdb_train_cleaned_sample.csv
