# Movie Reviews

As a Data Scientist, I want to clean and preprocess IMDb movie reviews,
 So that the text is standardized, noise-free, and ready for sentiment analysis.


#### imports

In [18]:
import pandas as pd
import os, re, html, random
from pathlib import Path

import sys
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk import pos_tag

In [5]:
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

#### Data Loading

In [6]:
data_dir = r"C:/Users/bbuser/Desktop/aclImdb"

def load_imdb_with_rating_v2(base_dir, subset="train", sample_size=5000):
    data_records = []

    categories = {"pos": 1, "neg": 0}  # map labels to 1/0
    
    for label, label_value in categories.items():
        path = os.path.join(base_dir, subset, label)
        
        # list all files in pos/neg folder
        all_files = os.listdir(path)
        
        # adjust if sample_size is larger than available
        if sample_size > len(all_files):
            sample_size = len(all_files)
        
        filenames = random.sample(all_files, sample_size)

        for filename in filenames:
            # filename looks like "12345_7.txt"
            file_id, rating_str = filename.split("_")
            rating = int(rating_str.split(".")[0])  # extract rating number
            
            file_path = os.path.join(path, filename)
            with open(file_path, encoding="utf-8") as f:
                review_text = f.read()
            
            data_records.append({
                "id": int(file_id),
                "rating": rating,
                "txt": review_text,
                "label": label_value
            })

    return pd.DataFrame(data_records)


In [7]:
df_subset = load_imdb_with_rating_v2(data_dir, subset="train", sample_size=5000)
print(f"Shape: {df_subset.shape}")
print("Unique labels:", df_subset["label"].unique())
df_subset.head()

Shape: (10000, 4)
Unique labels: [1 0]


Unnamed: 0,id,rating,txt,label
0,4592,9,Jackie Chan's Police Story is a landmark film ...,1
1,4881,9,"""Kaabee"" depicts the hardship of a woman in pr...",1
2,10634,10,In order to stop her homosexual friend Albert ...,1
3,11075,10,"Having read the reviews for this film, I under...",1
4,2368,8,"The plot:Kurt Harris (Jeff Wincott), a bitter,...",1


In [8]:
df_subset['label'].unique()

array([1, 0])

In [9]:
df_subset.head()

Unnamed: 0,id,rating,txt,label
0,4592,9,Jackie Chan's Police Story is a landmark film ...,1
1,4881,9,"""Kaabee"" depicts the hardship of a woman in pr...",1
2,10634,10,In order to stop her homosexual friend Albert ...,1
3,11075,10,"Having read the reviews for this film, I under...",1
4,2368,8,"The plot:Kurt Harris (Jeff Wincott), a bitter,...",1


In [10]:
df_subset.tail()

Unnamed: 0,id,rating,txt,label
9995,209,1,I could never stand watching Happy Days after ...,0
9996,1602,2,Anyone who thinks this film has not been appre...,0
9997,10906,2,"Redundant, but again the case. If you enjoy th...",0
9998,4398,2,Tim Robbins and John Cusack are two actors I h...,0
9999,6647,2,"A drama at its very core, ""Anna"" displays that...",0


In [11]:
# sanity check for me 

data_dir = r"C:/Users/bbuser/Desktop/aclImdb"
subset = "train"

# Paths to positive and negative folders
pos_path = os.path.join(data_dir, subset, "pos")
neg_path = os.path.join(data_dir, subset, "neg")

# Check if the folders exist
print("Positive folder exists:", os.path.exists(pos_path))
print("Negative folder exists:", os.path.exists(neg_path))

# Check how many files are inside
print("Positive reviews:", len(os.listdir(pos_path)) if os.path.exists(pos_path) else "MISSING")
print("Negative reviews:", len(os.listdir(neg_path)) if os.path.exists(neg_path) else "MISSING")


Positive folder exists: True
Negative folder exists: True
Positive reviews: 12500
Negative reviews: 12500


#### Cleaning Steps

In [12]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# --- 3) CLEANING UTILITIES ----------------------------------------------------

# stopwords, but keep negations (important for sentiment)
EN_STOP = set(stopwords.words("english"))
for keep in ("not", "no", "nor"):
    EN_STOP.discard(keep)

# contraction patterns
CONTRACTIONS = [
    (r"n['’]t\b", " not"),
    (r"['’]re\b", " are"),
    (r"['’]s\b", ""),
    (r"['’]ve\b", " have"),
    (r"['’]ll\b", " will"),
    (r"['’]d\b", " would"),
    (r"['’]m\b", " am"),
]
def expand_contractions(text: str) -> str:
    for pat, repl in CONTRACTIONS:
        text = re.sub(pat, repl, text)
    return text

# regex patterns
RE_HTML = re.compile(r"<[^>]+>")
RE_URL  = re.compile(r"https?://\S+|www\.\S+")
RE_EMAIL= re.compile(r"\b[\w\.-]+@[\w\.-]+\.\w+\b")
RE_NUM  = re.compile(r"\b\d+\b")
RE_NON_ALPHA = re.compile(r"[^a-z\s]")

# lemmatizer
WN_LEMMA = WordNetLemmatizer()
def _to_wordnet_pos(tag: str):
    if tag.startswith("J"): return ADJ
    if tag.startswith("V"): return VERB
    if tag.startswith("N"): return NOUN
    if tag.startswith("R"): return ADV
    return NOUN


In [14]:
# --- 4) CLEAN REVIEW FUNCTION -------------------------------------------------
def clean_review(text: str, min_token_len: int = 3) -> str:
    """
    Cleans one review string according to task criteria.
    """
    if not isinstance(text, str):
        return ""

    # 1) lowercase
    text = text.lower()

    # 2) decode HTML + strip tags
    text = html.unescape(text)
    text = RE_HTML.sub(" ", text)

    # 3) expand contractions
    text = expand_contractions(text)

    # 4) remove urls, emails, numbers
    text = RE_URL.sub(" ", text)
    text = RE_EMAIL.sub(" ", text)
    text = RE_NUM.sub(" ", text)

    # 5) drop punctuation/emoji (keep a-z + space)
    text = RE_NON_ALPHA.sub(" ", text)

    # 6) tokenize
    tokens = text.split()
    if not tokens:
        return ""

    # 7) remove stopwords (negations kept)
    tokens = [t for t in tokens if t not in EN_STOP]
    if not tokens:
        return ""

    # 8) POS-tag + lemmatize
    tagged = pos_tag(tokens)
    lemmas = [WN_LEMMA.lemmatize(w, _to_wordnet_pos(t)) for w, t in tagged]

    # 9) filter very short tokens
    lemmas = [w for w in lemmas if len(w) >= min_token_len]

    return " ".join(lemmas)


In [15]:
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# --- 5) APPLY CLEANING --------------------------------------------------------
df_subset["cleaned_review"] = df_subset["txt"].apply(clean_review)

# check if any empty outputs
print("Empty cleaned rows:", (df_subset["cleaned_review"].str.len() == 0).sum())
df_subset[["txt", "cleaned_review"]].head()

Empty cleaned rows: 0


Unnamed: 0,txt,cleaned_review
0,Jackie Chan's Police Story is a landmark film ...,jackie chan police story landmark film honk ko...
1,"""Kaabee"" depicts the hardship of a woman in pr...",kaabee depicts hardship woman pre wwii raise k...
2,In order to stop her homosexual friend Albert ...,order stop homosexual friend albert perry king...
3,"Having read the reviews for this film, I under...",read review film understandably start watch gr...
4,"The plot:Kurt Harris (Jeff Wincott), a bitter,...",plot kurt harris jeff wincott bitter cop under...


#### Apply Cleaning

In [20]:
# --- 6) BEFORE / AFTER EXAMPLES ----------------------------------------------
RANDOM_STATE = 42   # you can pick any integer

examples = df_subset.sample(5, random_state=RANDOM_STATE)[["txt", "cleaned_review"]]
for i, row in examples.reset_index(drop=True).iterrows():
    print(f"\n--- EXAMPLE {i+1} ---")
    print("RAW:    ", row["txt"][:500].replace("\n"," "))
    print("CLEANED:", row["cleaned_review"])


--- EXAMPLE 1 ---
RAW:     I picked this one up because the music was done by Hans Zimmer, a customer of Metasonix modular synths (made by someone dear to me). The jacket art says "the 2003 version".<br /><br />I give it one point for a strong female, one point for cheezy dialog and one last point for meg foster's light blue eyes, of which there are plenty of shots of.<br /><br />It was fun seeing David MacCullum casually swimming (the pool has a plexiglass viewing window!), while his lady love was being chased by a psych
CLEANED: pick one music han zimmer customer metasonix modular synths make someone dear jacket art say version give one point strong female one point cheezy dialog one last point meg foster light blue eye plenty shot fun see david maccullum casually swim pool plexiglas view window lady love chase psycho greece set marginally impressive rich people house mendanassos castle find wonder able keep cleaning dust blow around wind not fierce enough believable keep think anim

In [21]:
# --- 7) SAVE CLEANED DATASET --------------------------------------------------
OUT_PATH = Path("C:/Users/bbuser/Desktop/DataScience-Brain-Bytes-1/Team_members/from_arwa/data/cleaned_reviews.csv")
cols = ["id", "rating", "label", "txt", "cleaned_review"]
df_subset[cols].to_csv(OUT_PATH, index=False, encoding="utf-8")
print(f"Saved to {OUT_PATH.resolve()}")

Saved to C:\Users\bbuser\Desktop\DataScience-Brain-Bytes-1\Team_members\from_arwa\data\cleaned_reviews.csv


In [24]:
cleaned_reviews = pd.read_csv(r"C:\Users\bbuser\Desktop\DataScience-Brain-Bytes-1\Team_members\from_arwa\data\cleaned_reviews.csv")

In [25]:
cleaned_reviews.head()

Unnamed: 0,id,rating,label,txt,cleaned_review
0,4592,9,1,Jackie Chan's Police Story is a landmark film ...,jackie chan police story landmark film honk ko...
1,4881,9,1,"""Kaabee"" depicts the hardship of a woman in pr...",kaabee depicts hardship woman pre wwii raise k...
2,10634,10,1,In order to stop her homosexual friend Albert ...,order stop homosexual friend albert perry king...
3,11075,10,1,"Having read the reviews for this film, I under...",read review film understandably start watch gr...
4,2368,8,1,"The plot:Kurt Harris (Jeff Wincott), a bitter,...",plot kurt harris jeff wincott bitter cop under...


#### Evaluation

In [26]:
# --- 8) TASK EXAMPLE CHECK ----------------------------------------------------
sample_text = 'I LOVED this movie!! <br /><br /> It was amazing :) 10/10'
print("Raw:", sample_text)
print("Cleaned:", clean_review(sample_text))

Raw: I LOVED this movie!! <br /><br /> It was amazing :) 10/10
Cleaned: love movie amaze


#### Baseline Classifier (TF-IDF + Logistic Regression)

In [27]:
# --- 9) OPTIONAL: BASELINE MODEL ---------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Features (X) and target (y)
X = df_subset["cleaned_review"].values
y = df_subset["label"].values  # 1 = positive, 0 = negative

# Stratified split: keep class balance in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Text -> features: TF-IDF with uni+bi-grams, light pruning
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams + bigrams capture short phrases/negations
    min_df=3,             # ignore very rare terms
    max_df=0.9            # drop extremely frequent boilerplate
)

Xtr = tfidf.fit_transform(X_train)
Xte = tfidf.transform(X_test)

# Linear model baseline
clf = LogisticRegression(
    max_iter=1000,        # allow convergence
    n_jobs=None,          # (param ignored for some solvers; fine to leave)
    C=1.0                 # regularization strength (can tune)
)
clf.fit(Xtr, y_train)

# Evaluation
pred = clf.predict(Xte)
acc = accuracy_score(y_test, pred)
print("Accuracy:", round(acc, 4))
print("\nClassification report:\n", classification_report(y_test, pred, digits=3))

cm = confusion_matrix(y_test, pred)
print("\nConfusion matrix:\n", cm)


Accuracy: 0.872

Classification report:
               precision    recall  f1-score   support

           0      0.891     0.848     0.869      1000
           1      0.855     0.896     0.875      1000

    accuracy                          0.872      2000
   macro avg      0.873     0.872     0.872      2000
weighted avg      0.873     0.872     0.872      2000


Confusion matrix:
 [[848 152]
 [104 896]]


In [31]:
test = "I LOVED movies that were better than others, but I wasn't impressed."
print(clean_review(test))

love movie well others not impressed


trying to find a way to up the accuracy and BERT came up, will test it below. however i need to change my env to make sure python -v matches pytorch

In [29]:
# ==== DistilBERT fine-tuning on raw text (standalone cell) ====================
# (Optional) first time installs:
# !pip install -U transformers torch scikit-learn --quiet

import os, random, numpy as np, pandas as pd, torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

# ---- 0) Pull the DataFrame (expects columns: 'txt', 'label') -----------------
df_candidates = [globals().get("df_subset"), globals().get("df")]
df = next((d for d in df_candidates
           if isinstance(d, pd.DataFrame) and {"txt","label"}.issubset(d.columns)), None)
if df is None:
    raise ValueError("No DataFrame with columns {'txt','label'} found. "
                     "Make sure you have df_subset or df with these columns.")

# ---- 1) Reproducibility & device --------------------------------------------
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ---- 2) Train/test split -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df["txt"].values, df["label"].values,
    test_size=0.20, random_state=SEED, stratify=df["label"].values
)

# ---- 3) Tokenizer & PyTorch Dataset wrapper ---------------------------------
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class TextClfDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.enc = tokenizer(
            list(texts), padding=True, truncation=True, max_length=max_len, return_tensors="pt"
        )
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return self.labels.size(0)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.enc.items()}
        item["labels"] = self.labels[idx]
        return item

train_ds = TextClfDataset(X_train, y_train, tokenizer)
test_ds  = TextClfDataset(X_test,  y_test,  tokenizer)

# ---- 4) Model & TrainingArguments -------------------------------------------
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Modest batch sizes for CPU; raise if you have a GPU
train_bs = 16 if device == "cuda" else 8
eval_bs  = 32 if device == "cuda" else 16

args = TrainingArguments(
    output_dir="./distilbert-imdb",
    learning_rate=2e-5,
    per_device_train_batch_size=train_bs,
    per_device_eval_batch_size=eval_bs,
    num_train_epochs=3,          # 2–4 is typical for IMDB
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    seed=SEED
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"accuracy": (preds == labels).mean()}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# ---- 5) Train & Evaluate -----------------------------------------------------
trainer.train()
eval_metrics = trainer.evaluate()
print("\n=== DistilBERT Eval (Trainer) ===")
for k, v in eval_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# Detailed report
pred_out = trainer.predict(test_ds)
y_pred = np.argmax(pred_out.predictions, axis=1)

print("\n=== DistilBERT Detailed Report ===")
print("Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# (Optional) Save final model + tokenizer for later inference
# trainer.save_model("./distilbert-imdb/best")
# tokenizer.save_pretrained("./distilbert-imdb/best")


ModuleNotFoundError: No module named 'torch'