In [6]:
import os
import pandas as pd
import random

# 1- Data Loading

-IMDb Movie Reviews Dataset

In [16]:
data_dir = "Data/aclImdb"


-Dataset contains 10,000 labeled movie reviews (positive/negative).

In [17]:
def load_imdb_with_rating(base_dir, subset="train", sample_size=5000):
    rows = []
    
    for label_type in ["pos", "neg"]:
        folder = os.path.join(base_dir, subset, label_type)
        files = os.listdir(folder)
        
       
        
        sample_files = random.sample(files, sample_size)
        
        for fname in sample_files:
            
            file_id, rating_str = fname.split("_")
            rating = int(rating_str.split(".")[0]) 
            
            with open(os.path.join(folder, fname), "r", encoding="utf-8") as f:
                text = f.read()
            
           
            rows.append({
                "id": int(file_id),
                "rating": rating,
                "txt": text,
                "label": 1 if label_type == "pos" else 0
            })
    
    return pd.DataFrame(rows)


In [18]:
df_sample = load_imdb_with_rating(data_dir, subset="train", sample_size=5000)

print("Shape:", df_sample.shape)
df_sample.head()


Shape: (10000, 4)


Unnamed: 0,id,rating,txt,label
0,5903,8,I liked this movie sort of reminded me of my m...,1
1,194,8,Perhaps the funniest 'backstage at Hollywood' ...,1
2,5211,8,Since their nasty divorce from the Disney Comp...,1
3,7176,10,OK - you want to test somebody on how comforta...,1
4,5754,9,I remember seeing this one when I was seven or...,1


# 2- Cleaning Steps

-We need some libraries to start cleaning 

In [21]:
!pip install nltk


Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 3.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.0 MB/s eta 0:00:00
Downloading regex-2025.7.34-cp312-cp312-win_amd64.whl (275 kB)
Using cached joblib-1.5.1-py3-none-any.whl (307 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, joblib, nltk
Successfully installed joblib-1.5.1 nltk-3.9.1 regex-2025.7.34 tqdm-4.6


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: C:\Users\bbuser\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


In [25]:
import sys
!{sys.executable} -m pip install nltk


Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2025.7.34-cp313-cp313-win_amd64.whl (275 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, nltk

   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ---------------------------------------- 0/3 [tqdm]
   ------------- -------------------------- 1/3 [regex]
   ------------- -------------------------- 1/3 [regex]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   ---------

In [26]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...


True

#### explanation for some language resources:
##### download('stopwords') --> We use it to remove stopwords
##### download('wordnet') --> Needed for lemmatization (reducing words to their base form)
##### download('omw-1.4') --> Additional linguistic data that helps lemmatizer handle words better

In [27]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


### What is happening here?
##### Loads the English stopword list from NLTK --> Converts it into a Python set for faster lookup
##### Creates lemmatizer object from WordNet --> Lemmatization = reducing a word to its base 

# let's explain what we do for the cleaning step:
##### Convert text to "lowercase"(1). Next, remove HTML tags using "BeautifulSoup"(2). After that remove "urls" and emails(3). Moving to, remove punctuation, numbers, emojis(4). Then we will spilt the sentences to the sprite words using "tokenize"(5). Moreover, remove "stopwords"(6).Before the final points, we will return the word to its origin using "lemmatize"(7). Finally, keep words longer than 2 chars(8).

In [28]:
def clean_review(text):
    # 1. 
    text = text.lower()
    
    # 2. 
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # 3. 
    text = re.sub(r'http\S+|www\S+|https\S+|[\w\.-]+@[\w\.-]+', '', text)
    
    # 4. 
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 5. 
    tokens = text.split()
    
    # 6. 
    tokens = [w for w in tokens if w not in stop_words]
    
    # 7. 
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    # 8. 
    tokens = [w for w in tokens if len(w) > 2]
    
    return " ".join(tokens)


# 3- Apply Cleaning 

- aAdd a new column cleaned_review 

In [29]:
df_sample["cleaned_review"] = df_sample["txt"].apply(clean_review)



# 4- Evaluation

In [30]:
df_sample[["txt", "cleaned_review"]].head(10)


Unnamed: 0,txt,cleaned_review
0,I liked this movie sort of reminded me of my m...,liked movie sort reminded marriage clean see f...
1,Perhaps the funniest 'backstage at Hollywood' ...,perhaps funniest backstage hollywood movie eve...
2,Since their nasty divorce from the Disney Comp...,since nasty divorce disney company disney keep...
3,OK - you want to test somebody on how comforta...,want test somebody comfortable adolescence emb...
4,I remember seeing this one when I was seven or...,remember seeing one seven eight must found cha...
5,As an old white housewife I can still apprecia...,old white housewife still appreciate laurence ...
6,James Cagney is best known for his tough chara...,james cagney best known tough character gangst...
7,"If I could go back, even as an adult and reliv...",could back even adult relive day summer spent ...
8,This was a typical grade B movie in 1940s Holl...,typical grade movie hollywood yet succeeded wa...
9,Red Rock West is a perfect example of how good...,red rock west perfect example good film practi...


In [32]:
df_sample

Unnamed: 0,id,rating,txt,label,cleaned_review
0,5903,8,I liked this movie sort of reminded me of my m...,1,liked movie sort reminded marriage clean see f...
1,194,8,Perhaps the funniest 'backstage at Hollywood' ...,1,perhaps funniest backstage hollywood movie eve...
2,5211,8,Since their nasty divorce from the Disney Comp...,1,since nasty divorce disney company disney keep...
3,7176,10,OK - you want to test somebody on how comforta...,1,want test somebody comfortable adolescence emb...
4,5754,9,I remember seeing this one when I was seven or...,1,remember seeing one seven eight must found cha...
...,...,...,...,...,...
9995,12250,2,"Upon viewing Tobe Hooper's gem, Crocodile, in ...",0,upon viewing tobe hoopers gem crocodile develo...
9996,1686,1,Imagine that you are asked by your date what m...,0,imagine asked date movie wanted see remember s...
9997,8252,3,Whattt was with the sound? It sounded like it ...,0,whattt sound sounded like dubbedotherwise bad ...
9998,6290,3,Recap: Ron is about to marry Mel. They are dee...,0,recap ron marry mel deeply love certain perfec...


## Save a cleaned version 

In [33]:
df_sample.to_csv("imdb_cleaned_sample.csv", index=False)
