In [1]:
import os
import pandas as pd
import random

In [4]:
data_dir = r"C:\Users\bbuser\Downloads\aclImdb_v1\aclImdb"

In [5]:
def load_imdb_with_rating(base_dir, subset="train", sample_size=5000):
    rows = []
    
    for label_type in ["pos", "neg"]:
        folder = os.path.join(base_dir, subset, label_type)
        files = os.listdir(folder)
        
       
        
        sample_files = random.sample(files, sample_size)
        
        for fname in sample_files:
            
            file_id, rating_str = fname.split("_")
            rating = int(rating_str.split(".")[0]) 
            
            with open(os.path.join(folder, fname), "r", encoding="utf-8") as f:
                text = f.read()
            
           
            rows.append({
                "id": int(file_id),
                "rating": rating,
                "txt": text,
                "label": 1 if label_type == "pos" else 0
            })
    
    return pd.DataFrame(rows)

In [6]:
df_sample = load_imdb_with_rating(data_dir, subset="train", sample_size=5000)

print("Shape:", df_sample.shape)
df_sample.head()


Shape: (10000, 4)


Unnamed: 0,id,rating,txt,label
0,4506,10,I want to add to the praise for the production...,1
1,537,10,To me A Matter of Life and Death is just that-...,1
2,6577,8,"With few exceptions, most of George Bernard Sh...",1
3,10992,10,"Damn, I've seen this movie for at least 4 time...",1
4,324,8,I didn't expect to like this film as much as I...,1


In [7]:
!pip install nltk



In [8]:
import sys
!{sys.executable} -m pip install nltk



In [9]:
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...


True

In [10]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

In [11]:
def clean_review(text):
     
    text = text.lower()
    
    text = BeautifulSoup(text, "html.parser").get_text()
    
    text = re.sub(r'http\S+|www\S+|https\S+|[\w\.-]+@[\w\.-]+', '', text)
    
    text = re.sub(r'[^a-z\s]', '', text)
    
    tokens = text.split()
    
    tokens = [w for w in tokens if w not in stop_words]
    
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    
    tokens = [w for w in tokens if len(w) > 2]
    
    return " ".join(tokens)

In [12]:
df_sample["cleaned_review"] = df_sample["txt"].apply(clean_review)

  text = BeautifulSoup(text, "html.parser").get_text()


In [13]:
df_sample[["txt", "cleaned_review"]].head(10)

Unnamed: 0,txt,cleaned_review
0,I want to add to the praise for the production...,want add praise production film especially lum...
1,To me A Matter of Life and Death is just that-...,matter life death simply best film ever madefr...
2,"With few exceptions, most of George Bernard Sh...",exception george bernard shaw play virtually d...
3,"Damn, I've seen this movie for at least 4 time...",damn ive seen movie least time still dont get ...
4,I didn't expect to like this film as much as I...,didnt expect like film much got simply saw lis...
5,Although it's not as creepy as it's cult class...,although creepy cult classic predecessor zombi...
6,"first, someone mentioned here that because thi...",first someone mentioned released limited quant...
7,"As a kid, I loved this game. I played it a zil...",kid loved game played zillion time spring frie...
8,******* SPOILER! ********<br /><br />i saw thi...,spoiler saw film year back lovely story young ...
9,"""Moonstruck"" is one of the best films ever. I ...",moonstruck one best film ever film dvd movie d...


In [14]:
df_sample

Unnamed: 0,id,rating,txt,label,cleaned_review
0,4506,10,I want to add to the praise for the production...,1,want add praise production film especially lum...
1,537,10,To me A Matter of Life and Death is just that-...,1,matter life death simply best film ever madefr...
2,6577,8,"With few exceptions, most of George Bernard Sh...",1,exception george bernard shaw play virtually d...
3,10992,10,"Damn, I've seen this movie for at least 4 time...",1,damn ive seen movie least time still dont get ...
4,324,8,I didn't expect to like this film as much as I...,1,didnt expect like film much got simply saw lis...
...,...,...,...,...,...
9995,2214,4,"this movie wasn't absolutely atrocious, but it...",0,movie wasnt absolutely atrocious pretty bad ac...
9996,7842,1,Caught this by accident on a t.v. showing - an...,0,caught accident showing could hardly believe u...
9997,890,2,I saw this regurgitated pile of vignettes toni...,0,saw regurgitated pile vignette tonight preview...
9998,516,2,Ever notice how in his later movies Burt Reyno...,0,ever notice later movie burt reynolds laugh so...


In [15]:
df_sample.to_csv("imdb_movie_cleaned.csv", index=False)