# IMDb Movie Reviews

### 📦 1. Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### 📁 2. Load Reviews from Files

In [3]:
import pandas as pd
import glob

def load_reviews_from_dir(directory, label):
    files = glob.glob(directory + "/*.txt")
    data = []
    for f in files:
        with open(f, encoding="utf-8") as file:
            text = file.read()
            data.append([text, label])
    return pd.DataFrame(data, columns=['review', 'sentiment'])

train_pos = "aclImdb_v1/aclImdb/train/pos"
train_neg = "aclImdb_v1/aclImdb/train/neg"


df_train = pd.concat([
    load_reviews_from_dir(train_pos, 'positive'),
    load_reviews_from_dir(train_neg, 'negative')
], ignore_index=True)


> **Explanation:**
This function reads all text files from a specified directory (positive or negative IMDB reviews), labels them accordingly, and returns a single DataFrame. Both positive and negative reviews are concatenated into one training DataFrame.

In [5]:
df_train.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Homelessness (or Houselessness as George Carli...,positive
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive
3,This is easily the most underrated film inn th...,positive
4,This is not the typical Mel Brooks film. It wa...,positive


In [7]:
df_train.columns

Index(['review', 'sentiment'], dtype='object')

In [8]:
df_train.index

RangeIndex(start=0, stop=25000, step=1)

### 📊 3. Sample a Subset of Reviews

In [9]:
df_sample = df_train.sample(n=10000, random_state=42).reset_index(drop=True)

> **Explanation:**
To improve processing speed, a random sample of 10,000 reviews is taken from the training set using a fixed random seed for reproducibility.

### 🧹 4. Clean and Preprocess Text

In [12]:
import string

def clean_review(text):
    # Convert to lowercase
    text = text.lower()

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs and emails
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove punctuation, numbers, and non-letters
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize
    tokens = text.split()

    # Remove stopwords & short tokens, lemmatize
    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words and len(word) > 2
    ]

    # Join tokens back to string
    return " ".join(cleaned_tokens)


> **Explanation:**
The `clean_review` function performs several preprocessing steps:
> - Lowercasing
> - Removing HTML, URLs, and emails
> - Removing punctuation and digits
> - Tokenization
> - Stopword removal and lemmatization

### 🧼 5. Apply Cleaning to Dataset

In [13]:
df_sample['cleaned_review'] = df_sample['review'].apply(clean_review)

> **Explanation:**
The cleaning function is applied to every review in the sample dataset to create a new column, `cleaned_review`.

### 👀 6. Preview Cleaned Data

In [14]:
df_sample[['review', 'cleaned_review']].head()

Unnamed: 0,review,cleaned_review
0,In Panic In The Streets Richard Widmark plays ...,panic street richard widmark play navy doctor ...
1,If you ask me the first one was really better ...,ask first one really better one look sarah rea...
2,I am a big fan a Faerie Tale Theatre and I've ...,big fan faerie tale theatre ive seen one best ...
3,I just finished reading a book about Dillinger...,finished reading book dillinger movie horribly...
4,Greg Davis and Bryan Daly take some crazed sta...,greg davis bryan daly take crazed statement te...


### 💾 7. Save Cleaned Data to CSV

In [15]:
df_sample.to_csv("cleaned_imdb_sample.csv", index=False)