In [1]:
import os
# Sanity check of the working tree: the first listing should contain "aclImdb",
# the second should contain the "train" and "test" sub-folders.
for folder in (".", "aclImdb"):
    print(os.listdir(folder))


['.ipynb_checkpoints', 'aclImdb', 'aclImdb_v1 (1).tar.gz', 'baboon.png', 'Binary Classification with Both Algorithms.ipynb', 'Color Enhancement.ipynb', 'data', 'Data Dictionary - carprices.xlsx', 'diabetes.csv', 'draft.ipynb', 'Fish Weight Modeling for Market Insights.ipynb', 'Fish.csv', 'flat_unclean_data.ipynb', 'goldhill.bmp', 'Image Processing Basics (Grayscale Conversion and Filtering).ipynb', 'IMAGE processing.ipynb', 'Image Sharpening for Medical Imaging.ipynb', 'Image Transformation on baboon.png.ipynb', 'IMAGE.jpg', 'image3.jpg', 'IMD.ipynb', 'IMDB_cleaned_train.csv', 'Introduction_to_Histogram.ipynb', 'Iris Multi-class Classification with K-NN.ipynb', 'Keyword-Based News Classification Model.ipynb', 'laptop unclean data.ipynb', 'lenna.png', 'News Search Engine.ipynb', 'News_Category_Dataset_v3.json', 'news_index.csv', 'news_tfidf_matrix.npz', 'news_tfidf_vectorizer.joblib', 'numpy_task.ipynb', 'python dubizzle_scraper.ipynb', 'Regression Analysis.ipynb', 'Sales Data Simulatio

In [2]:
import os
import re
import tarfile
import pandas as pd
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch (quietly) the NLTK resources this notebook loads below: the stop-word
# list, the punkt tokenizer data and WordNet for lemmatization.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nltk.download(resource, quiet=True)

# Shared, module-level helpers reused by the cleaning function.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


def safe_extract(tar, path="."):
    """Extract all members of an open tar archive under ``path``, rejecting path traversal.

    Every member name is resolved against ``path`` before anything is written.
    If any member would land outside ``path`` the whole extraction is aborted.

    Args:
        tar: An open ``tarfile.TarFile`` object.
        path: Destination directory (defaults to the current directory).

    Raises:
        Exception: "Attempted Path Traversal in Tar File" when a member
            resolves to a location outside ``path``.
    """
    base_dir = os.path.abspath(path)
    for member in tar.getmembers():
        member_path = os.path.abspath(os.path.join(path, member.name))
        try:
            # commonpath compares whole path components. The previous
            # commonprefix check was character-based, so a sibling such as
            # "<base_dir>_evil/x" shared the prefix and slipped through.
            inside = os.path.commonpath([base_dir, member_path]) == base_dir
        except ValueError:
            # Raised e.g. when the two paths are on different Windows drives;
            # such a member can never be inside base_dir.
            inside = False
        if not inside:
            raise Exception("Attempted Path Traversal in Tar File")

    # The name check above does not validate symlink/hardlink targets. Where
    # the interpreter ships tarfile extraction filters, also opt in to the
    # "data" filter, which rejects links and special files pointing outside.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path, filter="data")
    else:
        tar.extractall(path)

# Unpack the IMDB archive once: only when the tarball is present and the
# "aclImdb" folder has not been created yet.
# NOTE(review): the directory listing printed by the first cell shows
# 'aclImdb_v1 (1).tar.gz', which differs from the name used here — confirm.
dataset_path = "aclImdb"
tar_file = "aclImdb_v1.tar.gz"

if not os.path.exists(dataset_path) and os.path.exists(tar_file):
    print("Extracting dataset...")
    with tarfile.open(tar_file, "r:gz") as archive:
        safe_extract(archive, ".")
    print("Extraction completed!")


# Cleaning Function
def clean_review(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Pipeline: lowercase -> strip HTML with BeautifulSoup -> drop URLs and
    e-mail addresses -> replace every character other than a-z/whitespace with
    a space -> NLTK word tokenization -> drop English stop words and tokens of
    two characters or fewer -> WordNet lemmatization.

    Args:
        text: The raw review. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Remove HTML tags safely
    # NOTE(review): the captured cell output shows BeautifulSoup warning on this
    # line for some inputs — presumably plain text that does not look like
    # markup; confirm before silencing it.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove URLs, emails.
    # The dot after "www" is escaped: unescaped it matched ANY character
    # (including a space), so "www is great" lost the following word "is".
    text = re.sub(r'http\S+|www\.\S+|[\w\.-]+@[\w\.-]+', '', text)

    # Remove punctuation, numbers, emojis
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords and short tokens
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(tokens)

# Load Dataset

def load_imdb_dataset(path, subset="train", sample_size=None):
    """Read the review files of one dataset split into a DataFrame.

    Looks in ``<path>/<subset>/pos`` and ``<path>/<subset>/neg`` (in that
    order) and reads every regular file as UTF-8 text.

    Args:
        path: Root folder of the extracted dataset.
        subset: Name of the split sub-folder, e.g. ``"train"`` or ``"test"``.
        sample_size: When truthy, return a random sample of this many rows
            (fixed ``random_state=42``) with a fresh 0..n-1 index; otherwise
            return every review.

    Returns:
        pandas.DataFrame: Columns ``review`` (file contents) and ``sentiment``
        (``"pos"`` or ``"neg"``).

    Raises:
        FileNotFoundError: If a label folder does not exist.
        ValueError: From pandas, if ``sample_size`` exceeds the number of rows.
    """
    data, labels = [], []
    for label in ["pos", "neg"]:
        folder = os.path.join(path, subset, label)
        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting fixes the row order so the seeded sample below selects the
        # same reviews on every machine.
        for fname in sorted(os.listdir(folder)):
            file_path = os.path.join(folder, fname)
            if os.path.isfile(file_path):
                with open(file_path, encoding="utf-8") as f:
                    data.append(f.read())
                labels.append(label)

    df = pd.DataFrame({"review": data, "sentiment": labels})

    if sample_size:
        df = df.sample(sample_size, random_state=42).reset_index(drop=True)

    return df


# --- Training split: a 10,000-review sample, cleaned and written to CSV ---
df_train = load_imdb_dataset(dataset_path, subset="train", sample_size=10000)
df_train = df_train.assign(cleaned_review=df_train["review"].apply(clean_review))

# Quick before/after preview of the first five rows.
print(df_train.loc[:, ["review", "cleaned_review"]].head())

df_train.to_csv("IMDB_cleaned_train.csv", index=False)

# --- Test split: every review, cleaned and written to CSV ---
df_test = load_imdb_dataset(dataset_path, subset="test")
df_test = df_test.assign(cleaned_review=df_test["review"].apply(clean_review))
df_test.to_csv("IMDB_cleaned_test.csv", index=False)

print("✅ Cleaned datasets saved: IMDB_cleaned_train.csv & IMDB_cleaned_test.csv")


  text = BeautifulSoup(text, "html.parser").get_text(separator=" ")


                                              review  \
0  In Panic In The Streets Richard Widmark plays ...   
1  If you ask me the first one was really better ...   
2  I am a big fan a Faerie Tale Theatre and I've ...   
3  I just finished reading a book about Dillinger...   
4  Greg Davis and Bryan Daly take some crazed sta...   

                                      cleaned_review  
0  panic street richard widmark play navy doctor ...  
1  ask first one really better one look sarah rea...  
2  big fan faerie tale theatre seen one best funn...  
3  finished reading book dillinger movie horribly...  
4  greg davis bryan daly take crazed statement te...  


  text = BeautifulSoup(text, "html.parser").get_text(separator=" ")


✅ Cleaned datasets saved: IMDB_cleaned_train.csv & IMDB_cleaned_test.csv
