In [None]:
import os
import re
import tarfile
import pandas as pd
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fetch the NLTK resources this script relies on: the stopword list, the
# tokenizer models used by nltk.word_tokenize, and the WordNet data used by
# the lemmatizer. Recent NLTK releases load the tokenizer models from
# "punkt_tab" instead of the pickled "punkt" package, so both are requested;
# nltk.download reports (rather than raises) when a package is unavailable.
for _resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(_resource)

# Built once at module level: a set gives O(1) membership tests when
# filtering tokens, and the lemmatizer instance is reused for every review.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


# Location of the downloaded IMDB archive and the directory it is unpacked to.
file_path = r"C:\Users\bbuser\Downloads\aclImdb_v1.tar.gz"
extract_dir = r"C:\Users\bbuser\Downloads\aclImdb"

# The presence of the target directory is treated as "already unpacked",
# so the archive is only opened on the first run.
if os.path.exists(extract_dir):
    print(" Already extracted!")
else:
    with tarfile.open(name=file_path, mode="r:gz") as tar:
        # filter="data" asks tarfile to apply its "data" extraction filter
        # to every member (the original author's note: safe extraction).
        tar.extractall(path=extract_dir, filter="data")
    print("Extraction done!")


def load_imdb_data(data_dir):
    """Read the labelled reviews under *data_dir* into a DataFrame.

    Args:
        data_dir: Directory containing a ``pos`` and a ``neg`` sub-folder,
            each holding one UTF-8 text file per review.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``review`` (file contents)
        and ``sentiment`` (``"pos"`` or ``"neg"``, taken from the folder
        name). All ``pos`` rows come first, and rows within a folder are
        ordered by file name.

    Raises:
        FileNotFoundError: If ``data_dir/pos`` or ``data_dir/neg`` is missing.
    """
    data = {"review": [], "sentiment": []}
    for sentiment in ["pos", "neg"]:
        folder = os.path.join(data_dir, sentiment)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order (and any seeded split done on it later) reproducible.
        for fname in sorted(os.listdir(folder)):
            path = os.path.join(folder, fname)
            # Skip sub-directories and other non-file entries, which would
            # otherwise make open() fail.
            if not os.path.isfile(path):
                continue
            with open(path, encoding="utf-8") as f:
                data["review"].append(f.read())
            data["sentiment"].append(sentiment)
    return pd.DataFrame(data)

# Both official splits are read from the "aclImdb" folder inside the
# extraction directory, then stacked into one frame with a fresh 0..n-1 index.
train_df, test_df = (
    load_imdb_data(os.path.join(extract_dir, "aclImdb/train")),
    load_imdb_data(os.path.join(extract_dir, "aclImdb/test")),
)

df = pd.concat(objs=[train_df, test_df], ignore_index=True)
print("Dataset loaded. Shape:", df.shape)


def clean_review(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, strip HTML markup, drop URLs and e-mail
    addresses, discard every character that is not a lowercase letter or
    whitespace, tokenize, remove English stopwords and tokens of two
    characters or fewer, then lemmatize.

    Args:
        text: Raw review text, possibly containing HTML such as ``<br />``.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty
        string).
    """
    # Lowercase
    text = text.lower()
    # Remove HTML. Text nodes are joined with a space: with the default empty
    # separator, "great.<br /><br />the" becomes "great.the" and the two
    # words fuse into one token once punctuation is stripped.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Remove URLs/emails
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"\S+@\S+", '', text)
    # Delete apostrophes outright so contractions remain a single token
    # ("it's" -> "its"), exactly as before.
    text = re.sub(r"['’]", '', text)
    # Replace every other non-letter with a space rather than nothing, so
    # "good-looking" or "end.then" split into words instead of merging.
    text = re.sub(r"[^a-z\s]", ' ', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords + short tokens
    tokens = [w for w in tokens if w not in stop_words and len(w) > 2]
    # Lemmatize
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

print("Cleaning reviews... (this may take a few minutes)")
# Run the cleaner over every review and keep the result in a new column.
df["cleaned_review"] = df["review"].map(clean_review)

# Show the first five reviews before and after cleaning as a sanity check;
# the raw text is truncated to its first 200 characters.
examples = df[["review", "cleaned_review"]].head(5)
for raw_text, cleaned_text in zip(examples["review"], examples["cleaned_review"]):
    print("\nRaw:", raw_text[:200], "...")
    print("Cleaned:", cleaned_text)


# Hold out 20% of the cleaned reviews for evaluation; the fixed seed keeps
# the split repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    df["cleaned_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features limited to 5000 terms. The vectorizer is fitted on the
# training split only and then reused, unchanged, on the held-out split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic-regression classifier with a 200-iteration solver limit.
model = LogisticRegression(max_iter=200)
model.fit(X_train_vec, y_train)

# Predict labels for the held-out reviews.
y_pred = model.predict(X_test_vec)

print("\n Model Evaluation:")
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Persist the full frame (raw review, sentiment, cleaned review) as CSV,
# leaving out the integer row index.
output_path = r"C:\Users\bbuser\Downloads\IMDB_cleaned.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"\n Cleaned dataset saved to {output_path}")




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


 Already extracted!
