In [None]:
# Step 1: Download and extract IMDb dataset
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

# Step 2: Import libraries
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Step 3: Load dataset (train set only)
def load_data(folder, limit=1000):
    """Load labelled reviews from the ``pos`` and ``neg`` subfolders of *folder*.

    Each file in ``folder/pos`` is read as one positive review (label 1) and
    each file in ``folder/neg`` as one negative review (label 0).

    Args:
        folder: Directory containing the ``pos`` and ``neg`` subdirectories.
        limit: Maximum number of files to read per class, kept small by
            default for speed. Pass ``None`` to read every file.

    Returns:
        A ``pandas.DataFrame`` with a ``review`` column (file contents) and a
        ``sentiment`` column (1 or 0), positives first.

    Raises:
        FileNotFoundError: If ``folder/pos`` or ``folder/neg`` does not exist.
    """
    reviews = []
    labels = []
    for label in ['pos', 'neg']:
        path = os.path.join(folder, label)
        # os.listdir returns entries in arbitrary order; sort first so the
        # same subset is selected on every run and on every machine.
        for name in sorted(os.listdir(path))[:limit]:
            with open(os.path.join(path, name), encoding='utf-8') as f:
                reviews.append(f.read())
            labels.append(1 if label == 'pos' else 0)
    return pd.DataFrame({'review': reviews, 'sentiment': labels})

# Read the labelled reviews from the extracted training folder into a
# DataFrame with 'review' and 'sentiment' columns.
data = load_data('aclImdb/train')

# Step 4: Split and vectorize
# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the split repeatable for the same input data.
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42)

# TF-IDF features with English stop words removed and the vocabulary capped
# at 5000 terms. The vectorizer is fitted on the training split only and then
# reused unchanged on the test split, so no test-set vocabulary leaks in.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 5: Train model
# Logistic regression with scikit-learn's default hyperparameters.
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Step 6: Predict
# Score the fitted model on the held-out split and print the result.
print("Test Accuracy:", model.score(X_test_vec, y_test))

# Step 7: Try your own review
def predict(review):
    """Classify one review string as "Positive" or "Negative".

    The text is transformed with the module-level ``vectorizer`` and passed
    to the module-level ``model``. A predicted label of 1 maps to
    "Positive"; any other label maps to "Negative".
    """
    features = vectorizer.transform([review])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"

# Print the predicted sentiment for a couple of hand-written reviews.
for sample in (
    "The movie was so exciting",
    "I loved the story and the acting was amazing!",
):
    print(predict(sample))


--2025-07-26 09:14:57--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2025-07-26 09:15:28 (2.62 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]

Test Accuracy: 0.8175
Positive
Positive
