### 1. Import libraries

In [3]:
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bbuser\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

### 2. Load dataset

In [5]:
import pandas as pd
import glob

def load_reviews_from_dir(directory, label):
    files = glob.glob(directory + "/*.txt")
    data = []
    for f in files:
        with open(f, encoding="utf-8") as file:
            text = file.read()
            data.append([text, label])
    return pd.DataFrame(data, columns=['review', 'sentiment'])

train_pos = r"C:\Users\bbuser\Desktop\Jupyter\aclImdb\train\pos"
train_neg = r"C:\Users\bbuser\Desktop\Jupyter\aclImdb\train\neg"


df_train = pd.concat([
    load_reviews_from_dir(train_pos, 'positive'),
    load_reviews_from_dir(train_neg, 'negative')
], ignore_index=True)

In [6]:
df_train.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Homelessness (or Houselessness as George Carli...,positive
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive
3,This is easily the most underrated film inn th...,positive
4,This is not the typical Mel Brooks film. It wa...,positive


### 3. Clean the reviews

In [7]:
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Keep only letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # Lowercase
    text = text.lower().split()
    # Remove stopwords and lemmatize
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
    return " ".join(text)

In [8]:
df_train["cleaned_review"] = df_train["review"].apply(clean_text)
df_train = df_train[["cleaned_review", "sentiment"]]  # keep only what we need
df_train.head()

  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,cleaned_review,sentiment
0,bromwell high cartoon comedy ran time program ...,positive
1,homelessness houselessness george carlin state...,positive
2,brilliant acting lesley ann warren best dramat...,positive
3,easily underrated film inn brook cannon sure f...,positive
4,typical mel brook film much less slapstick mov...,positive


### 4. Train-test split

In [9]:
X = df_train["cleaned_review"]
y = df_train["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### 5. Apply TF-IDF

In [10]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

### 6. Train Logistic Regression

In [11]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [12]:
y_pred_log = log_reg.predict(X_test_tfidf)

### 7. Train Decision Tree

In [14]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_tfidf, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [15]:
y_pred_dt = dt.predict(X_test_tfidf)

### 8. Train Random Forest

In [16]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
y_pred_rf = rf.predict(X_test_tfidf)

### 9. Evaluate all models

In [18]:
def evaluate_model(y_true, y_pred):
    return {
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred, pos_label="positive"),
        "Recall": recall_score(y_true, y_pred, pos_label="positive"),
        "F1": f1_score(y_true, y_pred, pos_label="positive"),
    }

results = pd.DataFrame({
    "Logistic Regression": evaluate_model(y_test, y_pred_log),
    "Decision Tree": evaluate_model(y_test, y_pred_dt),
    "Random Forest": evaluate_model(y_test, y_pred_rf)
}).T

results

Unnamed: 0,Accuracy,Precision,Recall,F1
Logistic Regression,0.8722,0.860799,0.888,0.874188
Decision Tree,0.6994,0.70012,0.6976,0.698858
Random Forest,0.8408,0.847188,0.8316,0.839322


### 10. Classification reports

In [19]:
print("Logistic Regression:\n", classification_report(y_test, y_pred_log))
print("Decision Tree:\n", classification_report(y_test, y_pred_dt))
print("Random Forest:\n", classification_report(y_test, y_pred_rf))

Logistic Regression:
               precision    recall  f1-score   support

    negative       0.88      0.86      0.87      2500
    positive       0.86      0.89      0.87      2500

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

Decision Tree:
               precision    recall  f1-score   support

    negative       0.70      0.70      0.70      2500
    positive       0.70      0.70      0.70      2500

    accuracy                           0.70      5000
   macro avg       0.70      0.70      0.70      5000
weighted avg       0.70      0.70      0.70      5000

Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.85      0.84      2500
    positive       0.85      0.83      0.84      2500

    accuracy                           0.84      5000
   macro avg       0.84      0.84      0.84      5000
weighted avg       0

### 11. Example predictions

In [20]:
sample_reviews = X_test.sample(5, random_state=42)
sample_preds = {
    "Logistic Regression": log_reg.predict(vectorizer.transform(sample_reviews)),
    "Decision Tree": dt.predict(vectorizer.transform(sample_reviews)),
    "Random Forest": rf.predict(vectorizer.transform(sample_reviews)),
}

pd.DataFrame({
    "Review": sample_reviews.values,
    "True Sentiment": y_test.loc[sample_reviews.index].values,
    "LogReg Prediction": sample_preds["Logistic Regression"],
    "DecisionTree Prediction": sample_preds["Decision Tree"],
    "RandomForest Prediction": sample_preds["Random Forest"],
})

Unnamed: 0,Review,True Sentiment,LogReg Prediction,DecisionTree Prediction,RandomForest Prediction
0,checking spoiler alert case perhaps one horren...,negative,negative,negative,negative
1,film appears draw borderline one side love fin...,positive,positive,negative,negative
2,seen film one time year ago day always told pe...,positive,positive,positive,positive
3,belgian film directed tom barman singer well k...,positive,positive,negative,positive
4,viggo mortensen star new inmate haunted prison...,positive,positive,positive,positive


In [21]:
# Cell 10: Insights
print("Insights:")
print("- Logistic Regression usually performs very well on text classification because TF-IDF makes reviews linearly separable.")
print("- Decision Tree tends to overfit and may perform worse on new reviews.")
print("- Random Forest reduces overfitting and often performs better than a single Decision Tree.")
print("- If speed and simplicity are important: choose Logistic Regression.")
print("- If you want a stronger but more complex model: Random Forest.")

Insights:
- Logistic Regression usually performs very well on text classification because TF-IDF makes reviews linearly separable.
- Decision Tree tends to overfit and may perform worse on new reviews.
- Random Forest reduces overfitting and often performs better than a single Decision Tree.
- If speed and simplicity are important: choose Logistic Regression.
- If you want a stronger but more complex model: Random Forest.
