In [1]:
# pip install scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


# CountVectorizer implements Bag of Words.

# TfidfVectorizer computes TF‑IDF weights.

# LogisticRegression is a common linear model for text classification.

# ---------------------------------------
# 1. Example review snippets with labels
# ---------------------------------------
# Each entry pairs a review snippet with its sentiment label
# (1 = positive, 0 = negative) so a text and its label sit side by side.
_labeled_reviews = [
    ("One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked.", 1),
    ("A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time", 1),
    ("I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioner.", 1),
    ("Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet.", 0),
    ("Petter Mattei's 'Love in the Time of Money' is a visually stunning film to watch.", 1),
    ("Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication.", 1),
    ("I sure would like to see a resurrection of a up dated Seahunt series with the tech they have", 1),
    ("This show was an amazing, fresh & innovative idea in the 70's when it first aired.", 0),
    ("Encouraged by the positive comments about this film on here I was looking forward to watching this.", 0),
    ("If you like original gut wrenching laughter you will like this movie. If you are young or old", 1),
    ("Phil the Alien is one of those quirky films where the humour is based around the oddness", 0),
    ("I saw this movie when I was about 12 when it came out. I recall the scariest scene was the big bird.", 0),
    ("So im not a big fan of Boll's work but then again not many are. I enjoyed his movie Postal.", 0),
    ("My first exposure to the Templarios & not a good one.", 0),
]

# Unzip the pairs into the parallel containers the rest of the script uses:
# a plain list of strings and an integer label array of the same length.
texts = [review for review, _ in _labeled_reviews]
labels = np.array([sentiment for _, sentiment in _labeled_reviews])

# This mimics real review text with positive/negative sentiment labels.


# ---------------------------------------
# 2. Bag of Words (CountVectorizer)
# ---------------------------------------
# Simple preprocessing: lower-case the text and drop English stop words
# before counting tokens.
bow_vectorizer = CountVectorizer(lowercase=True, stop_words="english")
X_bow = bow_vectorizer.fit_transform(texts)

# First 20 terms of the fitted term -> column-index mapping.
sample_vocabulary = list(bow_vectorizer.vocabulary_)[:20]

print("BoW shape:", X_bow.shape)  # (documents, vocabulary_size)
print("Sample vocabulary (first 20):", sample_vocabulary)

# BoW counts how often each word appears in each document, ignoring order.

# Result is a sparse document‑term matrix suitable for ML models.

# TF-IDF representation
# Same preprocessing as the BoW vectorizer (lower-casing, English stop words),
# but each count is re-weighted by TF-IDF.
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
X_tfidf = tfidf_vectorizer.fit_transform(texts)

# Array of the terms learned during fitting.
feature_names = tfidf_vectorizer.get_feature_names_out()

print("TF-IDF shape:", X_tfidf.shape)
print("Example terms:", feature_names[:15])

# TF‑IDF = term frequency × inverse document frequency; it up‑weights words that are
# frequent within a document but rare across the corpus as a whole.

# Often improves performance over raw counts for tasks like sentiment or spam detection.

# Train sentiment classifier (BoW vs TF‑IDF)

# Train/test split
# Both feature matrices and the labels are split in ONE call so that the BoW
# rows, TF-IDF rows and labels are partitioned with the same indices. Two
# separate calls would only stay aligned for as long as their test_size /
# random_state / stratify arguments happened to be kept identical.
# train_test_split returns a (train, test) pair per input, in input order.
(
    X_bow_train,
    X_bow_test,
    X_tfidf_train,
    X_tfidf_test,
    y_train,
    y_test,
) = train_test_split(
    X_bow,
    X_tfidf,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,  # keep the positive/negative ratio similar in both splits
)

# Logistic Regression with BoW
# fit() returns the estimator itself, so construction and training are chained.
bow_clf = LogisticRegression(max_iter=1000).fit(X_bow_train, y_train)
y_pred_bow = bow_clf.predict(X_bow_test)

# Evaluate on the held-out split; label 0 is reported as "negative" and
# label 1 as "positive".
bow_report = classification_report(
    y_test, y_pred_bow, target_names=["negative", "positive"]
)
bow_confusion = confusion_matrix(y_test, y_pred_bow)

print("\n=== BoW + Logistic Regression ===")
print(bow_report)
print("Confusion matrix (BoW):\n", bow_confusion)

# Logistic Regression with TF-IDF
tfidf_clf = LogisticRegression(max_iter=1000)
tfidf_clf.fit(X_tfidf_train, y_train)

y_pred_tfidf = tfidf_clf.predict(X_tfidf_test)

print("\n=== TF-IDF + Logistic Regression ===")
# On a split this small the model may never predict one of the classes, which
# leaves that class's precision undefined (0 / 0). zero_division=0 reports the
# metric as 0.00 without scikit-learn emitting an undefined-metric warning for
# every averaged row of the report.
print(
    classification_report(
        y_test,
        y_pred_tfidf,
        target_names=["negative", "positive"],
        zero_division=0,
    )
)
print("Confusion matrix (TF-IDF):\n", confusion_matrix(y_test, y_pred_tfidf))

# This mirrors standard text‑classification pipelines in scikit‑learn tutorials.

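# TF‑IDF often gives slightly better precision/recall than raw counts, but with only 14 reviews
# (5 of them in the test split) the comparison is too noisy to be meaningful; in the recorded
# run, BoW actually scored higher than TF‑IDF.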

# Prediction of sentiment for new reviews

# Using the trained TF-IDF model
# NOTE: the first review below also appears verbatim in `texts`, so it is not
# truly unseen by the vectorizer; only the second one is new text.
new_reviews = [
    "My first exposure to the Templarios & not a good one.",
    "One of the most significant quotes from the entire film is pronounced halfway through.",
]

# Vectorize with the same fitted TF-IDF vectorizer (transform only, no refit)
new_X = tfidf_vectorizer.transform(new_reviews)
new_preds = tfidf_clf.predict(new_X)
new_probs = tfidf_clf.predict_proba(new_X)

# predict_proba columns are ordered like tfidf_clf.classes_, so map each class
# label to its column instead of assuming the label value is the column index.
class_column = {cls: idx for idx, cls in enumerate(tfidf_clf.classes_)}

for review, label, prob in zip(new_reviews, new_preds, new_probs):
    sentiment = "positive" if label == 1 else "negative"
    predicted_prob = prob[class_column[label]]  # probability of the predicted class
    print(f"\nReview: {review}")
    print(f"Predicted sentiment: {sentiment} (prob={predicted_prob:.2f})")
# Reusing the same vectorizer and classifier emulates a production sentiment‑analysis API.

# Logistic regression outputs probabilities that can be thresholded or inspected for uncertainty.

BoW shape: (14, 95)
Sample vocabulary (first 20): ['reviewers', 'mentioned', 'watching', 'just', 'oz', 'episode', 'll', 'hooked', 'wonderful', 'little', 'production', 'br', 'filming', 'technique', 'unassuming', 'old', 'time', 'thought', 'way', 'spend']
TF-IDF shape: (14, 95)
Example terms: ['12' '70' 'air' 'aired' 'alien' 'amazing' 'based' 'basically' 'big'
 'bird' 'boll' 'boy' 'br' 'came' 'closet']

=== BoW + Logistic Regression ===
              precision    recall  f1-score   support

    negative       1.00      0.67      0.80         3
    positive       0.67      1.00      0.80         2

    accuracy                           0.80         5
   macro avg       0.83      0.83      0.80         5
weighted avg       0.87      0.80      0.80         5

Confusion matrix (BoW):
 [[2 1]
 [0 2]]

=== TF-IDF + Logistic Regression ===
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
    positive       0.40      1.00      0.57     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
