# Module 4: Feedback Analytics (NLP)

This notebook covers:
- Text preprocessing for passenger feedback
- Sentiment classification (TF-IDF + LogisticRegression)
- Sentiment analysis with a pre-trained transformer (Hugging Face)
- Topic modeling to cluster complaints

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Load data: read the feedback and ratings tables from the shared data folder
data_dir = Path('../data')
feedback = pd.read_csv(data_dir.joinpath('feedback.csv'))
ratings = pd.read_csv(data_dir.joinpath('ratings.csv'))

# Attach each trip's rating to its feedback row, matching on trip_id
trip_ratings = ratings[['trip_id', 'rating']]
df = feedback.merge(trip_ratings, on='trip_id')

# Create sentiment labels
def rating_to_sentiment(rating):
    """Map a numeric trip rating to a coarse sentiment label.

    Args:
        rating: Numeric rating (presumably on a 1-5 scale — TODO confirm
            against ratings.csv).

    Returns:
        'positive' for ratings >= 4, 'neutral' for ratings in [3, 4),
        and 'negative' for everything else.
    """
    if rating >= 4:
        return 'positive'
    # Range check rather than `== 3`, so fractional ratings such as 3.5
    # are labelled neutral instead of falling through to negative.
    if rating >= 3:
        return 'neutral'
    # Ratings below 3 end up here. NaN does too, because every comparison
    # with NaN is False.
    return 'negative'

# Label every merged row, then report the dataset size and class balance
df['sentiment'] = df['rating'].map(rating_to_sentiment)
sentiment_counts = df['sentiment'].value_counts()
print(f"Loaded {len(df)} feedback entries")
print(sentiment_counts)

In [None]:
# Text preprocessing
def preprocess_text(text):
    """Normalize one feedback string before vectorization.

    Lower-cases the text, deletes every character that is neither a word
    character nor whitespace, and collapses whitespace runs into single
    spaces (leading/trailing whitespace is dropped).

    Args:
        text: Raw feedback text. Any non-string value (e.g. ``None`` or the
            ``NaN`` float that stands in for a missing value) is treated as
            empty feedback.

    Returns:
        The cleaned string; ``''`` when the input is not a string.
    """
    # Missing feedback would otherwise raise AttributeError on .lower()
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    # split() with no argument drops empty pieces, which collapses whitespace runs
    return ' '.join(text.split())

df['clean_text'] = df['feedback_text'].apply(preprocess_text)

# Train-test split: hold out 20% of the rows, stratified on the sentiment label
texts, labels = df['clean_text'], df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF: unigrams and bigrams, English stop words removed, at most 1000 features
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=1000,
)
# Fit on the training split only; the test split is transformed with the fitted vectorizer
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Train classifier (fit() returns the estimator, so construction and fitting are chained)
clf = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the held-out split
report = classification_report(y_test, y_pred)
print("=== Classification Report ===")
print(report)

In [None]:
# Confusion Matrix, with one fixed label order shared by the matrix and both axes
sentiment_labels = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test, y_pred, labels=sentiment_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=sentiment_labels,
    yticklabels=sentiment_labels,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Sentiment Classification - Confusion Matrix')
fig.tight_layout()
plt.show()

# Save models
import joblib

# Create the target folder first so dump() cannot fail with FileNotFoundError
# after the model has already been trained.
model_dir = Path('../src')
model_dir.mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer together with the classifier that was trained on its output
joblib.dump(tfidf, model_dir / 'tfidf_vectorizer.joblib')
joblib.dump(clf, model_dir / 'sentiment_model.joblib')
print("✓ Saved sentiment model")

In [None]:
# Try Hugging Face transformer (if available)
try:
    # Only the import is guarded here, so the install hint is printed solely
    # when the transformers package itself cannot be imported.
    from transformers import pipeline
except ImportError:
    print("Install transformers: pip install transformers")
else:
    try:
        sentiment_pipe = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
    except (ImportError, OSError, RuntimeError) as exc:
        # Building the pipeline can fail even though transformers is installed
        # (presumably a missing backend or model files that cannot be fetched —
        # TODO confirm which cases matter here). Report the real cause and keep
        # this optional demo from stopping the notebook.
        print(f"Skipping Hugging Face demo: {exc}")
    else:
        samples = ["Driver was rude and late", "Great ride, very professional", "Ride was okay"]
        print("=== Hugging Face Sentiment ===")
        for text in samples:
            # The pipeline returns a list of result dicts; take the single entry for this text
            result = sentiment_pipe(text)[0]
            print(f"'{text}' → {result['label']}: {result['score']:.3f}")