In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

# Silent NLTK downloads
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

# Text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user mentions and hashtags
    text = re.sub(r'\@\w+|\#\w+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Tokenize
    tokens = text.split()
    
    # Remove stopwords and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Create balanced dataset
data = pd.DataFrame({
    'text': [
        # Positive (15 examples)
        "I love this product! It's amazing",
        "This is the best day ever",
        "Just got promoted at work! So excited",
        "My package arrived early! Great service",
        "Absolutely in love with my new phone",
        "The conference was informative and well-organized",
        "The team did an outstanding job on this project",
        "What an incredible performance",
        "I'm really happy with the service",
        "Highly recommend to everyone",
        "Such a beautiful experience",
        "Perfect solution for my needs",
        "Exceeded all my expectations",
        "Wonderful customer support",
        "Couldn't be happier with my purchase",
        
        # Negative (15 examples)
        "Terrible experience, would not recommend",
        "I'm so frustrated with this app",
        "The customer service was horrible",
        "This restaurant has the worst food",
        "Flight got delayed again",
        "Broken item received. Very disappointed",
        "The app interface is confusing",
        "Worst customer experience of my life",
        "The food was cold when it arrived",
        "Never buying from this brand again",
        "Complete waste of money",
        "Extremely poor quality",
        "Absolutely disgusting",
        "Failed to meet basic expectations",
        "Regret purchasing this product",
        
        # Neutral (15 examples)
        "The service was okay, nothing special",
        "The weather is nice today",
        "The movie was decent, but could've been better",
        "The book was interesting, though a bit long",
        "The delivery was late but the product is good",
        "Average performance, expected better",
        "Pretty good for the price",
        "The event was well-organized but crowded",
        "The product is okay, nothing special",
        "The tutorial was helpful for beginners",
        "The presentation was informative but too long",
        "The service was adequate for the price",
        "The food was neither good nor bad",
        "The movie was average, not great not terrible",
        "The app works but could use some improvements"
    ],
    'sentiment': ['positive']*15 + ['negative']*15 + ['neutral']*15
})

# Preprocess text
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Split data (30% test for better evaluation)
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], 
    data['sentiment'], 
    test_size=0.3, 
    random_state=42,
    stratify=data['sentiment']
)

# Verify class distribution
print("Training set distribution:")
print(y_train.value_counts())
print("\nTest set distribution:")
print(y_test.value_counts())

# Build pipeline with optimized parameters
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000, ngram_range=(1, 2))),
    ('clf', RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        max_depth=10,
        random_state=42
    ))
])

# Train model
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("\nModel Accuracy: {:.2f}%".format(accuracy_score(y_test, y_pred)*100))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Prediction function with confidence scores
def predict_sentiment(text):
    try:
        cleaned = preprocess_text(text)
        pred = pipeline.predict([cleaned])[0]
        proba = pipeline.predict_proba([cleaned])[0]
        return pred, dict(zip(pipeline.classes_, proba))
    except Exception as e:
        print(f"Prediction error: {str(e)}")
        return "error", {"positive": 0, "negative": 0, "neutral": 0}

# Test cases
test_posts = [
    "I'm really happy with the service!",
    "This is unacceptable quality",
    "The event was well-organized but crowded",
    "The product is not okay, I hate this",
    "Absolutely wonderful experience",
    "The worst product I've ever bought"
]

print("\nTesting the model:")
for post in test_posts:
    sentiment, confidence = predict_sentiment(post)
    print(f"\nPost: '{post}'")
    print(f"Predicted sentiment: {sentiment}")
    print(f"Confidence: {confidence}")

Training set distribution:
sentiment
negative    11
neutral     10
positive    10
Name: count, dtype: int64

Test set distribution:
sentiment
neutral     5
positive    5
negative    4
Name: count, dtype: int64

Model Accuracy: 28.57%

Classification Report:
              precision    recall  f1-score   support

    negative       0.30      0.75      0.43         4
     neutral       1.00      0.20      0.33         5
    positive       0.00      0.00      0.00         5

    accuracy                           0.29        14
   macro avg       0.43      0.32      0.25        14
weighted avg       0.44      0.29      0.24        14


Testing the model:

Post: 'I'm really happy with the service!'
Predicted sentiment: positive
Confidence: {'negative': 0.293916019189153, 'neutral': 0.11406260956815187, 'positive': 0.5920213712426947}

Post: 'This is unacceptable quality'
Predicted sentiment: negative
Confidence: {'negative': 0.4634242933567909, 'neutral': 0.17305282356314083, 'positive': 0.