# In [None]:
"""
# Twitter Sentiment Analysis - Model Training Notebook

## 1. Import Libraries
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

"""
## 2. Load Processed Data
"""

# Load the preprocessed dataset. Only read failures trigger the fallback to
# the raw export; anything else (including Ctrl-C) propagates instead of
# being silently swallowed.
try:
    df = pd.read_csv('../data/processed/cleaned_twitter_data.csv')
    print("Loaded processed data shape:", df.shape)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
    # Fallback to original data with preprocessing
    print(f"Could not load processed data ({exc}); falling back to raw data.")
    df = pd.read_csv('../Twitter_Data.csv')
    # Apply preprocessing here if needed

# Fail early, with a message naming the problem, when the loaded frame does
# not carry the columns the rest of the notebook relies on.
missing_columns = {'cleaned_text', 'category'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

# Rows with a missing text or label cannot be vectorized or stratified,
# so drop them before splitting.
rows_before = len(df)
df = df.dropna(subset=['cleaned_text', 'category'])
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with missing text or label")

X = df['cleaned_text'].astype(str)  # the vectorizer expects string documents
y = df['category']

# Stratified 80/20 split with a fixed seed so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

"""
## 3. Define Models and Parameters
"""

# Candidate models, registered one at a time. Every entry has the same layout:
#   'pipeline' - a TF-IDF step named 'tfidf' feeding a classifier step named 'clf'
#   'params'   - the grid to search, keyed as '<step>__<parameter>'
models = {}

models['Naive_Bayes'] = dict(
    pipeline=Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]),
    params=dict(
        tfidf__max_features=[1000, 2000, 3000],
        tfidf__ngram_range=[(1, 1), (1, 2)],
        clf__alpha=[0.1, 0.5, 1.0],
    ),
)

models['Logistic_Regression'] = dict(
    pipeline=Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(random_state=42)),
    ]),
    params=dict(
        tfidf__max_features=[2000, 3000],
        tfidf__ngram_range=[(1, 2)],
        clf__C=[0.1, 1, 10],
        clf__max_iter=[1000],
    ),
)

models['Random_Forest'] = dict(
    pipeline=Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier(random_state=42)),
    ]),
    params=dict(
        tfidf__max_features=[2000, 3000],
        clf__n_estimators=[100, 200],
        clf__max_depth=[10, 20, None],
        clf__min_samples_split=[2, 5],
    ),
)

models['SVM'] = dict(
    pipeline=Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', SVC(random_state=42)),
    ]),
    params=dict(
        tfidf__max_features=[2000, 3000],
        clf__C=[0.1, 1, 10],
        clf__kernel=['linear', 'rbf'],
    ),
)

"""
## 4. Model Training with Grid Search
"""

# Grid-search every candidate pipeline, keep each search's winner, and track
# the overall winner by mean cross-validated accuracy.
trained_models = {}
best_model = None
# Bound up front: the name is otherwise only assigned inside the conditional
# below, which would leave later cells with a NameError if it never runs.
best_model_name = None
best_score = 0

for model_name, model_config in models.items():
    print(f"\n{'='*50}")
    print(f"Training {model_name}...")
    print(f"{'='*50}")

    pipeline = model_config['pipeline']
    params = model_config['params']

    # Perform grid search: 5-fold CV, accuracy scoring, all CPU cores
    grid_search = GridSearchCV(
        pipeline,
        params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1
    )

    grid_search.fit(X_train, y_train)

    # Store results
    trained_models[model_name] = {
        'model': grid_search.best_estimator_,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'cv_results': grid_search.cv_results_
    }

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.4f}")

    # Update best model (strict '>' keeps the earliest candidate on ties)
    if grid_search.best_score_ > best_score:
        best_score = grid_search.best_score_
        best_model = grid_search.best_estimator_
        best_model_name = model_name

"""
## 5. Model Evaluation
"""

# Score every tuned pipeline on the held-out test split and collect the
# accuracy, raw predictions and fitted estimator per model.
print(f"\n{'='*50}")
print("MODEL EVALUATION ON TEST SET")
print(f"{'='*50}")

results = {}

for model_name in trained_models:
    print(f"\n{model_name}:")
    model = trained_models[model_name]['model']

    # Predict once, then derive both metrics from the same predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    results[model_name] = dict(
        accuracy=accuracy,
        predictions=y_pred,
        model=model,
    )

    # One call emitting the same three lines, newline-separated
    print(
        f"Test Accuracy: {accuracy:.4f}",
        "Classification Report:",
        report,
        sep="\n",
    )

"""
## 6. Model Comparison
"""

# One summary row per evaluated model: its best cross-validation score next
# to its accuracy on the test split.
comparison_data = [
    {
        'Model': model_name,
        'CV_Score': trained_models[model_name]['best_score'],
        'Test_Accuracy': result['accuracy'],
    }
    for model_name, result in results.items()
]

# Highest test accuracy first
comparison_df = pd.DataFrame(comparison_data).sort_values(
    'Test_Accuracy', ascending=False
)

print("\nModel Comparison:")
print(comparison_df)

# Grouped bar chart: two bars per model, placed either side of each tick
width = 0.35
x = np.arange(len(comparison_df))

plt.figure(figsize=(10, 6))
for offset, column, label in (
    (-width / 2, 'CV_Score', 'CV Score'),
    (width / 2, 'Test_Accuracy', 'Test Accuracy'),
):
    plt.bar(x + offset, comparison_df[column], width, label=label, alpha=0.7)

plt.title('Model Performance Comparison')
plt.xlabel('Models')
plt.ylabel('Scores')
plt.xticks(x, comparison_df['Model'], rotation=45)
plt.legend()
plt.tight_layout()
plt.show()

"""
## 7. Save Trained Models
"""

# Persist every tuned pipeline, a separate copy of the overall best one, and
# a JSON summary of the comparison.
import json
import os

print("\nSaving trained models...")

models_dir = "../models/trained_models"
evaluation_dir = "../models/model_evaluation"

# Create the output folders up front so the dumps below do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(evaluation_dir, exist_ok=True)

for model_name, model_info in trained_models.items():
    model = model_info['model']
    filename = f"{models_dir}/{model_name}_model.pkl"
    joblib.dump(model, filename)
    print(f"Saved {model_name} to {filename}")

# Save best model separately
best_model_filename = f"{models_dir}/best_model.pkl"
joblib.dump(best_model, best_model_filename)
print(f"\nBest model ({best_model_name}) saved to {best_model_filename}")

# Save results
results_summary = {
    'best_model': best_model_name,
    'best_score': best_score,
    'model_comparison': comparison_df.to_dict('records')
}

with open(f"{evaluation_dir}/training_results.json", 'w', encoding='utf-8') as f:
    json.dump(results_summary, f, indent=2)

print("\nModel training completed successfully!")