# Part B: Model Development & Training
## Weather Emergency Prediction - Rostov Region

This notebook handles:
- Model training for emergency prediction
- Model evaluation and metrics
- Feature importance analysis
- Model comparison

In [None]:
# Install required packages
!pip install pandas numpy scikit-learn matplotlib seaborn joblib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

# Seed NumPy's global random generator so anything drawing from it is repeatable
# between runs of this notebook.
np.random.seed(42)
print("‚úÖ Libraries imported successfully")

## 1. Load Processed Data from Part A

In [None]:
# Load the merged data from Part A
try:
    merged_df = pd.read_csv('merged_data_rostov.csv')
    merged_df['date'] = pd.to_datetime(merged_df['date'])
    print(f"‚úÖ Loaded {len(merged_df)} records")
    print(f"Emergency days: {merged_df['has_emergency'].sum()}")
    print(f"Normal days: {(merged_df['has_emergency']==0).sum()}")
except FileNotFoundError:
    print("‚ùå File not found. Please run Part A first or upload the file.")
    # To upload the file in Colab instead, uncomment the lines below and
    # delete the `raise` at the end of this block.
    # from google.colab import files
    # uploaded = files.upload()
    # merged_df = pd.read_csv('merged_data_rostov.csv')
    # merged_df['date'] = pd.to_datetime(merged_df['date'])

    # Stop here with the real cause. Without the file, `merged_df` is either
    # undefined (the next line would fail with an unrelated NameError) or a
    # stale frame left over from an earlier run of this cell.
    raise

merged_df.head()

## 2. Prepare Training Data

In [None]:
def prepare_features(df, target_col='has_emergency', exclude_cols=None):
    """Prepare feature matrix and target variable.

    Rows whose target is missing are dropped. Every remaining numeric column
    that is not explicitly excluded becomes a feature, and missing feature
    values are replaced with 0.

    Args:
        df: DataFrame holding the target column plus candidate feature columns.
            The caller's frame is not modified.
        target_col: Name of the label column. Defaults to 'has_emergency'.
        exclude_cols: Column names that must never be used as features.
            Defaults to the identifier / label-derived columns of the merged
            dataset. The target column is always excluded.

    Returns:
        Tuple ``(X, y, feature_cols)``: the feature DataFrame, the target
        Series, and the list of feature column names in DataFrame order.
    """
    if exclude_cols is None:
        # Exclude non-feature columns
        exclude_cols = ['date', 'latitude', 'longitude', 'has_emergency',
                        'emergency_type', 'emergency_severity']

    # Rows without a label cannot be used for training or evaluation.
    df = df.dropna(subset=[target_col])

    excluded = set(exclude_cols) | {target_col}

    # Select by numeric *kind* rather than by the exact names 'int64'/'float64',
    # so int32 / float32 / uint8 columns are not silently dropped.
    # Boolean and object columns are still left out.
    numeric_cols = df.select_dtypes(include='number').columns
    feature_cols = [col for col in numeric_cols if col not in excluded]

    X = df[feature_cols].fillna(0)
    y = df[target_col]

    print(f"Features: {len(feature_cols)}")
    print(f"Samples: {len(X)}")
    print(f"Class distribution:")
    print(y.value_counts())

    return X, y, feature_cols

# Build the feature matrix, target vector and ordered feature-name list from the
# merged frame; `feature_names` is reused later for feature importance and is
# stored alongside the saved model.
X, y, feature_names = prepare_features(merged_df)
print(f"\n‚úÖ Data prepared: {X.shape}")

In [None]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# emergency / normal ratio the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Learn the scaling statistics from the training rows only, then apply that
# same transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("‚úÖ Data split and scaled")

## 3. Train Multiple Models

In [None]:
# Configure the three candidate classifiers up front; each gets the same
# random_state so repeated runs give the same fitted models
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# Model 1: Random Forest
print("Training Random Forest...")
rf_model.fit(X_train_scaled, y_train)
print("‚úÖ Random Forest trained")

# Model 2: Gradient Boosting
print("\nTraining Gradient Boosting...")
gb_model.fit(X_train_scaled, y_train)
print("‚úÖ Gradient Boosting trained")

# Model 3: Logistic Regression
print("\nTraining Logistic Regression...")
lr_model.fit(X_train_scaled, y_train)
print("‚úÖ Logistic Regression trained")

## 4. Model Evaluation

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """Evaluate a fitted binary classifier on held-out data and print a report.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Feature matrix to score, preprocessed the same way as the data
            the model was fitted on.
        y_test: True labels for ``X_test``.
        model_name: Label used in the printed report header.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc``, ``y_pred`` (hard predictions) and ``y_pred_proba``
        (second column of ``predict_proba``). ``roc_auc`` is NaN when
        ``y_test`` contains fewer than two distinct classes.
    """
    print(f"\n{'='*50}")
    print(f"{model_name} EVALUATION")
    print(f"{'='*50}")

    # Predictions
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the probability of the second class
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # ROC AUC needs both classes in the evaluation labels. With a rare positive
    # class a small test split can end up single-class; report NaN instead of
    # aborting the whole evaluation.
    auc_defined = np.unique(y_test).size >= 2
    roc_auc = roc_auc_score(y_test, y_pred_proba) if auc_defined else float('nan')

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print(f"ROC AUC:   {roc_auc:.4f}")
    if not auc_defined:
        print("(ROC AUC is undefined: y_test contains a single class)")

    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    print(f"\nClassification Report:")
    # zero_division=0 matches the metric calls above and avoids warnings when a
    # class is never predicted
    print(classification_report(y_test, y_pred, zero_division=0))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

# Evaluate all models
# Every model was fitted on the scaled training matrix, so each one is scored on
# the scaled test matrix; the returned dicts feed the comparison table and ROC plot.
rf_metrics = evaluate_model(rf_model, X_test_scaled, y_test, "Random Forest")
gb_metrics = evaluate_model(gb_model, X_test_scaled, y_test, "Gradient Boosting")
lr_metrics = evaluate_model(lr_model, X_test_scaled, y_test, "Logistic Regression")

## 5. Model Comparison

In [None]:
# Compare models
# Table column title -> key in the dict returned by evaluate_model
score_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1 Score': 'f1',
    'ROC AUC': 'roc_auc',
}
scored_models = [
    ('Random Forest', rf_metrics),
    ('Gradient Boosting', gb_metrics),
    ('Logistic Regression', lr_metrics),
]

# One row per model, one column per metric
comparison_df = pd.DataFrame([
    {'Model': label, **{column: scores[key] for column, key in score_columns.items()}}
    for label, scores in scored_models
])

print("\n" + "="*70)
print("MODEL COMPARISON")
print("="*70)
print(comparison_df.to_string(index=False))

# Visualization: grouped bars, one group per model
fig, ax = plt.subplots(figsize=(12, 6))
comparison_df.set_index('Model')[list(score_columns)].plot.bar(ax=ax)
ax.set_title('Model Performance Comparison', fontsize=16, fontweight='bold')
ax.set_ylabel('Score')
plt.xticks(rotation=45, ha='right')
ax.legend(loc='lower right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# ROC Curves
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

# One curve per model, drawn from the positive-class probabilities kept by evaluate_model
models_to_plot = (
    ('Random Forest', rf_metrics),
    ('Gradient Boosting', gb_metrics),
    ('Logistic Regression', lr_metrics),
)
for model_name, metrics in models_to_plot:
    fpr, tpr, _ = roc_curve(y_test, metrics['y_pred_proba'])
    roc_ax.plot(fpr, tpr, label=f"{model_name} (AUC = {metrics['roc_auc']:.3f})")

# Diagonal reference line: a classifier with no skill
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparison', fontsize=16, fontweight='bold')
roc_ax.legend()
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Feature importance from Random Forest
# Pair every feature name with the forest's importance score, largest first
feature_importance = pd.DataFrame(
    {'feature': feature_names, 'importance': rf_model.feature_importances_}
).sort_values(by='importance', ascending=False)

top_features = feature_importance.head(20)

print("\nTop 20 Most Important Features:")
print(top_features)

# Visualize top features as horizontal bars
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
bar_positions = range(len(top_features))
importance_ax.barh(bar_positions, top_features['importance'])
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'])
importance_ax.set_xlabel('Importance')
importance_ax.set_title('Top 20 Feature Importance (Random Forest)', fontsize=16, fontweight='bold')
# Flip the axis so the most important feature is drawn at the top
importance_ax.invert_yaxis()
importance_fig.tight_layout()
plt.show()

## 7. Save Best Model

In [None]:
# Select best model based on F1 score
best_idx = comparison_df['F1 Score'].idxmax()
best_model_name = comparison_df.loc[best_idx, 'Model']
print(f"\nüèÜ Best Model: {best_model_name}")

# Map the winning name back to its fitted estimator; any name other than the
# two tree ensembles falls through to logistic regression
best_model = {
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
}.get(best_model_name, lr_model)

# Save model and scaler together with the feature order the scaler was fitted on
model_data = dict(
    model=best_model,
    scaler=scaler,
    feature_names=feature_names,
    model_type=best_model_name,
)

joblib.dump(model_data, 'emergency_prediction_model.pkl')
print("\n‚úÖ Model saved as 'emergency_prediction_model.pkl'")

# Download model (uncomment if needed)
# from google.colab import files
# files.download('emergency_prediction_model.pkl')

## 8. Test Prediction Function

In [None]:
def predict_emergency(temperature, precipitation, humidity, wind_speed, pressure,
                      model_path='emergency_prediction_model.pkl'):
    """Make a prediction for given weather conditions.

    Loads the saved artifact (model, scaler, feature order), builds a single
    input row in the saved feature order, scales it and scores it.

    Args:
        temperature, precipitation, humidity, wind_speed, pressure: Weather
            readings, used for the saved features of the same name.
        model_path: Path of the joblib artifact to load. Defaults to the file
            written by the "Save Best Model" cell.

    Returns:
        dict with ``will_occur`` (bool, the predicted class),
        ``probability`` (float, second column of ``predict_proba``) and
        ``confidence`` (float, the largest class probability).

    Warns:
        RuntimeWarning: when one or more of the supplied readings is not a
            feature of the saved model and therefore has no effect.
    """
    import warnings

    # Load model
    # NOTE: joblib artifacts are pickles; only load files you produced or trust.
    model_data = joblib.load(model_path)
    model = model_data['model']
    scaler = model_data['scaler']
    feature_names = model_data['feature_names']

    # Create input with basic features
    supplied = {
        'temperature': temperature,
        'precipitation': precipitation,
        'humidity': humidity,
        'wind_speed': wind_speed,
        'pressure': pressure
    }

    # A reading whose name is not a model feature is dropped when the row is
    # built. Make that visible instead of silently predicting without it.
    ignored = [name for name in supplied if name not in feature_names]
    if ignored:
        warnings.warn(
            "These inputs are not features of the saved model and were ignored: "
            + ", ".join(ignored),
            RuntimeWarning,
            stacklevel=2,
        )

    # Model features the caller cannot supply are filled with 0.0 (a raw,
    # pre-scaling value).
    row = {feat: supplied.get(feat, 0.0) for feat in feature_names}

    # Create DataFrame with columns in the order the scaler was fitted on
    input_df = pd.DataFrame([row], columns=feature_names)

    # Scale and predict
    input_scaled = scaler.transform(input_df)
    prediction = model.predict(input_scaled)[0]
    probability = model.predict_proba(input_scaled)[0]

    return {
        'will_occur': bool(prediction),
        'probability': float(probability[1]),
        'confidence': float(max(probability))
    }

# Test prediction
print("\n" + "="*50)
print("TEST PREDICTIONS")
print("="*50)

# Each scenario: (headline printed before the result, weather readings to score)
scenarios = [
    # Test case 1: Extreme heat
    ("\nCase 1: Extreme heat (38¬∞C, low humidity)",
     dict(temperature=38, precipitation=0, humidity=20, wind_speed=3, pressure=1010)),
    # Test case 2: Normal conditions
    ("\nCase 2: Normal conditions (20¬∞C, moderate humidity)",
     dict(temperature=20, precipitation=5, humidity=60, wind_speed=4, pressure=1013)),
    # Test case 3: Heavy rain
    ("\nCase 3: Heavy rain (80mm precipitation)",
     dict(temperature=15, precipitation=80, humidity=90, wind_speed=10, pressure=995)),
]

scenario_results = []
for headline, readings in scenarios:
    result = predict_emergency(**readings)
    scenario_results.append(result)
    print(headline)
    print(f"Prediction: {'‚ö†Ô∏è EMERGENCY' if result['will_occur'] else '‚úÖ NO EMERGENCY'}")
    print(f"Probability: {result['probability']:.2%}")
    print(f"Confidence: {result['confidence']:.2%}")

# Keep the per-case result names available to any later cell
result1, result2, result3 = scenario_results

## Summary

✅ **Completed:**
- Trained 3 different models (Random Forest, Gradient Boosting, Logistic Regression)
- Evaluated models on multiple metrics
- Compared model performance
- Analyzed feature importance
- Saved best model for deployment
- Created prediction function

**Next:** Continue to Part C - API Development or Part D - Web Interface