In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TechKey Analysis - Model Training Notebook\n",
    "\n",
    "This notebook demonstrates the machine learning model training process for student risk prediction."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV\n",
    "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
    "from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve\n",
    "from sklearn.pipeline import Pipeline\n",
    "import joblib\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# Put the parent of the current working directory at the front of the import\n",
    "# path so that the `src` package imported by the next cell can be resolved.\n",
    "project_root = os.path.dirname(os.getcwd())\n",
    "sys.path.insert(0, project_root)\n",
    "\n",
    "# Plot appearance: matplotlib's seaborn-v0_8 style sheet plus the husl palette\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette(\"husl\")\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Project helpers for database access and training-data preparation\n",
    "from src.database import get_session\n",
    "from src.models import Student, Grade, Attendance\n",
    "from src.predictor import prepare_training_data\n",
    "\n",
    "# The session stays open for the rest of the notebook; the last cell closes it\n",
    "session = get_session()\n",
    "\n",
    "# Feature matrix, labels and feature names come from the project utility\n",
    "X, y, feature_names = prepare_training_data(session)\n",
    "\n",
    "# Summary figures for the data set (np.bincount assumes y holds\n",
    "# non-negative integer labels - TODO confirm against prepare_training_data)\n",
    "class_counts = np.bincount(y)\n",
    "high_risk_pct = np.mean(y) * 100\n",
    "\n",
    "print(f\"Training data shape: {X.shape}\")\n",
    "print(f\"Number of features: {len(feature_names)}\")\n",
    "print(f\"Feature names: {feature_names}\")\n",
    "print(f\"Class distribution: {class_counts}\")\n",
    "print(f\"Percentage of high-risk students: {high_risk_pct:.1f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create DataFrame for EDA: one column per feature, the numeric label, and a\n",
    "# readable risk level (0 -> 'Low Risk', 1 -> 'High Risk') used to group the plots\n",
    "df_features = pd.DataFrame(X, columns=feature_names)\n",
    "df_features['risk_label'] = y\n",
    "df_features['risk_level'] = df_features['risk_label'].map({0: 'Low Risk', 1: 'High Risk'})\n",
    "\n",
    "# Feature distributions by risk level.\n",
    "# The grid is sized from the number of features (three plots per row) so the\n",
    "# loop below can never index past the available axes, whatever the feature count.\n",
    "n_cols = 3\n",
    "n_rows = max(1, int(np.ceil(len(feature_names) / n_cols)))\n",
    "fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))\n",
    "axes = np.atleast_1d(axes).ravel()\n",
    "\n",
    "for i, feature in enumerate(feature_names):\n",
    "    # Overlay one semi-transparent histogram per risk level on the same axes\n",
    "    for risk_level in ['Low Risk', 'High Risk']:\n",
    "        data = df_features.loc[df_features['risk_level'] == risk_level, feature]\n",
    "        axes[i].hist(data, alpha=0.7, label=risk_level, bins=15)\n",
    "    \n",
    "    axes[i].set_title(feature.replace('_', ' ').title())\n",
    "    axes[i].set_xlabel('Value')\n",
    "    axes[i].set_ylabel('Frequency')\n",
    "    axes[i].legend()\n",
    "\n",
    "# Remove the unused axes left over in the last row of the grid\n",
    "for i in range(len(feature_names), len(axes)):\n",
    "    fig.delaxes(axes[i])\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlations between the features, drawn as an annotated heatmap\n",
    "correlation_matrix = df_features[feature_names].corr()\n",
    "heatmap_options = dict(annot=True, cmap='coolwarm', center=0, square=True, fmt='.2f')\n",
    "\n",
    "plt.figure(figsize=(10, 8))\n",
    "sns.heatmap(correlation_matrix, **heatmap_options)\n",
    "plt.title('Feature Correlation Matrix')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Correlation coefficient of every feature with the target labels\n",
    "target_correlations = {\n",
    "    feature: np.corrcoef(df_features[feature], y)[0, 1]\n",
    "    for feature in feature_names\n",
    "}\n",
    "\n",
    "# Strongest relationships first, judged by absolute value\n",
    "ranked_correlations = sorted(\n",
    "    target_correlations.items(),\n",
    "    key=lambda item: abs(item[1]),\n",
    "    reverse=True,\n",
    ")\n",
    "\n",
    "print(\"Correlation with target variable:\")\n",
    "for feature, corr in ranked_correlations:\n",
    "    print(f\"{feature:20} : {corr:6.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the samples as a test set. The split is stratified on y and\n",
    "# uses a fixed random_state so it is the same on every run.\n",
    "split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)\n",
    "\n",
    "# Report the size and class balance of both partitions\n",
    "print(f\"Training set size: {X_train.shape[0]}\")\n",
    "print(f\"Test set size: {X_test.shape[0]}\")\n",
    "print(f\"Training class distribution: {np.bincount(y_train)}\")\n",
    "print(f\"Test class distribution: {np.bincount(y_test)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Candidate classifiers, all configured with balanced class weights\n",
    "models = {\n",
    "    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'),\n",
    "    'Logistic Regression': LogisticRegression(random_state=42, class_weight='balanced'),\n",
    "    'SVM': SVC(probability=True, random_state=42, class_weight='balanced')\n",
    "}\n",
    "\n",
    "# Candidates that get a StandardScaler step in front of the classifier\n",
    "models_needing_scaling = ('Logistic Regression', 'SVM')\n",
    "\n",
    "# Cross-validate, fit and score every candidate\n",
    "results = {}\n",
    "for name, model in models.items():\n",
    "    # Every candidate is wrapped in a Pipeline whose final step is named\n",
    "    # 'classifier'; later cells rely on that step name.\n",
    "    steps = [('classifier', model)]\n",
    "    if name in models_needing_scaling:\n",
    "        steps.insert(0, ('scaler', StandardScaler()))\n",
    "    pipeline = Pipeline(steps)\n",
    "    \n",
    "    # 5-fold cross-validated accuracy on the training split\n",
    "    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')\n",
    "    \n",
    "    # Fit on the full training split, then predict the held-out test split\n",
    "    pipeline.fit(X_train, y_train)\n",
    "    y_pred = pipeline.predict(X_test)\n",
    "    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]\n",
    "    \n",
    "    # Keep the fitted pipeline together with its metrics and predictions\n",
    "    results[name] = dict(\n",
    "        model=pipeline,\n",
    "        cv_mean=cv_scores.mean(),\n",
    "        cv_std=cv_scores.std(),\n",
    "        test_accuracy=(y_pred == y_test).mean(),\n",
    "        roc_auc=roc_auc_score(y_test, y_pred_proba),\n",
    "        y_pred=y_pred,\n",
    "        y_pred_proba=y_pred_proba,\n",
    "    )\n",
    "    \n",
    "    print(f\"{name:20} | CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})\")\n",
    "    print(f\"{name:20} | Test Accuracy: {results[name]['test_accuracy']:.3f}\")\n",
    "    print(f\"{name:20} | ROC AUC: {results[name]['roc_auc']:.3f}\")\n",
    "    print(\"-\" * 50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare model performance: one row per evaluated model, built from the\n",
    "# metrics stored in `results`\n",
    "comparison_df = pd.DataFrame({\n",
    "    'Model': list(results.keys()),\n",
    "    'CV Accuracy': [results[name]['cv_mean'] for name in results],\n",
    "    'Test Accuracy': [results[name]['test_accuracy'] for name in results],\n",
    "    'ROC AUC': [results[name]['roc_auc'] for name in results]\n",
    "})\n",
    "\n",
    "print(\"=== Model Comparison ===\")\n",
    "print(comparison_df.round(3))\n",
    "\n",
    "# Visual comparison: accuracy on the left, ROC AUC on the right\n",
    "fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
    "\n",
    "# Accuracy comparison: two bars per model, shifted half a bar width to either\n",
    "# side of the tick so the pair is centred on it\n",
    "x_pos = np.arange(len(results))\n",
    "width = 0.35\n",
    "\n",
    "axes[0].bar(x_pos - width/2, comparison_df['CV Accuracy'], width, label='CV Accuracy', alpha=0.7)\n",
    "axes[0].bar(x_pos + width/2, comparison_df['Test Accuracy'], width, label='Test Accuracy', alpha=0.7)\n",
    "axes[0].set_xlabel('Model')\n",
    "axes[0].set_ylabel('Accuracy')\n",
    "axes[0].set_title('Model Accuracy Comparison')\n",
    "axes[0].set_xticks(x_pos)\n",
    "axes[0].set_xticklabels(comparison_df['Model'])\n",
    "axes[0].legend()\n",
    "\n",
    "# ROC AUC comparison: one bar per model on a fixed 0-1 axis\n",
    "axes[1].bar(comparison_df['Model'], comparison_df['ROC AUC'], color='orange', alpha=0.7)\n",
    "axes[1].set_xlabel('Model')\n",
    "axes[1].set_ylabel('ROC AUC Score')\n",
    "axes[1].set_title('Model ROC AUC Comparison')\n",
    "axes[1].set_ylim(0, 1)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overlay the test-set ROC curve of every evaluated model on a single axes\n",
    "roc_fig, roc_ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "for name, result in results.items():\n",
    "    fpr, tpr, _ = roc_curve(y_test, result['y_pred_proba'])\n",
    "    roc_ax.plot(fpr, tpr, label=f'{name} (AUC = {result[\"roc_auc\"]:.3f})', linewidth=2)\n",
    "\n",
    "# Dashed diagonal as the reference line for a random classifier\n",
    "roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')\n",
    "roc_ax.set_xlabel('False Positive Rate')\n",
    "roc_ax.set_ylabel('True Positive Rate')\n",
    "roc_ax.set_title('ROC Curves - Model Comparison')\n",
    "roc_ax.legend()\n",
    "roc_ax.grid(True, alpha=0.3)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select best model.\n",
    "# The choice is made on 5-fold cross-validated ROC AUC computed from the\n",
    "# training split only, so the test split plays no part in model selection and\n",
    "# the test metrics reported for the chosen model are not biased by that choice.\n",
    "# cross_val_score clones each pipeline, so the fitted ones in `results` are untouched.\n",
    "cv_roc_auc = {\n",
    "    name: cross_val_score(result['model'], X_train, y_train, cv=5, scoring='roc_auc').mean()\n",
    "    for name, result in results.items()\n",
    "}\n",
    "best_model_name = max(cv_roc_auc, key=cv_roc_auc.get)\n",
    "best_model = results[best_model_name]['model']\n",
    "\n",
    "print(f\"Best model: {best_model_name}\")\n",
    "print(f\"CV ROC AUC: {cv_roc_auc[best_model_name]:.3f}\")\n",
    "print(f\"ROC AUC: {results[best_model_name]['roc_auc']:.3f}\")\n",
    "print(f\"Test Accuracy: {results[best_model_name]['test_accuracy']:.3f}\")\n",
    "\n",
    "# Detailed classification report for best model (test-set predictions)\n",
    "y_pred_best = results[best_model_name]['y_pred']\n",
    "print(\"\\n=== Classification Report ===\")\n",
    "print(classification_report(y_test, y_pred_best, target_names=['Low Risk', 'High Risk']))\n",
    "\n",
    "# Confusion matrix: rows are actual classes, columns are predicted classes\n",
    "cm = confusion_matrix(y_test, y_pred_best)\n",
    "plt.figure(figsize=(8, 6))\n",
    "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', \n",
    "            xticklabels=['Low Risk', 'High Risk'],\n",
    "            yticklabels=['Low Risk', 'High Risk'])\n",
    "plt.title(f'Confusion Matrix - {best_model_name}')\n",
    "plt.xlabel('Predicted')\n",
    "plt.ylabel('Actual')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only classifiers exposing `feature_importances_` are handled here;\n",
    "# for any other classifier this cell does nothing.\n",
    "classifier = best_model.named_steps['classifier']\n",
    "if hasattr(classifier, 'feature_importances_'):\n",
    "    importances = classifier.feature_importances_\n",
    "    \n",
    "    # Pair each feature with its importance, most important first\n",
    "    feature_importance_df = pd.DataFrame(\n",
    "        {'feature': feature_names, 'importance': importances}\n",
    "    ).sort_values('importance', ascending=False)\n",
    "    \n",
    "    # Horizontal bar chart of the ranked importances\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    sns.barplot(data=feature_importance_df, x='importance', y='feature', palette='viridis')\n",
    "    plt.title(f'Feature Importance - {best_model_name}')\n",
    "    plt.xlabel('Importance')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    print(\"=== Feature Importance ===\")\n",
    "    print(feature_importance_df.round(4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hyperparameter tuning for the best model.\n",
    "# Search spaces are keyed by model name; the `classifier__` prefix routes each\n",
    "# parameter to the pipeline step named 'classifier'.\n",
    "param_grids = {\n",
    "    'Random Forest': {\n",
    "        'classifier__n_estimators': [50, 100, 200],\n",
    "        'classifier__max_depth': [5, 10, 15, None],\n",
    "        'classifier__min_samples_split': [2, 5, 10],\n",
    "        'classifier__min_samples_leaf': [1, 2, 4]\n",
    "    },\n",
    "    'Logistic Regression': {\n",
    "        'classifier__C': [0.1, 1, 10, 100],\n",
    "        'classifier__penalty': ['l1', 'l2'],\n",
    "        'classifier__solver': ['liblinear']\n",
    "    },\n",
    "    'SVM': {\n",
    "        'classifier__C': [0.1, 1, 10, 100],\n",
    "        'classifier__kernel': ['linear', 'rbf']\n",
    "    }\n",
    "}\n",
    "\n",
    "# Fail with a clear message instead of running the search with an undefined\n",
    "# grid, or with one left in the kernel by an earlier run.\n",
    "if best_model_name not in param_grids:\n",
    "    raise ValueError(f'No hyperparameter grid defined for model {best_model_name!r}')\n",
    "param_grid = param_grids[best_model_name]\n",
    "\n",
    "# Perform grid search: 5-fold CV on the training split, scored by ROC AUC,\n",
    "# using all available cores\n",
    "print(f\"Performing hyperparameter tuning for {best_model_name}...\")\n",
    "grid_search = GridSearchCV(\n",
    "    best_model, \n",
    "    param_grid, \n",
    "    cv=5, \n",
    "    scoring='roc_auc',\n",
    "    n_jobs=-1,\n",
    "    verbose=1\n",
    ")\n",
    "\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "print(f\"Best parameters: {grid_search.best_params_}\")\n",
    "print(f\"Best cross-validation score: {grid_search.best_score_:.3f}\")\n",
    "\n",
    "# Update best model with tuned parameters\n",
    "best_model_tuned = grid_search.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the tuned model on the held-out test split\n",
    "y_pred_tuned = best_model_tuned.predict(X_test)\n",
    "y_pred_proba_tuned = best_model_tuned.predict_proba(X_test)[:, 1]\n",
    "\n",
    "final_accuracy = (y_pred_tuned == y_test).mean()\n",
    "final_roc_auc = roc_auc_score(y_test, y_pred_proba_tuned)\n",
    "\n",
    "# Majority-class baseline: the accuracy obtained by always predicting the more\n",
    "# frequent of the two classes in the test split\n",
    "positive_rate = np.mean(y_test)\n",
    "baseline_accuracy = max(positive_rate, 1 - positive_rate)\n",
    "\n",
    "print(\"=== Final Model Performance ===\")\n",
    "print(f\"Accuracy: {final_accuracy:.3f}\")\n",
    "print(f\"ROC AUC: {final_roc_auc:.3f}\")\n",
    "print(\"\\n=== Classification Report ===\")\n",
    "print(classification_report(y_test, y_pred_tuned, target_names=['Low Risk', 'High Risk']))\n",
    "\n",
    "# Compare with baseline\n",
    "print(f\"\\nBaseline accuracy (majority class): {baseline_accuracy:.3f}\")\n",
    "print(f\"Improvement over baseline: {final_accuracy - baseline_accuracy:.3f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the trained model.\n",
    "# Create the output directory first so joblib.dump does not fail when it is missing.\n",
    "output_dir = '../data'\n",
    "os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "model_filename = f'../data/trained_model_{best_model_name.lower().replace(\" \", \"_\")}.pkl'\n",
    "joblib.dump(best_model_tuned, model_filename)\n",
    "\n",
    "# Feature importances are read from the tuned classifier that is being saved,\n",
    "# so the stored values always describe the stored model. Classifiers without\n",
    "# `feature_importances_` store None.\n",
    "tuned_classifier = best_model_tuned.named_steps['classifier']\n",
    "if hasattr(tuned_classifier, 'feature_importances_'):\n",
    "    feature_importance_records = (\n",
    "        pd.DataFrame({\n",
    "            'feature': feature_names,\n",
    "            'importance': tuned_classifier.feature_importances_\n",
    "        })\n",
    "        .sort_values('importance', ascending=False)\n",
    "        .to_dict('records')\n",
    "    )\n",
    "else:\n",
    "    feature_importance_records = None\n",
    "\n",
    "# Save feature names and model info alongside the model\n",
    "model_info = {\n",
    "    'feature_names': feature_names,\n",
    "    'model_name': best_model_name,\n",
    "    'accuracy': final_accuracy,\n",
    "    'roc_auc': final_roc_auc,\n",
    "    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),\n",
    "    'feature_importance': feature_importance_records\n",
    "}\n",
    "\n",
    "model_info_filename = '../data/model_info.pkl'\n",
    "joblib.dump(model_info, model_info_filename)\n",
    "\n",
    "print(f\"Model saved to: {model_filename}\")\n",
    "print(f\"Model info saved to: {model_info_filename}\")\n",
    "\n",
    "# Close database session\n",
    "session.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Training Summary\n",
    "\n",
    "### Best Model: [Model Name]\n",
    "- **ROC AUC**: [Value]\n",
    "- **Accuracy**: [Value]\n",
    "- **Improvement over baseline**: [Value]\n",
    "\n",
    "### Key Features (by importance):\n",
    "1. [Most important feature]\n",
    "2. [Second most important feature]\n",
    "3. [Third most important feature]\n",
    "\n",
    "### Model Insights:\n",
    "- The model shows good discrimination between high-risk and low-risk students\n",
    "- [Feature] is the strongest predictor of student risk\n",
    "- The model achieves [percentage]% accuracy on the test set\n",
    "\n",
    "### Deployment Recommendations:\n",
    "1. Use probability threshold of [value] for high-risk classification\n",
    "2. Monitor model performance monthly\n",
    "3. Retrain model with new data every [time period]\n",
    "4. Focus interventions on students with prediction probability > [threshold]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}