In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sales Call NLP Model Development\n",
    "\n",
    "## Objectives\n",
    "- Data Preparation\n",
    "- Multi-Label Classification\n",
    "- Model Evaluation\n",
    "- Error Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import Libraries\n",
    "# Standard library\n",
    "import pickle\n",
    "import re\n",
    "from pathlib import Path\n",
    "\n",
    "# Data and plotting\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# Machine Learning Libraries\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.multioutput import MultiOutputClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import (\n",
    "    classification_report,\n",
    "    confusion_matrix,\n",
    "    multilabel_confusion_matrix,\n",
    "    precision_recall_curve,\n",
    "    average_precision_score\n",
    ")\n",
    "\n",
    "# Text Preprocessing\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 1. Data Loading and Preprocessing\n",
    "def load_and_preprocess_data(file_path='../data/calls_dataset.csv'):\n",
    "    \"\"\"Load the calls dataset and add a cleaned `processed_text` column.\n",
    "\n",
    "    Args:\n",
    "        file_path: Path of the CSV file to read; it must contain a\n",
    "            `text_snippet` column.\n",
    "\n",
    "    Returns:\n",
    "        The DataFrame read from `file_path` with an added `processed_text`\n",
    "        column: the snippet lowercased, with every character other than\n",
    "        letters and whitespace removed and English stopwords dropped.\n",
    "    \"\"\"\n",
    "    df = pd.read_csv(file_path)\n",
    "\n",
    "    # Build the stopword set once: it does not depend on the row, so\n",
    "    # rebuilding it inside the per-row function is wasted work.\n",
    "    stop_words = set(stopwords.words('english'))\n",
    "\n",
    "    def preprocess_text(text):\n",
    "        \"\"\"Clean a single snippet; missing values become an empty string.\"\"\"\n",
    "        # pandas reads empty cells as NaN (a float), which has no .lower().\n",
    "        if pd.isna(text):\n",
    "            return ''\n",
    "        text = str(text).lower()\n",
    "\n",
    "        # Remove special characters (anything that is not a letter or whitespace)\n",
    "        text = re.sub(r'[^a-zA-Z\\s]', '', text)\n",
    "\n",
    "        # Remove stopwords\n",
    "        return ' '.join(word for word in text.split() if word not in stop_words)\n",
    "\n",
    "    # Apply preprocessing\n",
    "    df['processed_text'] = df['text_snippet'].apply(preprocess_text)\n",
    "\n",
    "    return df\n",
    "\n",
    "# Load Data\n",
    "df = load_and_preprocess_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 2. Multi-Label Preprocessing\n",
    "mlb = MultiLabelBinarizer()\n",
    "\n",
    "# Split the comma-separated label string into a clean list per row.\n",
    "# Whitespace around each label is stripped so 'a, b' and 'a,b' map to the\n",
    "# same classes, empty entries are dropped, and a missing value becomes an\n",
    "# empty label list instead of breaking fit_transform.\n",
    "df['labels_list'] = df['labels'].fillna('').astype(str).apply(\n",
    "    lambda raw: [label.strip() for label in raw.split(',') if label.strip()]\n",
    ")\n",
    "\n",
    "# One indicator column per distinct label\n",
    "y = mlb.fit_transform(df['labels_list'])\n",
    "\n",
    "# Split Data\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    df['processed_text'], y, test_size=0.2, random_state=42\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 3. Feature Vectorization\n",
    "# TF-IDF features, capped at 5000 terms\n",
    "vectorizer = TfidfVectorizer(max_features=5000)\n",
    "\n",
    "# The vectorizer is fitted on the training texts only; the test texts are\n",
    "# transformed with that already-fitted vectorizer.\n",
    "X_train_vectorized, X_test_vectorized = (\n",
    "    vectorizer.fit_transform(X_train),\n",
    "    vectorizer.transform(X_test),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 4. Model Training\n",
    "# One independent logistic-regression model per label column of y_train\n",
    "classifier = MultiOutputClassifier(LogisticRegression(max_iter=1000))\n",
    "classifier.fit(X_train_vectorized, y_train)\n",
    "\n",
    "# Cross-Validation\n",
    "# 5 folds over the training split; no `scoring` argument is passed, so the\n",
    "# estimator's own `score` method produces each fold's value.\n",
    "cv_scores = cross_val_score(\n",
    "    estimator=classifier,\n",
    "    X=X_train_vectorized,\n",
    "    y=y_train,\n",
    "    cv=5,\n",
    ")\n",
    "mean_cv_score = cv_scores.mean()\n",
    "print(\"Cross-Validation Scores:\", cv_scores)\n",
    "print(\"Mean CV Score:\", mean_cv_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 5. Model Evaluation\n",
    "# Predicted label indicators for the held-out split\n",
    "y_pred = classifier.predict(X_test_vectorized)\n",
    "\n",
    "# Classification Report\n",
    "# Rows are named after the classes learned by the label binarizer.\n",
    "report = classification_report(y_test, y_pred, target_names=mlb.classes_)\n",
    "print(\"Classification Report:\", report, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 6. Visualization: Per-Label Confusion Matrices\n",
    "# y_test / y_pred are multi-label indicator matrices, so taking argmax over\n",
    "# the label axis would keep only the first positive label of each sample and\n",
    "# would count samples with no labels as the first class. Instead, build one\n",
    "# 2x2 matrix per label: rows are the true state, columns the predicted state.\n",
    "cm = multilabel_confusion_matrix(y_test, y_pred)\n",
    "\n",
    "n_labels = len(mlb.classes_)\n",
    "n_cols = max(1, min(3, n_labels))\n",
    "n_rows = max(1, int(np.ceil(n_labels / n_cols)))\n",
    "fig, axes = plt.subplots(\n",
    "    n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False\n",
    ")\n",
    "\n",
    "state_names = ['absent', 'present']\n",
    "for ax, label, label_cm in zip(axes.ravel(), mlb.classes_, cm):\n",
    "    sns.heatmap(\n",
    "        label_cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax,\n",
    "        xticklabels=state_names, yticklabels=state_names\n",
    "    )\n",
    "    ax.set_title(f'Confusion Matrix: {label}')\n",
    "    ax.set_ylabel('True Label')\n",
    "    ax.set_xlabel('Predicted Label')\n",
    "\n",
    "# Hide the unused panels of the grid\n",
    "for ax in axes.ravel()[n_labels:]:\n",
    "    ax.axis('off')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 7. Precision-Recall Curve\n",
    "def plot_precision_recall_curve(y_test, y_pred, class_names=None):\n",
    "    \"\"\"Plot one precision-recall curve per label on a shared figure.\n",
    "\n",
    "    Args:\n",
    "        y_test: 2-D binary indicator array, one column per label.\n",
    "        y_pred: 2-D array with the same layout holding a continuous score\n",
    "            per label (for example the predicted probability). Hard 0/1\n",
    "            predictions give only a single threshold, so the curve and the\n",
    "            average precision computed from them are not informative.\n",
    "        class_names: Label names, one per column. Defaults to the classes\n",
    "            of the notebook-level `mlb` binarizer.\n",
    "    \"\"\"\n",
    "    if class_names is None:\n",
    "        class_names = mlb.classes_\n",
    "\n",
    "    plt.figure(figsize=(10, 8))\n",
    "    for i, label in enumerate(class_names):\n",
    "        # Recall is undefined for a label with no positive test examples.\n",
    "        if y_test[:, i].sum() == 0:\n",
    "            continue\n",
    "        precision, recall, _ = precision_recall_curve(y_test[:, i], y_pred[:, i])\n",
    "        avg_precision = average_precision_score(y_test[:, i], y_pred[:, i])\n",
    "        plt.plot(recall, precision, label=f'{label} (AP={avg_precision:.2f})')\n",
    "\n",
    "    plt.xlabel('Recall')\n",
    "    plt.ylabel('Precision')\n",
    "    plt.title('Precision-Recall Curve')\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "# predict_proba yields one array per label; its second column is taken as\n",
    "# the probability that the label is present.\n",
    "y_score = np.column_stack(\n",
    "    [label_proba[:, 1] for label_proba in classifier.predict_proba(X_test_vectorized)]\n",
    ")\n",
    "plot_precision_recall_curve(y_test, y_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 8. Error Analysis\n",
    "def analyze_errors(X_test, y_test, y_pred, mlb, max_samples=5):\n",
    "    \"\"\"Print the first test samples whose predicted labels are not an exact match.\n",
    "\n",
    "    Args:\n",
    "        X_test: pandas object holding the texts, read positionally with\n",
    "            `.iloc` using the row numbers of `y_test`.\n",
    "        y_test: 2-D array of true label indicators.\n",
    "        y_pred: 2-D array of predicted label indicators, same shape as `y_test`.\n",
    "        mlb: Fitted label binarizer used to turn indicator rows back into\n",
    "            label names.\n",
    "        max_samples: Maximum number of misclassified samples to print.\n",
    "    \"\"\"\n",
    "    # A row is an error as soon as any single label differs.\n",
    "    misclassified_indices = np.flatnonzero((y_test != y_pred).any(axis=1))\n",
    "    shown_indices = misclassified_indices[:max_samples]\n",
    "\n",
    "    print(\"Misclassified Samples:\")\n",
    "    if shown_indices.size == 0:\n",
    "        return\n",
    "\n",
    "    # Decode all selected rows in one call instead of once per sample.\n",
    "    true_labels = mlb.inverse_transform(y_test[shown_indices])\n",
    "    predicted_labels = mlb.inverse_transform(y_pred[shown_indices])\n",
    "\n",
    "    for idx, true, predicted in zip(shown_indices, true_labels, predicted_labels):\n",
    "        print(\"\\nText:\", X_test.iloc[idx])\n",
    "        print(\"True Labels:\", true)\n",
    "        print(\"Predicted Labels:\", predicted)\n",
    "\n",
    "analyze_errors(X_test, y_test, y_pred, mlb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# 9. Save Model\n",
    "# The classifier, the fitted vectorizer and the label binarizer are stored\n",
    "# together as one tuple so they can be restored as a matching set.\n",
    "# NOTE: unpickling can execute arbitrary code - only load this file from a\n",
    "# trusted location.\n",
    "model_path = Path('../model/nlp_model.pkl')\n",
    "\n",
    "# Create the target directory first; opening the file for writing fails\n",
    "# when the directory is missing.\n",
    "model_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "with model_path.open('wb') as f:\n",
    "    pickle.dump((classifier, vectorizer, mlb), f)"
   ]
  }
 ],
 "metadata": {
   "kernelspec": {
     "display_name": "Python 3",
     "language": "python",
     "name": "python3"
   }
 }
}