<a href="https://colab.research.google.com/github/lakshmi-29/ai-document-summarizer/blob/main/SupportTicketTriagePipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install the main ML libraries (often pre-installed, but good practice)
!pip install numpy pandas scikit-learn

# 2. Install the LLM-specific libraries (these are less likely to be pre-installed)
!pip install scikit-llm openai

# Output should show "Successfully installed..."

Collecting scikit-llm
  Downloading scikit_llm-1.4.2-py3-none-any.whl.metadata (4.3 kB)
Downloading scikit_llm-1.4.2-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-llm
Successfully installed scikit-llm-1.4.2


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
# --- Corrected Imports for Scikit-LLM ---

# Correct path for ZeroShotGPTClassifier (Classification)
from skllm.models.gpt.classification.zero_shot import ZeroShotGPTClassifier

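# NOTE: no import for GPTGeneration appears here (or anywhere else in this notebook),
# so the GPTGeneration(...) call in section 6 refers to an undefined name.

# Remaining scikit-learn and standard-library imports (the two sklearn lines repeat imports made above):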
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
import json

# --- SET YOUR API KEY ---
# Export OPENAI_API_KEY before starting the notebook (or use your platform's secret store)
# instead of pasting a real key into this cell: notebooks are routinely shared and committed.
# setdefault keeps a key that is already present in the environment and only falls back to
# the placeholder when none is set, so a real key is never overwritten.
# scikit-llm does not pick the key up from OPENAI_API_KEY by itself; it has to be registered
# through SKLLMConfig, which is done here with whatever value ended up in the environment.
from skllm.config import SKLLMConfig

os.environ.setdefault("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY_HERE")
SKLLMConfig.set_openai_key(os.environ["OPENAI_API_KEY"])

In [None]:
# --- 1. SIMULATE DATA ---
# Six hand-written tickets (two per priority level), each with a routing department and a
# made-up resolution time that serves as the regression target.
data = {
    'Ticket_Description': [
        "My account is locked and I cannot access my billing info. This is urgent!",
        "The website is slow, but I can still check out. Just a minor bug report.",
        "Need to reset my password; tried the link but it failed. Please route to tech.",
        "Question about the new enterprise pricing structure. Talk to Sales.",
        "The database is completely down after the last update. High priority system crash!",
        "Small visual glitch on the footer of the page. Low priority."
    ],
    'Priority': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low'],
    'Department': ['Billing', 'Technical', 'Technical', 'Sales', 'Technical', 'Technical'],
    # Simulate a continuous variable for regression (Time_to_Resolution_Days)
    'Resolution_Days': [1.5, 5.0, 3.2, 2.5, 0.8, 6.0]
}
df = pd.DataFrame(data)
print("--- Initial Data Sample ---")
print(df.head())
print("-" * 40)

# --- 2. TRAIN/TEST SPLIT ---
# We split the data to ensure model evaluation is on unseen data.
# The split is stratified on Priority: with only two tickets per level, a purely random
# 50/50 split can leave a level out of the training half, and a classifier can never
# predict a label it has not been trained on.
X_train, X_test, y_priority_train, y_priority_test, y_dept_train, y_dept_test, y_res_train, y_res_test = train_test_split(
    df['Ticket_Description'],
    df['Priority'],
    df['Department'],
    df['Resolution_Days'],
    test_size=0.5, # Small size for demo
    random_state=42,
    stratify=df['Priority']
)

# Sanity checks: no ticket lost, and every priority level is available for training.
assert len(X_train) + len(X_test) == len(df)
assert set(y_priority_train) == set(df['Priority']), "a priority level is missing from the training split"
print(f"Train samples: {len(X_train)}, Test samples: {len(X_test)}")

--- Initial Data Sample ---
                                  Ticket_Description Priority Department  \
0  My account is locked and I cannot access my bi...     High    Billing   
1  The website is slow, but I can still check out...      Low  Technical   
2  Need to reset my password; tried the link but ...   Medium  Technical   
3  Question about the new enterprise pricing stru...   Medium      Sales   
4  The database is completely down after the last...     High  Technical   

   Resolution_Days  
0              1.5  
1              5.0  
2              3.2  
3              2.5  
4              0.8  
----------------------------------------
Train samples: 3, Test samples: 3


In [None]:
# --- 3. TF-IDF VECTORIZATION ---
# The vocabulary and IDF weights are learned from the training tickets only; both splits are
# then projected into that same feature space. English stop words are dropped and at most
# 1000 terms are kept. The results are SciPy sparse matrices of TF-IDF weights.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
)
vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# --- 4. PRIORITY CLASSIFICATION (Scikit-learn) ---
# Logistic Regression is a good baseline for text classification.
# Priority has three levels, so the 'lbfgs' solver (scikit-learn's default) is used: it fits
# one multinomial model over all classes. 'liblinear' can only handle more than two classes
# through a one-vs-rest scheme, which newer scikit-learn releases warn about as deprecated.
priority_model = LogisticRegression(solver='lbfgs', random_state=42)
priority_model.fit(X_train_vectorized, y_priority_train)

# Predict and Evaluate
y_priority_pred = priority_model.predict(X_test_vectorized)
print("\n--- PRIORITY CLASSIFICATION REPORT (Scikit-learn) ---")
# zero_division=0 reports 0.0 for metrics whose denominator is zero instead of emitting a warning.
print(classification_report(y_priority_test, y_priority_pred, zero_division=0))


--- PRIORITY CLASSIFICATION REPORT (Scikit-learn) ---
              precision    recall  f1-score   support

        High       0.00      0.00      0.00       1.0
         Low       0.00      0.00      0.00       2.0
      Medium       0.00      0.00      0.00       0.0

    accuracy                           0.00       3.0
   macro avg       0.00      0.00      0.00       3.0
weighted avg       0.00      0.00      0.00       3.0



In [None]:
# --- 5. RESOLUTION TIME REGRESSION (Scikit-learn) ---
# Random Forest Regressor is robust and handles non-linear relationships
res_time_model = RandomForestRegressor(n_estimators=10, random_state=42)
res_time_model.fit(X_train_vectorized, y_res_train)

# Predict and Evaluate
y_res_pred = res_time_model.predict(X_test_vectorized)

# RMSE is the square root of the mean squared error. It is computed from mean_squared_error
# (already imported in the imports cell) so that this cell needs no import of its own and
# does not depend on root_mean_squared_error, which only exists in scikit-learn >= 1.4.
rmse = float(np.sqrt(mean_squared_error(y_res_test, y_res_pred)))

print("\n--- RESOLUTION TIME PREDICTION (Scikit-learn) ---")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f} days")


--- RESOLUTION TIME PREDICTION (Scikit-learn) ---
Root Mean Squared Error (RMSE): 2.71 days


In [None]:
# --- 6. LLM FILTERING AND EXTRACTION ---

def _parse_llm_json(raw_output):
    """Convert one LLM reply into a dict of extracted fields.

    The reply is expected to contain a single JSON object, but chat models frequently
    surround it with prose or a Markdown code fence, so only the text between the first
    '{' and the last '}' is handed to the JSON parser.

    Returns:
        The decoded object when the reply holds a valid JSON object; otherwise a record
        with the keys 'Error' and 'Raw_Output' so the failure stays visible downstream.
    """
    text = str(raw_output)
    start, end = text.find('{'), text.rfind('}')
    candidate = text[start:end + 1] if 0 <= start < end else text
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        parsed = None
    # Valid JSON that is not an object (a list, a bare string, ...) is also unusable here.
    if isinstance(parsed, dict):
        return parsed
    return {'Error': 'JSON Decode Failed', 'Raw_Output': raw_output}


# 6.1. Filtering: Select tickets predicted as 'High' priority
test_df = pd.DataFrame({
    'Ticket_Description': X_test,
    'Predicted_Priority': y_priority_pred,
    'Predicted_Resolution': y_res_pred
}).reset_index(drop=True)

# Filter for tickets the classical model flagged as high priority
critical_tickets = test_df[test_df['Predicted_Priority'] == 'High'].copy()
print(f"\n--- Critical Tickets Filtered for LLM: {len(critical_tickets)} ---")

if not critical_tickets.empty:
    # 6.2. LLM Extraction Setup (GPTGeneration)
    # This model is used for text-to-text generation and structured output
    # NOTE(review): GPTGeneration is not imported by any cell of this notebook, so this
    # line raises NameError as soon as at least one ticket is predicted 'High'. Import or
    # define a text-generation wrapper exposing predict(list_of_prompts) before relying
    # on this branch.
    extractor = GPTGeneration(
        model="gpt-3.5-turbo",  # Use a suitable model
        system_prompt="You are an AI analyst. Extract structured details from the support ticket text. The output MUST be a valid JSON object."
    )

    # The extraction prompt defines the desired JSON structure
    # (doubled braces are literal braces for str.format; {text} is the only placeholder)
    extraction_prompt = """
    Analyze the following support ticket: '{text}'.
    Extract the following details as a single JSON object:
    {{
        "User_Goal": "What the user is trying to achieve (2-3 words)",
        "Root_Issue": "The technical or billing issue described",
        "Urgency_Reason": "The specific reason why the ticket is high priority"
    }}
    """

    # Run LLM extraction on the critical tickets
    extraction_results = []

    # Use a loop for demonstration (for a real project, use `extractor.predict(list_of_texts)`)
    for ticket in critical_tickets['Ticket_Description']:

        # Format the prompt with the current ticket text
        prompt = extraction_prompt.format(text=ticket)

        # Get LLM prediction
        llm_output = extractor.predict([prompt])[0]

        # Malformed replies become an error record instead of aborting the loop
        extraction_results.append(_parse_llm_json(llm_output))

    # Combine results (both frames carry a fresh 0..n-1 index, so rows line up by position)
    llm_df = pd.DataFrame(extraction_results)
    final_output = pd.concat([critical_tickets.reset_index(drop=True), llm_df], axis=1)

    print("\n--- FINAL ACTIONABLE INSIGHTS (LLM Structured Output) ---")
    report_columns = ['Ticket_Description', 'Predicted_Priority', 'User_Goal', 'Root_Issue', 'Urgency_Reason']
    # reindex (instead of [] selection) keeps the report printable when the LLM omitted a
    # field or every reply failed to parse: absent columns are shown as NaN rather than
    # raising KeyError.
    print(final_output.reindex(columns=report_columns).to_markdown(index=False))

else:
    print("No critical tickets found by the classical model for LLM analysis.")


--- Critical Tickets Filtered for LLM: 0 ---
No critical tickets found by the classical model for LLM analysis.
