In [None]:
# 1. Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import joblib
import os

In [None]:
# 2. Data Preprocessing
# NLTK ships without its corpora and tokenizer models, so on a fresh
# environment the first use raises LookupError. Probe every resource that
# preprocess_text relies on and download only when a probe fails; when the
# data is already installed this cell behaves exactly as before.
lemmatizer = WordNetLemmatizer()
try:
    stop_words = set(stopwords.words("english"))
    lemmatizer.lemmatize("probes")      # exercises the WordNet corpus
    nltk.word_tokenize("probe text")    # exercises the Punkt tokenizer models
except LookupError:
    # "omw-1.4" and "punkt_tab" are only required by some NLTK releases;
    # an id unknown to the downloader is reported by it, not raised.
    for _resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
        nltk.download(_resource, quiet=True)
    stop_words = set(stopwords.words("english"))

In [None]:
def preprocess_text(text):
    """Normalize one prompt into a space-joined string of lemmatized tokens.

    The text is lowercased, stripped of ASCII punctuation, tokenized with
    NLTK, filtered against the English stop-word set and lemmatized.

    Args:
        text: Raw prompt. ``None`` or a float NaN (what pandas produces for
            an empty CSV cell) yields ``""``; any other non-string value is
            converted with ``str()`` before processing.

    Returns:
        str: The remaining lemmatized tokens joined by single spaces
        (empty when nothing survives the filtering).
    """
    if not isinstance(text, str):
        # NaN is the only float that compares unequal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

In [None]:
# Read the prompt dataset and attach the cleaned text used for feature
# extraction as a new "processed_prompt" column.
df = pd.read_csv("expanded_prompts.csv").assign(
    processed_prompt=lambda frame: frame["prompt"].map(preprocess_text)
)

In [None]:
# 3. Encode labels
# One encoder per label column; each is fitted on its column and the integer
# codes are stored next to it as "<column>_encoded". The fitted encoders are
# kept so predictions can be mapped back to the original label values.
le_cluster, le_sub_class = LabelEncoder(), LabelEncoder()
for source_col, encoder in (("cluster", le_cluster), ("sub_class", le_sub_class)):
    df[f"{source_col}_encoded"] = encoder.fit_transform(df[source_col])

In [None]:
# 4. TF-IDF feature extraction
# Target for the first-stage (cluster) classifier.
y_cluster = df["cluster_encoded"]

# Unigrams + bigrams; terms must appear in at least 2 documents and in at
# most 90% of them; the vocabulary is capped at 10,000 features.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so held-out rows contribute to the vocabulary and IDF weights.
tfidf_options = {
    "max_features": 10000,
    "ngram_range": (1, 2),
    "min_df": 2,
    "max_df": 0.9,
}
vectorizer = TfidfVectorizer(**tfidf_options)
X = vectorizer.fit_transform(df["processed_prompt"])

In [None]:
# 5. Split data for cluster prediction
# Hold out 20% of the rows, stratified on the cluster label so both parts
# keep the same class proportions; the seed makes the split repeatable.
cluster_split = train_test_split(
    X,
    y_cluster,
    test_size=0.2,
    random_state=42,
    stratify=y_cluster,
)
X_train, X_test, y_train, y_test = cluster_split

In [None]:
# 6. Train cluster classifier (SVM optimized)
# `gamma` only affects the RBF kernel, so it is searched for 'rbf' alone.
# Crossing it with 'linear' refits the same linear model once per gamma value.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
grid_svm = GridSearchCV(
    # random_state seeds the data shuffling that probability=True performs
    # internally, using the same seed as the rest of the notebook.
    SVC(class_weight='balanced', probability=True, random_state=42),
    param_grid_svm,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_svm.fit(X_train, y_train)
best_cluster_model = grid_svm.best_estimator_

# Report the search outcome and score the held-out split, which was otherwise
# created but never used.
print(f"Best SVM params: {grid_svm.best_params_} (CV accuracy: {grid_svm.best_score_:.3f})")
print(f"Held-out cluster accuracy: {best_cluster_model.score(X_test, y_test):.3f}")

In [None]:
# Save cluster model
# The classifiers only accept TF-IDF vectors and emit integer codes, so the
# fitted vectorizer and both label encoders are persisted as well; without
# them the pickled models cannot be used once this kernel is gone.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le_cluster, "label_encoder_cluster.pkl")
joblib.dump(le_sub_class, "label_encoder_sub_class.pkl")
# Kept last so the cell still displays the cluster model's output path.
joblib.dump(best_cluster_model, "cluster_model.pkl")

In [None]:
# 7. Train sub_class models per cluster
# exist_ok=True makes directory creation idempotent in a single call, with no
# separate existence check that could race or silently accept a regular file
# sitting at this path.
os.makedirs("subclass_models", exist_ok=True)

In [None]:
# Train one sub_class classifier per cluster and save it under
# subclass_models/subclass_model_<encoded cluster>.pkl.
for cluster_label in df["cluster_encoded"].unique():
    subset = df[df["cluster_encoded"] == cluster_label]
    if subset["sub_class_encoded"].nunique() < 2:
        continue  # skip clusters without at least 2 subclasses
    X_sub = vectorizer.transform(subset["processed_prompt"])
    y_sub = subset["sub_class_encoded"]
    model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
    try:
        X_train_sub, X_test_sub, y_train_sub, y_test_sub = train_test_split(
            X_sub, y_sub, test_size=0.2, random_state=42, stratify=y_sub
        )
    except ValueError as split_error:
        # A stratified split is impossible when some sub_class has a single
        # row in this cluster, or when the 20% test share holds fewer rows
        # than there are sub_classes. Instead of aborting the whole loop,
        # train on every row of the cluster and say so.
        print(
            f"Cluster {cluster_label}: no hold-out split ({split_error}); "
            f"training on all {len(y_sub)} rows"
        )
        model.fit(X_sub, y_sub)
    else:
        model.fit(X_train_sub, y_train_sub)
        # Score the held-out rows, which were previously split off and discarded.
        held_out_accuracy = model.score(X_test_sub, y_test_sub)
        print(f"Cluster {cluster_label}: held-out sub_class accuracy {held_out_accuracy:.3f}")
    joblib.dump(model, f"subclass_models/subclass_model_{cluster_label}.pkl")

In [None]:
# 8. Prediction function with hierarchical classification
def predict_cluster_and_subclass(prompt):
    """Predict a prompt's cluster, then its sub_class within that cluster.

    The whole function lives in one cell: when the body is split across
    cells, the definition ends after the cluster lookup (returning ``None``)
    and the remaining indented fragments are not valid Python on their own.

    Args:
        prompt: Raw prompt text; it is cleaned with ``preprocess_text`` and
            vectorized with the fitted ``vectorizer`` before prediction.

    Returns:
        tuple: ``(cluster_label, sub_class_label)`` decoded through
        ``le_cluster`` / ``le_sub_class``. ``sub_class_label`` is the string
        ``"No subclass model"`` when no model file exists for the predicted
        cluster (the training loop skips clusters with fewer than two
        sub_classes).
    """
    processed_prompt = preprocess_text(prompt)
    X_new = vectorizer.transform([processed_prompt])

    # NOTE: joblib.load unpickles the file, so only load model files that
    # were written by this notebook or another trusted source.
    cluster_model = joblib.load("cluster_model.pkl")
    predicted_cluster = cluster_model.predict(X_new)
    cluster_label = le_cluster.inverse_transform(predicted_cluster)[0]

    # Second stage: the sub_class model is keyed by the *encoded* cluster id,
    # matching the file names written by the training loop.
    subclass_model_path = f"subclass_models/subclass_model_{predicted_cluster[0]}.pkl"
    if os.path.exists(subclass_model_path):
        sub_model = joblib.load(subclass_model_path)
        predicted_sub_class = sub_model.predict(X_new)
        sub_class_label = le_sub_class.inverse_transform(predicted_sub_class)[0]
    else:
        sub_class_label = "No subclass model"

    return cluster_label, sub_class_label

In [None]:
# 9. Test Prediction
# Smoke-test the two-stage predictor on one example prompt and print the
# resulting (cluster, sub_class) pair.
sample_prompt = "How to use OpenAI's API within Streamlit?"
print(predict_cluster_and_subclass(sample_prompt))