In [None]:
# 1. Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import joblib

In [None]:
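# One-time setup: uncomment and run these once per environment to fetch the
# NLTK resources the preprocessing cells rely on -- "punkt" for
# nltk.word_tokenize, "stopwords" for stopwords.words("english"), and
# "wordnet" for WordNetLemmatizer. If a later cell raises LookupError, the
# message names the resource that still has to be downloaded.
# nltk.download("punkt")
# nltk.download("stopwords")
# nltk.download("wordnet")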

In [None]:
# 2. Load data
df = pd.read_csv("expanded_prompts.csv")

# Fail fast with a readable message if the CSV lacks a column the later cells
# index into: "prompt" (text), "cluster" and "sub_class" (labels).
_missing_columns = {"prompt", "cluster", "sub_class"} - set(df.columns)
if _missing_columns:
    raise ValueError(
        f"expanded_prompts.csv is missing required column(s): {sorted(_missing_columns)}"
    )

In [None]:
# 3. Preprocessing
# Shared text-normalisation resources, created once and reused for every prompt.
stop_words = set(stopwords.words("english"))  # held as a set for membership tests
lemmatizer = WordNetLemmatizer()

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Normalise a raw prompt into a space-joined string of lemmatised tokens.

    Steps: lowercase, strip ASCII punctuation, tokenise with NLTK, drop English
    stop words, lemmatise the remaining tokens.

    Args:
        text: The raw prompt. ``None`` and float NaN (e.g. an empty CSV cell
            loaded by pandas) are treated as an empty prompt; any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The processed tokens joined by single spaces, or ``""`` when the
        input is missing.
    """
    # Missing values would otherwise fail on .lower(); NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    tokens = nltk.word_tokenize(cleaned)
    # Stop words are filtered before lemmatisation, i.e. on the surface form.
    kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(kept)

In [None]:
# Store a cleaned copy of every prompt next to the raw text; the original
# "prompt" column is left untouched.
df["processed_prompt"] = df["prompt"].map(preprocess_text)

In [None]:
# 4. Encoding labels
# One encoder per target column so each prediction can be mapped back to its
# original label independently.
le_cluster, le_sub_class = LabelEncoder(), LabelEncoder()
df["cluster_encoded"] = le_cluster.fit_transform(df["cluster"])
df["sub_class_encoded"] = le_sub_class.fit_transform(df["sub_class"])

In [None]:
# 5. Feature extraction with optimized TF-IDF
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so its vocabulary and IDF weights also reflect the held-out rows.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=10000,   # upper bound on vocabulary size
    min_df=2,             # drop terms seen in fewer than 2 prompts
    max_df=0.9,           # drop terms seen in more than 90% of prompts
)
X = vectorizer.fit_transform(df["processed_prompt"])

# Integer-encoded targets for the two classification tasks.
y_cluster, y_sub_class = df["cluster_encoded"], df["sub_class_encoded"]

In [None]:
# 6. Split data
# Hold out 20% of the rows for the cluster task, stratified on the cluster
# label and seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_cluster,
    test_size=0.2,
    stratify=y_cluster,
    random_state=42,
)

In [None]:
# 7. Model definitions
# Candidate classifiers for the cluster task, keyed by the display name used
# when results are printed. Insertion order is the evaluation order.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, class_weight='balanced')
# Placeholder: the evaluation loop swaps this entry for the grid-searched SVC.
models["SVM"] = SVC(class_weight='balanced', probability=True)
models["Naive Bayes"] = MultinomialNB()
models["Random Forest"] = RandomForestClassifier(
    n_estimators=200, class_weight='balanced', random_state=42
)
models["XGBoost"] = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [None]:
# 8. Hyperparameter tuning for SVM
# The linear kernel ignores `gamma`, so gamma is searched only for the RBF
# kernel. A single flat grid would fit every linear candidate twice (once per
# gamma value) with identical results; splitting the grid avoids those
# duplicate fits, which are costly because probability=True adds an internal
# cross-validation to each one.
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]
grid_svm = GridSearchCV(
    SVC(class_weight='balanced', probability=True),
    param_grid_svm,
    cv=5,                # 5-fold cross-validation on the training split
    scoring='accuracy',
    n_jobs=-1,           # use all available cores
)
grid_svm.fit(X_train, y_train)

In [None]:
# 9. Evaluate models
# Trackers for the top-scoring classifier and its accuracy; both are updated
# by the evaluation loop in the next cell.
best_model, best_acc = None, 0

In [None]:
# Score every candidate on the held-out cluster split and remember the one
# with the highest accuracy.
for name, model in models.items():
    if name == "SVM":
        # GridSearchCV (refit=True by default) has already refitted its best
        # estimator on X_train, so it is used as-is instead of being trained
        # a second time.
        model = grid_svm.best_estimator_
    else:
        model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"{name} - Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    # The `is None` check guarantees a model is always selected, even if every
    # candidate scores 0.0 (which `acc > best_acc` alone would never accept).
    if best_model is None or acc > best_acc:
        best_acc = acc
        best_model = model

In [None]:
# 10. Save the best cluster model
# Refuse to pickle a placeholder: best_model stays None until the evaluation
# loop has selected a classifier.
if best_model is None:
    raise RuntimeError("No cluster model has been selected; run the evaluation cell first.")

# The classifier alone cannot score new text. Also persist the fitted TF-IDF
# vectorizer (raw text -> features) and the label encoder (integer code ->
# cluster name) so the saved artifacts are usable outside this notebook.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le_cluster, "cluster_label_encoder.pkl")
joblib.dump(best_model, "cluster_best_model.pkl")

In [None]:
# 11. Subclass model (using RandomForest optimized)
# NOTE: this rebinds X_train / X_test / y_train / y_test to the sub-class
# split. Re-running an earlier cluster cell after this one would therefore use
# sub-class labels -- restart and run all cells in order instead.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_sub_class, test_size=0.2, random_state=42, stratify=y_sub_class
)
subclass_model = RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)
subclass_model.fit(X_train, y_train)
y_pred = subclass_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Subclass RandomForest - Accuracy: {acc:.4f}")

# The pickled model is only usable together with the fitted TF-IDF vectorizer
# (text -> features) and the label encoder (integer code -> sub-class name),
# so both are saved alongside it. Writing the vectorizer here is idempotent.
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
joblib.dump(le_sub_class, "subclass_label_encoder.pkl")
joblib.dump(subclass_model, "subclass_best_model.pkl")

In [None]:
# 12. Prediction function
def predict_cluster_and_subclass(new_prompt):
    """Predict the cluster and sub-class labels for a single raw prompt.

    The prompt goes through the same preprocessing and fitted TF-IDF
    vectorizer as the training data, is scored by the selected cluster model
    and by the sub-class model, and both integer predictions are mapped back
    to their original labels.

    Args:
        new_prompt: Raw prompt text.

    Returns:
        tuple: ``(cluster_label, sub_class_label)``.
    """
    # Wrap in a list: the vectorizer transforms a collection of documents.
    features = vectorizer.transform([preprocess_text(new_prompt)])

    cluster_code = best_model.predict(features)
    sub_class_code = subclass_model.predict(features)

    # Each prediction holds exactly one element; decode it and unwrap.
    return (
        le_cluster.inverse_transform(cluster_code)[0],
        le_sub_class.inverse_transform(sub_class_code)[0],
    )

In [None]:
# 13. Test Prediction
# Smoke-test the whole pipeline on one hand-written prompt; prints the
# (cluster, sub_class) tuple.
sample_prediction = predict_cluster_and_subclass("How to use OpenAI's API within Streamlit?")
print(sample_prediction)