In [4]:
import json
import pandas as pd
import numpy as np
import wandb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix
)

In [2]:
# Load the query dataset from a JSON Lines file (one JSON object per line).
# Malformed records are skipped instead of aborting the load, but every skip is
# reported with its line number so the offending input can be found and fixed.
QUERIES_PATH = "long_queries_data.json"

data = []
skipped_lines = 0
# errors='replace' substitutes U+FFFD for undecodable bytes rather than raising.
with open(QUERIES_PATH, 'r', encoding='utf-8', errors='replace') as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # A blank line is not a record; do not report it as a decode error.
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            skipped_lines += 1
            print(f"Skipping invalid line {line_no}: {e}")

if skipped_lines:
    # Make silent data loss visible: the model is trained on whatever survives.
    print(f"Loaded {len(data)} records; skipped {skipped_lines} invalid line(s)")

df_queries = pd.DataFrame(data)
df_queries.head()

Unnamed: 0,name,employee_tenure,department,job_title,company_size,company_sector,tasks,long_queries,line_idx,task_idx,task,user_idx,final_tool
0,Alexander,experienced,Customer Support / Service,Resolution Specialist,mid-sized,Legal & Law Firms,"[Respond to 3 customer complaints via email, I...",resolution support customer service legal issu...,0,0,,17,Gmail
1,Alexander,experienced,Customer Support / Service,Resolution Specialist,mid-sized,Legal & Law Firms,"[Respond to 3 customer complaints via email, I...",customer complaints email template resolution ...,0,0,,17,Gmail
2,Alexander,experienced,Customer Support / Service,Resolution Specialist,mid-sized,Legal & Law Firms,"[Respond to 3 customer complaints via email, I...",complaints management service customer support...,0,0,,17,Gmail
3,Alexander,experienced,Customer Support / Service,Resolution Specialist,mid-sized,Legal & Law Firms,"[Respond to 3 customer complaints via email, I...",legal firm customer service email responses co...,0,0,,17,Gmail
4,Alexander,experienced,Customer Support / Service,Resolution Specialist,mid-sized,Legal & Law Firms,"[Respond to 3 customer complaints via email, I...",mid-sized law firms complaint handling procedu...,0,0,,17,Gmail


In [5]:
# Hyperparameters for this run. The whole dict is handed to wandb.init so the
# run records the settings needed to reproduce it or compare it with others.
config = {
    "model": "RandomForest",
    "vectorizer": "Tfidf",
    "ngram_range": (1, 2),
    "stop_words": "english",
    "max_features": 5000,  # passed to TfidfVectorizer(max_features=...)
    "n_estimators": 100,
    # Recorded for reproducibility only: these mirror the literals used by the
    # train/test split and classifier cells. Keep them in sync if those change.
    "test_size": 0.2,
    "random_state": 42,
    "class_weight": "balanced",
}

wandb.init(project="pack-intent", name="rf-tfidf-v1", config=config)

[34m[1mwandb[0m: Currently logged in as: [33mkelly-deng[0m ([33mfa-orchestra-brain[0m) to [32mhttps://wandb.gnlp.io[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Hold out 20% of the rows for evaluation. The split is stratified on the label
# and uses a fixed seed so it is repeatable across kernel restarts.
# NOTE(review): the preview rows share line_idx/task_idx/user_idx, which
# suggests several queries are derived from the same task. A row-level random
# split may then place near-duplicate queries on both sides — confirm whether a
# group-aware split is needed before trusting the reported metrics.
queries = df_queries["long_queries"]
tool_labels = df_queries["final_tool"]

X_train, X_test, y_train, y_test = train_test_split(
    queries,
    tool_labels,
    test_size=0.2,
    random_state=42,
    stratify=tool_labels,
)

In [7]:
# Build the TF-IDF vectorizer from the run config, fit it on the training
# queries only, then map both partitions with that fitted vectorizer.
tfidf_params = {
    "stop_words": config["stop_words"],
    "ngram_range": config["ngram_range"],
    "max_features": config["max_features"],
}
vectorizer = TfidfVectorizer(**tfidf_params)

X_train_vec = vectorizer.fit_transform(X_train)
# transform (not fit_transform): the test set must not influence the fitted vectorizer.
X_test_vec = vectorizer.transform(X_test)

In [8]:
# Random forest over the TF-IDF features. Tree count comes from the run config;
# class weighting and the seed are fixed here.
rf_params = {
    "n_estimators": config["n_estimators"],
    "class_weight": "balanced",
    "random_state": 42,
}
clf = RandomForestClassifier(**rf_params)

# Left as a bare last expression so the notebook displays the fitted estimator.
clf.fit(X_train_vec, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [9]:
# Evaluate the classifier on the held-out split and log the results to the
# active W&B run.
y_pred = clf.predict(X_test_vec)
acc = accuracy_score(y_test, y_pred)

# zero_division=0 yields the same numbers as the default ("warn" also scores an
# undefined precision/recall as 0) but states the choice explicitly and avoids
# an UndefinedMetricWarning for every class that is never predicted.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted', zero_division=0
)
report = classification_report(
    y_test, y_pred, output_dict=True, zero_division=0
)

# One log call so the scalar metrics and the per-class report are recorded at
# the same history step instead of two consecutive ones.
# Note: support-weighted recall equals accuracy for single-label predictions;
# the key is kept so existing dashboards keep working.
wandb.log({
    "accuracy": acc,
    "precision_weighted": precision,
    "recall_weighted": recall,
    "f1_weighted": f1,
    "classification_report": report,
})
wandb.sklearn.plot_confusion_matrix(y_test, y_pred, labels=clf.classes_)

# Finishing the run here means this cell cannot be re-run on its own: execute
# the wandb.init cell again first to open a new run.
wandb.finish()

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
accuracy,▁
f1_weighted,▁
precision_weighted,▁
recall_weighted,▁

0,1
accuracy,0.50743
f1_weighted,0.47681
precision_weighted,0.48982
recall_weighted,0.50743
