In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os
import re

In [2]:
# Input: CSV holding the combined training examples (path relative to the working directory).
TRAINING_DATA_PATH = "data/training_data_combined.csv"
# Output: directory that receives the fitted model artifacts.
MODEL_DIR = "./trained_model"

# Create the output directory up front; an already existing directory is not an error.
os.makedirs(name=MODEL_DIR, exist_ok=True)

In [3]:
def clean_text(text):
    """Normalize a raw text value for vectorization.

    The value is lower-cased, every character that is neither a word
    character nor whitespace is removed, and runs of whitespace are collapsed
    to a single space with leading/trailing whitespace trimmed.

    Args:
        text: The raw value. Normally a ``str``; ``None`` and float NaN are
            treated as missing, and any other non-string value (e.g. an int
            that a CSV parser left un-stringified) is converted with ``str()``.

    Returns:
        The normalized string, or ``""`` for a missing value.
    """
    # Missing values carry no text; NaN is the only float that differs from itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Coerce stray non-string cells instead of failing on ``.lower()``.
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Strip punctuation/symbols (anything outside \w and \s; underscores are kept).
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse whitespace runs (including newlines/tabs) and trim the ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Read the training CSV and keep only rows that have both a text and a
# resolve-time label.
df = pd.read_csv(TRAINING_DATA_PATH).dropna(
    subset=["text", "issue_resolve_time_label"]
)
# Note: this count is taken after the rows with missing values were dropped.
print(f"Initial size: {len(df)} rows")

# Attach the normalized text as its own column, leaving the raw text in place.
df = df.assign(clean_text=df["text"].apply(clean_text))

# Discard rows whose text is empty once it has been normalized.
non_empty = df["clean_text"].str.strip() != ""
df = df.loc[non_empty]
print(f"After cleaning: {len(df)} rows")

Initial size: 222606 rows
After cleaning: 222606 rows


In [9]:
# --- Label encoding: map each resolve-time label to an integer class id. ---
print("Encoding labels...")
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["issue_resolve_time_label"])

# --- Hold out 20% of the rows, stratified on the label, with a fixed seed. ---
print("Splitting data...")
features = df["clean_text"]
targets = df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

# --- TF-IDF features: vocabulary is fitted on the training split only. ---
print("Vectorizing text...")
tfidf = TfidfVectorizer(max_features=10000, stop_words="english")
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# --- Fit the classifier on the vectorized training split. ---
print("Training Logistic Regression...")
clf = LogisticRegression(max_iter=500).fit(X_train_vec, y_train)

# --- Evaluate on the held-out split. ---
y_pred = clf.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
report = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_
)
print("Classification Report:\n", report)

# --- Persist everything needed for inference: model, vectorizer, label mapping. ---
print("Saving model artifacts to:", MODEL_DIR)
for filename, artifact in (
    ("classifier.pkl", clf),
    ("tfidf.pkl", tfidf),
    ("label_encoder.pkl", label_encoder),
):
    joblib.dump(artifact, os.path.join(MODEL_DIR, filename))

print("Training complete. All artifacts saved.")

Encoding labels...
Splitting data...
Vectorizing text...
Training Logistic Regression...
Accuracy: 0.32884865908988814
Classification Report:
                    precision    recall  f1-score   support

        1-4 weeks       0.22      0.19      0.20      6364
        1-6 hours       0.24      0.03      0.05      3409
       1-6 months       0.24      0.33      0.28      6922
         1-7 days       0.24      0.39      0.30      7943
6 months - 1 year       0.44      0.09      0.15      2396
       6-24 hours       0.18      0.04      0.07      3714
          <1 hour       0.55      0.67      0.60      9887
          >1 year       0.33      0.23      0.27      3887

         accuracy                           0.33     44522
        macro avg       0.31      0.25      0.24     44522
     weighted avg       0.32      0.33      0.30     44522

Saving model artifacts to: ./trained_model
Training complete. All artifacts saved.
