# In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_absolute_error, r2_score
)
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE
import pickle

# === Load dataset ===
df = pd.read_csv("parsed_test_cases_augmented.csv")

# Drop incomplete rows, then renumber the index 0..n-1.  The feature matrix
# built from this frame is a plain NumPy array addressed by position, so the
# frame's index labels must coincide with row positions; otherwise a lookup
# such as X_features[row.index[0]] lands on the wrong row (or out of bounds)
# as soon as dropna() has removed anything.
df = df.dropna(
    subset=["step_keywords", "num_steps", "duration", "result"]
).reset_index(drop=True)

# Binary classification target: 1 = PASS, 0 = FAIL.
df["result_encoded"] = df["result"].map({"PASS": 1, "FAIL": 0})

# Any label other than PASS/FAIL maps to NaN.  Left alone, that only surfaces
# later as an opaque error inside the resampler/classifier, so fail here and
# name the offending values instead.
_unknown_results = df.loc[df["result_encoded"].isna(), "result"].unique()
if len(_unknown_results) > 0:
    raise ValueError(
        "Unexpected values in 'result' column (expected 'PASS' or 'FAIL'): "
        f"{sorted(map(str, _unknown_results))}"
    )

# === Feature extraction ===
# Text features: TF-IDF weights over each test case's step keywords.
tfidf = TfidfVectorizer()
X_keywords = tfidf.fit_transform(df["step_keywords"])

# Numeric feature: the step count, min-max scaled.  The double brackets keep
# the selection two-dimensional (one column), as the scaler requires.
scaler = MinMaxScaler()
X_steps = scaler.fit_transform(df[["num_steps"]])

# Dense design matrix: the TF-IDF columns followed by the scaled step count.
X_features = np.concatenate((X_keywords.toarray(), X_steps), axis=1)

# Targets: encoded pass/fail for the classifier, duration for the regressor.
y_class, y_reg = df["result_encoded"], df["duration"]

# === Classification with SMOTE ===
# Hold out the test set BEFORE oversampling.  SMOTE synthesises new minority
# samples by interpolating between existing ones; resampling the full data set
# first would let points derived from test rows leak into training and would
# fill the test set with synthetic samples, inflating any score computed on it.
# stratify keeps the original class ratio in both folds (it requires at least
# two samples per class).
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_features, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Oversample the training fold only; the test fold stays untouched real data.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_cls, y_train_cls)
X_train_cls, y_train_cls = X_resampled, y_resampled

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_cls, y_train_cls)

# === Regression ===
# Duration model: trained on the original (non-resampled) feature matrix with
# an 80/20 train/test split.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_features,
    y_reg,
    random_state=42,
    test_size=0.2,
)

# fit() returns the estimator itself, so construction and training chain.
reg = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    X_train_reg, y_train_reg
)

# === Save trained models and preprocessors ===
# One pickle file per artifact, written in a fixed order: the two models
# first, then the fitted vectorizer and scaler.
for _filename, _artifact in (
    ("clf.pkl", clf),
    ("reg.pkl", reg),
    ("tfidf.pkl", tfidf),
    ("scaler.pkl", scaler),
):
    with open(_filename, "wb") as f:
        pickle.dump(_artifact, f)

# === Prediction utility ===
def predict_test_case(test_id, top_k=3):
    """Return model predictions and nearest neighbours for a known test case.

    Looks the test case up in the module-level ``df`` and reuses its row of the
    module-level ``X_features`` matrix.  Only works in the training context
    (it needs the training frame and matrix in memory), not for deployment.

    Args:
        test_id: Value to match against the ``test_id`` column of ``df``.
            If several rows match, the first one is used.
        top_k: Maximum number of similar test cases to return (default 3).

    Returns:
        ``None`` if ``test_id`` is not present; otherwise a dict with keys
        ``"test_id"``, ``"predicted_duration"`` (regressor output),
        ``"pass_probability"`` (classifier probability for class 1) and
        ``"similar_cases"`` (test ids of the most cosine-similar other rows,
        most similar first).
    """
    # Find the row by POSITION.  X_features is a NumPy array, so index labels
    # of df (which may have gaps after dropna) must not be used to address it.
    matches = np.flatnonzero((df["test_id"] == test_id).to_numpy())
    if matches.size == 0:
        return None

    row_pos = int(matches[0])
    input_vec = X_features[row_pos].reshape(1, -1)

    similarities = cosine_similarity(input_vec, X_features)[0]

    # Rank all rows from most to least similar, then drop the query row
    # explicitly.  Relying on "the query is always the top hit" breaks when
    # another row has an identical feature vector or the vector is all zeros.
    ranked = np.argsort(-similarities, kind="stable")
    ranked = ranked[ranked != row_pos]
    top_indices = ranked[:max(int(top_k), 0)]
    similar_ids = df.iloc[top_indices]["test_id"].tolist()

    pred_duration = reg.predict(input_vec)[0]
    # Column 1 of predict_proba holds the probability of class 1.
    pred_pass_prob = clf.predict_proba(input_vec)[0][1]

    return {
        "test_id": test_id,
        "predicted_duration": pred_duration,
        "pass_probability": pred_pass_prob,
        "similar_cases": similar_ids
    }
