In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_absolute_error, r2_score
)
from sklearn.metrics.pairwise import cosine_similarity
from imblearn.over_sampling import SMOTE

# === Load dataset ===
df = pd.read_csv("parsed_test_cases_augmented.csv")

# === Drop missing or invalid values ===
df.dropna(subset=["step_keywords", "num_steps", "duration", "result"], inplace=True)

# === Encode binary result ===
df["result_encoded"] = df["result"].map({"PASS": 1, "FAIL": 0})

# === TF-IDF vectorize step_keywords ===
tfidf = TfidfVectorizer()
X_keywords = tfidf.fit_transform(df["step_keywords"])

# === Normalize num_steps ===
scaler = MinMaxScaler()
X_steps = scaler.fit_transform(df[["num_steps"]])

# === Combine features into a single matrix ===
X_features = np.hstack([X_keywords.toarray(), X_steps])

# === Target variables ===
y_class = df["result_encoded"]        # for classification (PASS/FAIL)
y_reg = df["duration"]                # for regression (duration prediction)

# === Print dataset shape ===
print("Feature matrix shape:", X_features.shape)
print("Classification labels shape:", y_class.shape)
print("Regression labels shape:", y_reg.shape)

Feature matrix shape: (404, 102)
Classification labels shape: (404,)
Regression labels shape: (404,)


In [8]:
# === Apply SMOTE to handle class imbalance ===
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_features, y_class)

print("\nAfter SMOTE resampling:", np.bincount(y_resampled))

# === Classification model ===
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

clf = LogisticRegression(class_weight='balanced', max_iter=1000)
clf.fit(X_train_cls, y_train_cls)
y_pred_cls = clf.predict(X_test_cls)

print("\n=== Classification Report ===")
print("Accuracy:", accuracy_score(y_test_cls, y_pred_cls))
print(classification_report(y_test_cls, y_pred_cls))

# === Regression model ===
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_features, y_reg, test_size=0.2, random_state=42
)

reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

print("\n=== Regression Report ===")
print("MAE:", mean_absolute_error(y_test_reg, y_pred_reg))
print("R² Score:", r2_score(y_test_reg, y_pred_reg))



After SMOTE resampling: [319 319]

=== Classification Report ===
Accuracy: 0.640625
              precision    recall  f1-score   support

           0       0.62      0.68      0.65        62
           1       0.67      0.61      0.63        66

    accuracy                           0.64       128
   macro avg       0.64      0.64      0.64       128
weighted avg       0.64      0.64      0.64       128


=== Regression Report ===
MAE: 1.2995135802469155
R² Score: 0.9484427623425737


In [10]:
# === Prediction helper ===
def predict_test_case(test_id):
    """
    For a given test_id in the dataset, return its:
    - 3 most similar test cases
    - predicted duration (regressor)
    - predicted pass probability (classifier)
    """
    row = df[df["test_id"] == test_id]
    if row.empty:
        print(f"Test case '{test_id}' not found.")
        return

    row_index = row.index[0]
    input_vec = X_features[row_index].reshape(1, -1)

    similarities = cosine_similarity(input_vec, X_features)[0]
    top_indices = similarities.argsort()[-4:-1][::-1]
    similar_ids = df.iloc[top_indices]["test_id"].tolist()

    pred_duration = reg.predict(input_vec)[0]
    pred_pass_prob = clf.predict_proba(input_vec)[0][1]

    print(f"\n🧪 Test Case ID: {test_id}")
    print(f"⏱ Predicted Duration: {pred_duration:.2f} seconds")
    print(f"✅ Predicted PASS Probability: {pred_pass_prob*100:.1f}%")
    print("🔍 Most Similar Test Cases:")
    for i, sid in enumerate(similar_ids, 1):
        print(f"   {i}. {sid}")

# === Example usage ===
if __name__ == "__main__":
    predict_test_case("TC003")


🧪 Test Case ID: TC003
⏱ Predicted Duration: 35.75 seconds
✅ Predicted PASS Probability: 49.9%
🔍 Most Similar Test Cases:
   1. TC237
   2. TC086
   3. TC215
