In [1]:
# test_case_model.py

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# === Load dataset, drop unusable rows, encode the label ===
# Built as one non-mutating chain so re-running the cell always starts from the CSV.
#  - rows missing any required field are dropped;
#  - "result" is mapped PASS -> 1 / FAIL -> 0; any other value maps to NaN and that row
#    is dropped too, so the classifier never receives a NaN label;
#  - the index is reset so that df's row labels equal row positions. X_features below is
#    a plain NumPy array (positional), so the two stay aligned for later lookups.
df = (
    pd.read_csv("parsed_test_cases_augmented.csv")
    .dropna(subset=["step_keywords", "num_steps", "duration", "result"])
    .assign(result_encoded=lambda frame: frame["result"].map({"PASS": 1, "FAIL": 0}))
    .dropna(subset=["result_encoded"])
    .astype({"result_encoded": "int64"})
    .reset_index(drop=True)
)

# === TF-IDF vectorize step_keywords ===
# NOTE(review): the vectorizer and scaler are fitted on all rows, i.e. before the
# train/test split performed later, so held-out rows influence the features.
tfidf = TfidfVectorizer()
X_keywords = tfidf.fit_transform(df["step_keywords"])

# === Normalize num_steps to [0, 1] ===
scaler = MinMaxScaler()
X_steps = scaler.fit_transform(df[["num_steps"]])

# === Combine features into a single dense matrix (TF-IDF columns, then num_steps) ===
X_features = np.hstack([X_keywords.toarray(), X_steps])

# Row i of X_features must describe row i of df.
assert X_features.shape[0] == len(df), "feature matrix and dataframe are misaligned"

# === Target variables ===
y_class = df["result_encoded"]        # for classification (PASS/FAIL)
y_reg = df["duration"]                # for regression (duration prediction)

# === Print dataset shape ===
print("Feature matrix shape:", X_features.shape)
print("Classification labels shape:", y_class.shape)
print("Regression labels shape:", y_reg.shape)


Feature matrix shape: (16, 37)
Classification labels shape: (16,)
Regression labels shape: (16,)


In [2]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, r2_score

# === Hold out 20% of the rows for evaluation ===
# Both calls share the same options (including the seed), once per target.
split_opts = {"test_size": 0.2, "random_state": 42}
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_features, y_class, **split_opts
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_features, y_reg, **split_opts
)

# === PASS/FAIL classifier ===
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_cls, y_train_cls
)
y_pred_cls = clf.predict(X_test_cls)

print("\n=== Classification Report ===")
cls_accuracy = accuracy_score(y_test_cls, y_pred_cls)
print("Accuracy:", cls_accuracy)
cls_report = classification_report(y_test_cls, y_pred_cls)
print(cls_report)

# === Duration regressor ===
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_reg, y_train_reg
)
y_pred_reg = reg.predict(X_test_reg)

print("\n=== Regression Report ===")
reg_mae = mean_absolute_error(y_test_reg, y_pred_reg)
print("MAE:", reg_mae)
reg_r2 = r2_score(y_test_reg, y_pred_reg)
print("R² Score:", reg_r2)



=== Classification Report ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4


=== Regression Report ===
MAE: 0.9979188651738937
R² Score: 0.3354729296542178


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_test_case(test_id, top_k=3):
    """
    Print a prediction summary for one test case already present in ``df``.

    The summary contains:
    - the predicted duration (from the regressor ``reg``)
    - the predicted PASS probability (from the classifier ``clf``)
    - the ``top_k`` most similar *other* test cases, ranked by cosine similarity
      of their rows in ``X_features``

    Parameters
    ----------
    test_id : value compared against the ``test_id`` column of ``df``.
        If several rows share the ID, the first one is used.
    top_k : int, default 3
        Number of neighbours to list. Fewer are listed when the dataset has
        fewer other rows.

    Returns
    -------
    None. Results are printed; an unknown ``test_id`` prints a "not found"
    message instead.
    """
    # Locate the row by POSITION, not by index label: X_features is a plain NumPy
    # array, while df's index labels can have gaps once rows have been dropped.
    matches = np.flatnonzero((df["test_id"] == test_id).to_numpy())
    if matches.size == 0:
        print(f"Test case '{test_id}' not found.")
        return
    row_pos = int(matches[0])

    # Extract feature vector as a single-row 2-D array, as the estimators expect
    input_vec = X_features[row_pos].reshape(1, -1)

    # Rank every row by similarity (descending). The stable sort makes ties resolve
    # to the earlier row, so output is deterministic.
    similarities = np.asarray(cosine_similarity(input_vec, X_features)[0])
    ranked = np.argsort(-similarities, kind="stable")
    # Exclude the query row explicitly: with tied similarities (e.g. augmented
    # duplicates) it is not guaranteed to be the single top-ranked entry.
    top_indices = ranked[ranked != row_pos][:max(top_k, 0)]
    similar_ids = df.iloc[top_indices]["test_id"].tolist()

    # Predict duration and pass probability
    pred_duration = reg.predict(input_vec)[0]
    class_probs = clf.predict_proba(input_vec)[0]
    # Resolve the PASS (label 1) column via classes_ rather than assuming it is
    # column 1; if the classifier never saw a PASS label, report 0.
    pass_cols = np.flatnonzero(np.asarray(clf.classes_) == 1)
    pred_pass_prob = class_probs[pass_cols[0]] if pass_cols.size else 0.0

    # Output
    print(f"\n🧪 Test Case ID: {test_id}")
    print(f"⏱ Predicted Duration: {pred_duration:.2f} seconds")
    print(f"✅ Predicted PASS Probability: {pred_pass_prob*100:.1f}%")
    print("🔍 Most Similar Test Cases:")
    for i, sid in enumerate(similar_ids, 1):
        print(f"   {i}. {sid}")

# === Example usage ===
# Prints the prediction summary for one sample test ID. The guard runs the call only
# when this code executes as the main module, not when it is imported.
if __name__ == "__main__":
    predict_test_case("TestCase001_LBConfiguration_aug")



🧪 Test Case ID: TestCase001_LBConfiguration_aug
⏱ Predicted Duration: 19.18 seconds
✅ Predicted PASS Probability: 100.0%
🔍 Most Similar Test Cases:
   1. TestCase002_IBITBaslatma_aug
   2. TestCase002_IBITBaslatma
   3. TestCase001_LBConfiguration
