In [56]:
# Requirement analysis and test case generation
import os
os.environ["USE_TF"] = "0"   # disable TensorFlow
os.environ["USE_TORCH"] = "1"  # ensure PyTorch is used

# Step 1: Load requirement document
file_path = "../data/sample_requirements.txt"  # change if needed
# Decode explicitly instead of relying on the platform's default locale
# encoding, so the notebook reads the same text on every OS.
# NOTE(review): assumes the requirements file is UTF-8 encoded -- confirm.
with open(file_path, 'r', encoding="utf-8") as file:
    text = file.read()

print("Preview of requirements:\n")
print(text[:500])  # show first 500 chars

# Step 2: Process text with spaCy
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
sentences = [sent.text.strip() for sent in doc.sents]
print(f"\nExtracted {len(sentences)} sentences.")

# Step 3: Extract requirement sentences
import re
# Match the modal verbs as whole words (case-insensitive). A plain substring
# test would also fire on words that merely contain a keyword, e.g. "shall"
# inside "shallow" or "marshall". Negated contractions ("shouldn't",
# "mustn't") are still accepted.
requirement_keyword_pattern = re.compile(r"\b(?:should|must|shall)(?:n['’]t)?\b", re.IGNORECASE)
requirements = [sent for sent in sentences if requirement_keyword_pattern.search(sent)]

# Step 4: Function to generate actionable description from requirement
def generate_description(req):
    """Derive a short description from a requirement sentence.

    The sentence is cut at the first " that ", then the first " which ",
    then the first ".". The boilerplate subject "The system " is dropped,
    "Users " becomes "User ", and the modal verbs "must", "should" and
    "shall" (when followed by a space) are removed.

    Args:
        req: A single requirement sentence.

    Returns:
        The simplified description, without surrounding whitespace or a
        dangling trailing ",", ";" or ":".
    """
    import re

    # Take first clause or phrase up to 'that' or 'which'
    desc = req.split(" that ")[0].split(" which ")[0].split(".")[0]
    # Simplify common patterns
    desc = desc.replace("The system ", "").replace("Users ", "User ")
    # Requirements are selected on should/must/shall, so strip all three.
    # The word boundary keeps words that merely end in one of them intact
    # (e.g. "marshall ").
    desc = re.sub(r"\b(?:must|should|shall) ", "", desc)
    # Cutting at " which " can leave the comma that preceded the clause.
    return desc.strip().rstrip(",;:").rstrip()

# Step 5: Function to generate actionable steps
def generate_steps(req):
    """Break a requirement sentence into numbered test steps.

    The sentence is split successively on the connectors " that ", " and ",
    " upon ", " after " and " then " (case-sensitive, in that order). Each
    fragment is trimmed and loses trailing periods. A final
    "Verify the expected outcome" step is always appended.

    Args:
        req: A single requirement sentence.

    Returns:
        A list of strings "Step N: <action>", numbered consecutively from 1.
    """
    # Example simple parsing: split by common connectors to make actions
    connectors = [" that ", " and ", " upon ", " after ", " then "]
    fragments = [req]
    for conn in connectors:
        fragments = [piece for frag in fragments for piece in frag.split(conn)]

    # Clean first, then drop empties, and only then number the steps.
    # Numbering before filtering leaves gaps and lets the final validation
    # step reuse an existing number.
    cleaned = (frag.strip().rstrip(".").rstrip() for frag in fragments)
    actions = [frag for frag in cleaned if frag]

    steps = [f"Step {idx}: {act}" for idx, act in enumerate(actions, start=1)]
    # Add validation step
    steps.append(f"Step {len(steps)+1}: Verify the expected outcome")
    return steps

# Step 6: Generate structured test cases
import pandas as pd

# One record per requirement; IDs are 1-based and zero-padded to three digits.
test_cases = [
    {
        "TestCaseID": f"TC_{i:03}",
        "Requirement": req,
        "Description": generate_description(req),
        "Steps": generate_steps(req),
    }
    for i, req in enumerate(requirements, start=1)
]

df = pd.DataFrame(test_cases)

# Step 7: Display first 5 test cases
from IPython.display import display
pd.set_option("display.max_colwidth", None)  # never truncate long cell text

print("\nGenerated Actionable Test Cases (Top 5):")
preview_table_styles = [
    {'selector': 'table', 'props': [('max-height', '500px'), ('overflow-y', 'scroll')]}
]
preview_styler = df.head(5).style.set_table_attributes("style='display:inline'")
display(preview_styler.set_table_styles(preview_table_styles))


# Step 8: Save test cases to CSV
df.to_csv("../data/generated_test_cases.csv", index=False)
print("\nTest cases saved to '../data/generated_test_cases.csv'")


Preview of requirements:

The system should provide a secure authentication mechanism that allows users to log in to their accounts using their registered email address and password. Upon entering valid credentials, the system must authenticate the user and redirect them to their personalized dashboard.
To protect user accounts from brute-force attacks, the system must lock a user account after five consecutive failed login attempts. Any attempt to log in with the correct credentials after the lockout should be denied un

Extracted 16 sentences.

Generated Actionable Test Cases (Top 5):


Unnamed: 0,TestCaseID,Requirement,Description,Steps
0,TC_001,The system should provide a secure authentication mechanism that allows users to log in to their accounts using their registered email address and password.,provide a secure authentication mechanism,"['Step 1: The system should provide a secure authentication mechanism', 'Step 2: allows users to log in to their accounts using their registered email address', 'Step 3: password', 'Step 4: Verify the expected outcome']"
1,TC_002,"Upon entering valid credentials, the system must authenticate the user and redirect them to their personalized dashboard.","Upon entering valid credentials, the system authenticate the user and redirect them to their personalized dashboard","['Step 1: Upon entering valid credentials, the system must authenticate the user', 'Step 2: redirect them to their personalized dashboard', 'Step 3: Verify the expected outcome']"
2,TC_003,"To protect user accounts from brute-force attacks, the system must lock a user account after five consecutive failed login attempts.","To protect user accounts from brute-force attacks, the system lock a user account after five consecutive failed login attempts","['Step 1: To protect user accounts from brute-force attacks, the system must lock a user account', 'Step 2: five consecutive failed login attempts', 'Step 3: Verify the expected outcome']"
3,TC_004,Any attempt to log in with the correct credentials after the lockout should be denied until the lock is lifted.,Any attempt to log in with the correct credentials after the lockout be denied until the lock is lifted,"['Step 1: Any attempt to log in with the correct credentials', 'Step 2: the lockout should be denied until the lock is lifted', 'Step 3: Verify the expected outcome']"
4,TC_005,The system must provide users with the ability to securely reset their passwords in case they forget them.,provide users with the ability to securely reset their passwords in case they forget them,"['Step 1: The system must provide users with the ability to securely reset their passwords in case they forget them', 'Step 2: Verify the expected outcome']"



Test cases saved to '../data/generated_test_cases.csv'


In [57]:
# Test case prioritization
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from IPython.display import display

# --- Load previously generated test cases ---
df = pd.read_csv("../data/generated_test_cases.csv")
print(f"Loaded {len(df)} base test cases.")

# --- Replicate every base test case 20 times so the model has more rows to train on ---
df = pd.concat([df for _ in range(20)], ignore_index=True)
print(f"Expanded dataset to {len(df)} test cases for training.")

# --- Simulated Feature Engineering ---
# The three draws below must stay in this order: they share one seeded
# global random stream, so reordering them changes every value.
np.random.seed(42)
n_rows = len(df)
df["PastDefects"] = np.random.randint(0, 5, n_rows)      # historical defect count
df["ExecutionTime"] = np.random.randint(1, 10, n_rows)   # execution time (mins)
df["FailureRate"] = np.random.rand(n_rows)               # failure probability

# Rule-based demo label: 1 = high risk, 0 = low risk.
risk_signal = df["PastDefects"] + df["FailureRate"]*10
df["HighRisk"] = (risk_signal > 5).astype(int)

# --- Train/Test Split (stratified on the label) ---
feature_columns = ["PastDefects", "ExecutionTime", "FailureRate"]
X = df[feature_columns]
y = df["HighRisk"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- ML Model: Random Forest ---
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {holdout_accuracy:.2f}")

# --- Assign Risk Scores to All Test Cases ---
# Column 1 of predict_proba is taken as the probability of HighRisk == 1.
high_risk_probability = model.predict_proba(X)[:,1]
df["RiskScore"] = high_risk_probability
df = df.sort_values(by="RiskScore", ascending=False)

# --- Display settings for the prioritized table shown next ---
pd.set_option("display.max_colwidth", None)

def highlight_risk(val):
    """Return the CSS declaration used to render one risk score.

    Scores above 0.7 are red, scores above 0.4 (up to and including 0.7)
    are orange, and everything else is green. The text is always bold.
    """
    # Bands are checked from the highest threshold down; the first hit wins.
    bands = ((0.7, 'red'), (0.4, 'orange'))
    color = next((name for floor, name in bands if val > floor), 'green')
    return f'color: {color}; font-weight: bold'

print("\nTop 10 Prioritized Test Cases:")
# Build the styled table step by step; each Styler call returns the styler.
top_ten = df[["TestCaseID", "Description", "RiskScore"]].head(10)
header_style = {'selector': 'th', 'props': [('background-color', '#f7f7f7'), ('font-weight', 'bold')]}
top_ten_styler = top_ten.style.format({"RiskScore": "{:.2f}"})
top_ten_styler = top_ten_styler.map(highlight_risk, subset=["RiskScore"])
top_ten_styler = top_ten_styler.set_table_attributes(
    "style='display:inline-block; max-height:400px; overflow-y:auto;'"
)
top_ten_styler = top_ten_styler.set_table_styles([header_style])
display(top_ten_styler)

# --- Persist the full prioritized table (all rows, sorted by RiskScore) ---
df.to_csv("../data/prioritized_test_cases.csv", index=False)
print("\nPrioritized test cases saved to '../data/prioritized_test_cases.csv'")


Loaded 16 base test cases.
Expanded dataset to 320 test cases for training.

Model Accuracy: 0.95

Top 10 Prioritized Test Cases:


Unnamed: 0,TestCaseID,Description,RiskScore
283,TC_012,Under no circumstances passwords be stored in plaintext,1.0
287,TC_016,Any attempt to perform an action after timeout redirect the user to the login page,1.0
285,TC_014,"Upon successful registration, the system send a confirmation email to the new user, ensuring the email address is valid and allowing the user to verify their account",1.0
268,TC_013,"The application improve accessibility and user comfort by offering a dark mode option, allowing users to switch between light and dark themes seamlessly across the entire application",1.0
264,TC_009,The registration process validate,1.0
58,TC_011,"For maximum security, all user passwords be stored in the system using irreversible hashing with the SHA-256 algorithm",1.0
56,TC_009,The registration process validate,1.0
269,TC_014,"Upon successful registration, the system send a confirmation email to the new user, ensuring the email address is valid and allowing the user to verify their account",1.0
257,TC_002,"Upon entering valid credentials, the system authenticate the user and redirect them to their personalized dashboard",1.0
53,TC_006,"This be done via a verification email containing a secure reset link,",1.0



Prioritized test cases saved to '../data/prioritized_test_cases.csv'


In [58]:
# Defect Prediction and root cause analysis
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# --- Simulated test failure logs ---
data = {
    "LogMessage": [
        "Timeout occurred while connecting to database",
        "NullPointerException in UserService",
        "Intermittent UI failure on login button",
        "Failed to load configuration file",
        "IndexOutOfBoundsException in PaymentModule",
        "Network disconnect during API call",
        "Button not clickable intermittently",
        "Database connection refused",
        "Incorrect calculation in billing module",
        "Service unavailable error in API"
    ],
    "DefectType": [
        "Environment Issue",
        "Code Defect",
        "Flaky Test",
        "Environment Issue",
        "Code Defect",
        "Environment Issue",
        "Flaky Test",
        "Environment Issue",
        "Code Defect",
        "Environment Issue"
    ]
}

df_logs = pd.DataFrame(data)

# --- Step 1: Display sample failure logs ---
print("Sample Failure Logs:")
display(df_logs)

# --- Step 2: TF-IDF Vectorization with bigrams ---
# This vectorizer is fitted on ALL logs and is the one used for the final
# model and by the prediction function further down.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(df_logs["LogMessage"])
y = df_logs["DefectType"]

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline

# --- Step 3: Cross-validated accuracy estimate ---
# Determine minimum class count
min_class_count = df_logs['DefectType'].value_counts().min()
# Use at most 3 folds, but never fewer than 2: StratifiedKFold rejects
# n_splits=1, which min(3, min_class_count) yields for a singleton class.
cv_splits = max(2, min(3, int(min_class_count)))

model = LogisticRegression(max_iter=500)
cv = StratifiedKFold(n_splits=cv_splits, shuffle=True, random_state=42)
# Cross-validate on the raw text with the vectorizer inside a pipeline so
# TF-IDF vocabulary and IDF weights are learned from each training fold
# only. Scoring on the pre-fitted matrix X would leak held-out text into
# the features.
cv_pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,2)),
    LogisticRegression(max_iter=500),
)
scores = cross_val_score(cv_pipeline, df_logs["LogMessage"], y, cv=cv)
print(f"\nCross-validated Accuracy: {scores.mean():.2f} ± {scores.std():.2f}")



# --- Step 4: Train on full dataset ---
model.fit(X, y)

# --- Step 5: Hybrid rule-based prediction function ---
def predict_defect(log):
    """Classify a failure log message into a defect type.

    Keyword rules are tried first, in order, against the lower-cased
    message; the first rule with any matching keyword decides the label.
    If no rule matches, the message is vectorized with the module-level
    `vectorizer` and classified by the module-level `model`.
    """
    text = log.lower()
    # (keywords, label) pairs; order matters because the first match wins.
    keyword_rules = (
        (("timeout", "network", "database"), "Environment Issue"),
        (("exception", "error"), "Code Defect"),
        (("intermittent", "fails occasionally"), "Flaky Test"),
    )
    for keywords, label in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return label
    # No rule fired: fall back to the trained classifier.
    return model.predict(vectorizer.transform([log]))[0]

# --- Step 6: Predict defect type for new logs ---
new_logs = [
    "Unable to connect to server",
    "Button click fails occasionally",
    "ArithmeticException in InvoiceService",
    "API timeout during request",
    "UI button intermittently not working"
]

print("\nPredicted Defect Types for New Logs:")
# map() is lazy, so each log is classified right before its line is printed.
for log, defect in zip(new_logs, map(predict_defect, new_logs)):
    print(f"- Log: '{log}' → Predicted Defect: {defect}")


Sample Failure Logs:


Unnamed: 0,LogMessage,DefectType
0,Timeout occurred while connecting to database,Environment Issue
1,NullPointerException in UserService,Code Defect
2,Intermittent UI failure on login button,Flaky Test
3,Failed to load configuration file,Environment Issue
4,IndexOutOfBoundsException in PaymentModule,Code Defect
5,Network disconnect during API call,Environment Issue
6,Button not clickable intermittently,Flaky Test
7,Database connection refused,Environment Issue
8,Incorrect calculation in billing module,Code Defect
9,Service unavailable error in API,Environment Issue



Cross-validated Accuracy: 0.60 ± 0.20

Predicted Defect Types for New Logs:
- Log: 'Unable to connect to server' → Predicted Defect: Environment Issue
- Log: 'Button click fails occasionally' → Predicted Defect: Flaky Test
- Log: 'ArithmeticException in InvoiceService' → Predicted Defect: Code Defect
- Log: 'API timeout during request' → Predicted Defect: Environment Issue
- Log: 'UI button intermittently not working' → Predicted Defect: Flaky Test


In [59]:
#Test Failure Categorization & Auto-Triage

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# --- Simulated failed test case logs ---
# In real life, load from test execution results
# Each record is (test case id, failure message, actual defect type).
failure_records = [
    ("TC_001", "Login button not clickable intermittently", "Flaky Test"),
    ("TC_005", "NullPointerException in PaymentModule", "Code Defect"),
    ("TC_008", "Database connection timeout", "Environment Issue"),
    ("TC_012", "Page not loading due to network error", "Environment Issue"),
    ("TC_015", "Incorrect calculation in invoice total", "Code Defect"),
]
# Transpose the records into the column-oriented dict the DataFrame is built from.
failure_logs = dict(
    zip(
        ("TestCaseID", "FailureMessage", "ActualDefectType"),
        map(list, zip(*failure_records)),
    )
)

df_failures = pd.DataFrame(failure_logs)
display(df_failures)

# --- Step 1: Text Vectorization ---
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df_failures["FailureMessage"])
y = df_failures["ActualDefectType"]

# --- Step 2: Train Classifier ---
model = MultinomialNB()
model.fit(X, y)

# --- Step 3: Auto-Triage Rules ---
# Maps each defect type to the team that receives the failure.
triage_mapping = {
    "Code Defect": "Developer Team",
    "Environment Issue": "Environment/DevOps Team",
    "Flaky Test": "QA Team"
}

# --- Step 4: Predict & Assign Teams ---
# Predictions are made on the same rows the classifier was trained on.
predicted_types = model.predict(X)
df_failures["PredictedDefectType"] = predicted_types
df_failures["AssignedTeam"] = df_failures["PredictedDefectType"].map(triage_mapping)

print("\nTest Failure Categorization & Auto-Triage:")
triage_columns = ["TestCaseID", "FailureMessage", "PredictedDefectType", "AssignedTeam"]
display(df_failures[triage_columns])

# Optional: Save auto-triaged failures
df_failures.to_csv("../data/auto_triaged_failures.csv", index=False)
print("\nAuto-triaged failures saved to '../data/auto_triaged_failures.csv'")


Unnamed: 0,TestCaseID,FailureMessage,ActualDefectType
0,TC_001,Login button not clickable intermittently,Flaky Test
1,TC_005,NullPointerException in PaymentModule,Code Defect
2,TC_008,Database connection timeout,Environment Issue
3,TC_012,Page not loading due to network error,Environment Issue
4,TC_015,Incorrect calculation in invoice total,Code Defect



Test Failure Categorization & Auto-Triage:


Unnamed: 0,TestCaseID,FailureMessage,PredictedDefectType,AssignedTeam
0,TC_001,Login button not clickable intermittently,Flaky Test,QA Team
1,TC_005,NullPointerException in PaymentModule,Code Defect,Developer Team
2,TC_008,Database connection timeout,Environment Issue,Environment/DevOps Team
3,TC_012,Page not loading due to network error,Environment Issue,Environment/DevOps Team
4,TC_015,Incorrect calculation in invoice total,Code Defect,Developer Team



Auto-triaged failures saved to '../data/auto_triaged_failures.csv'


In [60]:
#  AI-Based API Testing & Response Prediction

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# --- Simulated API request-response dataset ---
# Columns: RequestData, ActualResponseTime (ms), ExpectedStatusCode
api_data = {
    "RequestData": [
        "GET /users?id=1",
        "POST /login {user:abc, pwd:123}",
        "GET /orders?id=10",
        "POST /payment {amount:100, currency:USD}",
        "GET /products?category=electronics"
    ],
    "ActualResponseTime": [120, 250, 150, 300, 180],  # in ms
    "ExpectedStatusCode": [200, 200, 200, 201, 200]
}

df_api = pd.DataFrame(api_data)
display(df_api)

# --- Step 1: Feature Engineering ---
# Model input columns, shared by training and prediction so both always
# use the same features in the same order.
FEATURE_COLUMNS = ["RequestLength", "NumParams"]
# Predicted response time (ms) above which a request is flagged "Check".
SLOW_RESPONSE_THRESHOLD_MS = 250


def add_request_features(frame):
    """Return a copy of `frame` with the dummy numeric request features added.

    Args:
        frame: DataFrame with a string column "RequestData".

    Returns:
        A new DataFrame with two extra columns: "RequestLength" (character
        count of the request string) and "NumParams" (number of "="
        characters in it).
    """
    # NOTE(review): counting "=" only sees query-string parameters. The POST
    # bodies in this dataset are written as {key:value}, so they get
    # NumParams == 0.
    return frame.assign(
        RequestLength=frame["RequestData"].apply(len),
        NumParams=frame["RequestData"].apply(lambda request: request.count("=")),
    )


# For demonstration, we create dummy numerical features from request strings
df_api = add_request_features(df_api)
X = df_api[FEATURE_COLUMNS]
y_time = df_api["ActualResponseTime"]

# --- Step 2: Train Model to Predict Response Time ---
model_time = RandomForestRegressor(n_estimators=100, random_state=42)
model_time.fit(X, y_time)

# --- Step 3: Predict Expected Response Time for new requests ---
new_requests = [
    "GET /users?id=2",
    "POST /login {user:def, pwd:456}",
    "GET /orders?id=20"
]

df_new = add_request_features(pd.DataFrame({"RequestData": new_requests}))
X_new = df_new[FEATURE_COLUMNS]

df_new["PredictedResponseTime"] = model_time.predict(X_new)

print("\nPredicted API Response Times:")
display(df_new)

# --- Step 4: Flagging anomalies ---
# Rule: flag a request when its PREDICTED response time exceeds the
# threshold. The new requests have no measured response time, so no
# actual-vs-predicted comparison takes place here.
df_new["AnomalyFlag"] = np.where(
    df_new["PredictedResponseTime"] > SLOW_RESPONSE_THRESHOLD_MS, "Check", "OK"
)

print("\nAPI Testing with Anomaly Detection:")
display(df_new)

#  Optional: Save API predictions
df_new.to_csv("../data/api_response_predictions.csv", index=False)
print("\nAPI response predictions saved to '../data/api_response_predictions.csv'")


Unnamed: 0,RequestData,ActualResponseTime,ExpectedStatusCode
0,GET /users?id=1,120,200
1,"POST /login {user:abc, pwd:123}",250,200
2,GET /orders?id=10,150,200
3,"POST /payment {amount:100, currency:USD}",300,201
4,GET /products?category=electronics,180,200



Predicted API Response Times:


Unnamed: 0,RequestData,RequestLength,NumParams,PredictedResponseTime
0,GET /users?id=2,15,1,132.2
1,"POST /login {user:def, pwd:456}",31,0,246.4
2,GET /orders?id=20,17,1,146.0



API Testing with Anomaly Detection:


Unnamed: 0,RequestData,RequestLength,NumParams,PredictedResponseTime,AnomalyFlag
0,GET /users?id=2,15,1,132.2,OK
1,"POST /login {user:def, pwd:456}",31,0,246.4,OK
2,GET /orders?id=20,17,1,146.0,OK



API response predictions saved to '../data/api_response_predictions.csv'
