<a href="https://colab.research.google.com/github/ikoojos/Algorithm-Debt-Research/blob/master/Copy_of_LR_and_Custom_features_COmparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime and switch the working directory to
# the experiment folder (IPython `%cd` magic), so relative paths used by later
# cells resolve inside that folder.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/AD Final Experiments'

# General imports
import sys
import pandas as pd
import numpy as np
from itertools import product
import importlib
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score
)
from scipy.sparse import hstack, csr_matrix
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Custom modules
sys.path.append('/content/drive/My Drive/AD Final Experiments')
from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

# Reload custom modules to ensure latest updates.
# importlib.reload() re-executes a module in place, but names that were copied
# into this namespace by an earlier `from module import name` keep pointing at
# the pre-reload objects. The from-imports are therefore repeated after the
# reload so the notebook really uses the freshly loaded code.
for module in ['preprocessing', 'splitting', 'utils', 'evaluate_model', 'lr_tuning']:
    importlib.reload(sys.modules[module])

from preprocessing import preprocess_data
from splitting import split_data
from utils import *
from evaluate_model import evaluate_best_model
from lr_tuning import hyperparameter_tuning

# Ignore all warnings (note: this also hides scikit-learn convergence warnings)
warnings.filterwarnings("ignore")


Mounted at /content/drive
/content/drive/My Drive/AD Final Experiments


In [None]:
# Location of the processed dataset CSV on the mounted Drive.
# NOTE(review): "datset" is part of the file name as written here — confirm it
# matches the actual file on Drive.
file_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
# preprocess_data comes from the project's `preprocessing` module; its result is
# indexed by column name ('Comments', 'TDType') further down the notebook.
data = preprocess_data(file_path)
# Project-level splitter, currently disabled.
#X_train_final, X_val, X_test, y_train_final, y_val, y_test = split_data(data)

print("Data preprocessing Complete!")


Data preprocessing Complete!


In [None]:
# Work on the preprocessed frame: comments are the model input, TDType the label.
df = data
X = df['Comments'].map(lambda comment: comment.lower().strip())
y = df['TDType']

# Hold out 20% of the rows for the final test evaluation (fixed seed for reproducibility).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

class CustomFeatureExtractor(BaseEstimator, TransformerMixin):
    """Binary keyword-presence features for text comments.

    For every input text, one 0/1 column per keyword is produced, set to 1 when
    the keyword occurs as a substring of the lower-cased text.

    Parameters
    ----------
    keywords : sequence of str
        Keywords to look for. Repeated keywords share a single column.
    save_csv : bool, default False
        When true (and ``csv_path`` is set), ``transform`` also writes the
        texts, their features and the optional labels to ``csv_path``.
    csv_path : str or None, default None
        Destination of the CSV export.
    """

    def __init__(self, keywords, save_csv=False, csv_path=None):
        self.keywords = keywords
        self.save_csv = save_csv
        self.csv_path = csv_path

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return a sparse (n_texts, n_unique_keywords) matrix of 0/1 flags.

        ``y`` is only used for the optional CSV export, where it is written as
        the ``TDType`` column.
        """
        # Repeated keywords map to the same column name, so keep the first
        # occurrence only (order preserved).
        unique_keywords = list(dict.fromkeys(self.keywords))
        column_names = [f'contains_{kw}' for kw in unique_keywords]

        # Materialise once so X can be any iterable and is lower-cased only once per text.
        texts = list(X)
        lowered_texts = [text.lower() for text in texts]

        # The explicit reshape keeps the column count stable even when there are
        # no texts, so the output can always be stacked with other feature blocks.
        flags = np.array(
            [[int(kw in text) for kw in unique_keywords] for text in lowered_texts],
            dtype=np.int64,
        ).reshape(len(texts), len(unique_keywords))

        if self.save_csv and self.csv_path:
            combined_df = pd.concat(
                [
                    pd.DataFrame({'Comments': texts}),
                    pd.DataFrame(flags, columns=column_names),
                ],
                axis=1,
            )

            if y is not None:
                # Drop any original index so labels line up positionally with the texts.
                combined_df['TDType'] = pd.Series(y).reset_index(drop=True)

            combined_df.to_csv(self.csv_path, index=False)
            print(f"CSV saved to {self.csv_path}")

        # Return sparse matrix of features for the pipeline
        return csr_matrix(flags)

# Keywords whose presence in a comment is encoded as a binary feature.
keywords = [
    'shape', 'input', 'tensor', 'output', 'size', 'convolution', 'value',
    'efficient', 'matrix', 'model', 'node', 'function', 'batch',
]

# One-off export of the training comments with their keyword features and labels.
debug_extractor = CustomFeatureExtractor(
    keywords=keywords,
    save_csv=True,
    csv_path="Custom AD Features.csv",
)
debug_extractor.transform(X=X_train_raw, y=y_train)


# Bag-of-words counts and the keyword flags are concatenated column-wise and fed
# to a class-weighted logistic regression.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('vectorizer', CountVectorizer()),
        ('custom', CustomFeatureExtractor(keywords, save_csv=False))  # Disable saving in pipeline
    ])),
    ('clf', LogisticRegression(class_weight='balanced', random_state=42))
])

# Hyperparameter grid for the classifier step (3 x 1 x 2 = 6 candidates).
param_grid = {
    'clf__C': [0.01, 1, 10],
    'clf__penalty': ['l2'],
    'clf__max_iter': [100, 200]
}

# 5-fold cross-validated search on the training split, selected by macro F1.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search.fit(X_train_raw, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
# best_score_ is the mean score over the held-out CV folds of the best
# candidate, not a score measured on the training data itself.
print(f"Best Mean Cross-Validated F1 Score (Macro): {best_score:.2f}")

# Final check on the untouched test split.
y_pred_test = best_model.predict(X_test_raw)

print("\nEvaluation on Test Set:")
print(f"F1 Score (Macro): {f1_score(y_test, y_pred_test, average='macro'):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_test))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

CSV saved to Custom AD Features.csv
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters: {'clf__C': 10, 'clf__max_iter': 100, 'clf__penalty': 'l2'}
Best F1 Score (Macro) on Training Set: 0.66

Evaluation on Test Set:
F1 Score (Macro): 0.68
Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.51      0.57      0.54       200
         COMPATIBILITY       0.56      0.55      0.55        89
                DEFECT       0.50      0.59      0.54       135
                DESIGN       0.86      0.82      0.84      2206
         DOCUMENTATION       0.62      0.57      0.59        23
        IMPLEMENTATION       0.63      0.75      0.68       387
                  TEST       0.70      0.76      0.73       143
WITHOUT_CLASSIFICATION       0.97      0.96      0.96      4592

              accuracy                           0.88      7775
             macro avg       0.67      0.70      0.68      7775
       

In [None]:
import joblib

# Persist the tuned pipeline and read it straight back, so later cells can start
# from the saved file instead of retraining.
# joblib.load unpickles arbitrary objects: only load files you created yourself.
model_path = 'LR_CustomFeatures.pkl'
joblib.dump(best_model, model_path)
best_model = joblib.load(model_path)


In [None]:

# Raw example comments used to compare this model with the LLM labels.
# NOTE(review): this list holds 16 comments, whereas the label lists in the
# later comparison cells hold 24 entries each — confirm which comments are missing.
llm_examples = [
    "Adjust learning per minibatches at very beginning of training process this could be used to tackle the unstableness of ASGD",
    "TODO: We should be able to move instead of copy but it currently isn't strightforward due to redU and redVT being slices",
    "Hack for the tracer that allows us to represent RNNs as singlenodes and export them to ONNX in this form",
    "Linux gcc barfs on this ^^ for 'us = (double)((std::wstring)arg).size();' due to some ambiguity error (while it works fine with Visual Studio). If you encounter this, instead say 'us = (double)((const std::wstring&)arg).size();' with a &. Don't forget the const (I have seen broken typecasts without).",
    "TODO: fix libname for OSX / Windows",
    "TODO: just load 5.1, not 5.1.3TODO: dynamic version checks via cudnnGetVersion",
    "/* TODO: remove the extra copies of the input. These are only * used for debugging purposes during development and testing. */",
    "/*TODO: merge with call site*/ void BackpropToLeftS(Matrix<ElemType & input1FunctionValues, Matrix<ElemType & input0GradientValues, const Matrix<ElemType & gradientValues, Matrix<ElemType & tempMatrix) { tempMatrix.SetValue(gradientValues); tempMatrix.ColumnElementMultiplyWith(input1FunctionValues); input0GradientValues += tempMatrix;",
    "TODO vectorize mixed product",
    "TODO(b/73448937): Move all update damping code to a separate class/function.",
    "This really should be done in an external debugging tool",
    "======================================================================= ReshapeNode -- reshape input matrix TODO: Why is this in NonlinearityNodes.h? Should he linear algebra no? =======================================================================",
    "TODO(nsilberman): Documentation.",
    "todo: add assertion }",
    "TODO: add loading from checkpoint",
    "TODO(satok): Implement all possible cases.",
]

# Same normalisation as the training text: lower-case first, then trim whitespace.
llm_examples_clean = list(map(str.strip, map(str.lower, llm_examples)))


In [None]:
# Predict on LLM examples
llm_predictions = best_model.predict(llm_examples_clean)

# Print results
for comment, pred in zip(llm_examples, llm_predictions):
    print(f"Comment: {comment}\nYour Model Prediction: {pred}\n")

Comment: Adjust learning per minibatches at very beginning of training process this could be used to tackle the unstableness of ASGD
Your Model Prediction: ALGORITHM

Comment: FIXME NEON has 16 quad registers, but since the current register allocator is so bad, it is much better to reduce it to 8
Your Model Prediction: ALGORITHM

Comment: We keep track of the pending count and dead input count for each graph node. The representation used here is designed to be cache efficient for graphs with large numbers of nodes, where most nodes have relatively small maximum pending counts (e.g. for one LSTM model, 99% of 5000+ nodes had in-degrees of 3 or less). We use one byte to hold both the pending and dead count for a node where these together can fit in one byte, and we use a hash table to handle the rare node ids that need larger counts than this. TODO(yuanbyu): We current use O(# of nodes in partition) space even for nested iterations where only a small fraction of the nodes are involved. T

In [None]:
import pandas as pd

# Hand-compiled comparison table, one record per example comment:
# (truncated comment, ground truth, LLM label, this model's label)
table_rows = [
    ("Adjust learning per minibatches...", "ALGORITHM", "Algorithm Debt", "ALGORITHM"),
    ("FIXME NEON has 16 quad registers...", "ALGORITHM", "Algorithm Debt", "ALGORITHM"),
    ("We keep track of the pending count...", "ALGORITHM", "Algorithm Debt", "ALGORITHM"),
    ("TODO: We should be able to move...", "ALGORITHM", "Design Debt", "ALGORITHM"),
    ("note: the temp variable here...", "ALGORITHM", "Design Debt", "ALGORITHM"),
    ("Hack for the tracer that allows...", "ALGORITHM", "Design Debt", "ALGORITHM"),
    ("TODO we could do this much...", "ALGORITHM", "Algorithm Debt", "ALGORITHM"),
    ("#pragma omp parallel for...", "ALGORITHM", "Algorithm Debt", "ALGORITHM"),
    ("Linux gcc barfs on this...", "COMPATIBILITY", "Compatibility Debt", "COMPATIBILITY"),
    ("TODO: fix libname for OSX / Windows...", "COMPATIBILITY", "Compatibility Debt", "COMPATIBILITY"),
    ("/* TODO: remove the extra copies...", "DESIGN", "Design Debt", "DESIGN"),
    ("/*TODO: merge with call site*/...", "DESIGN", "Design Debt", "DESIGN"),
    ("TODO vectorize mixed product", "DESIGN", "Algorithm Debt", "IMPLEMENTATION"),
    ("TODO: if p2p isn't supported...", "DESIGN", "Requirement Debt", "DESIGN"),
    ("TODO(b/73448937): Move all...", "DESIGN", "Design Debt", "DESIGN"),
    ("This really should be done in an...", "DESIGN", "Design Debt", "DESIGN"),
    ("ReshapeNode -- reshape input matrix...", "DESIGN", "Design Debt", "DESIGN"),
    ("TODO(xpan): Make it text proto...", "DOCUMENTATION", "Requirement Debt", "DOCUMENTATION"),
    ("/** * @brief Manages memory allocation...", "DOCUMENTATION", "Documentation Debt", "DOCUMENTATION"),
    ("TODO(nsilberman): Documentation.", "DOCUMENTATION", "Documentation Debt", "DOCUMENTATION"),
    ("/*l*/) todo: add assertion }", "IMPLEMENTATION", "Test Debt", "IMPLEMENTATION"),
    ("TODO: add loading from checkpoint", "IMPLEMENTATION", "Requirement Debt", "IMPLEMENTATION"),
    ("TODO(satok): Implement all...", "IMPLEMENTATION", "Requirement Debt", "IMPLEMENTATION"),
    ("Buffers for constants are ignored...", "IMPLEMENTATION", "Requirement Debt", "IMPLEMENTATION"),
]

# Split the records back into one list per column, under the usual names.
comments, ground_truth, llm_prediction, your_prediction = (
    list(column) for column in zip(*table_rows)
)

# Create a DataFrame
df = pd.DataFrame({
    'Comment': comments,
    'Ground Truth': ground_truth,
    'LLM Prediction': llm_prediction,
    'Your Prediction': your_prediction,
})

# Print the DataFrame
print(df)

                                      Comment    Ground Truth  \
0          Adjust learning per minibatches...       ALGORITHM   
1         FIXME NEON has 16 quad registers...       ALGORITHM   
2       We keep track of the pending count...       ALGORITHM   
3          TODO: We should be able to move...       ALGORITHM   
4             note: the temp variable here...       ALGORITHM   
5          Hack for the tracer that allows...       ALGORITHM   
6               TODO we could do this much...       ALGORITHM   
7                 #pragma omp parallel for...       ALGORITHM   
8                  Linux gcc barfs on this...   COMPATIBILITY   
9      TODO: fix libname for OSX / Windows...   COMPATIBILITY   
10        /* TODO: remove the extra copies...          DESIGN   
11          /*TODO: merge with call site*/...          DESIGN   
12               TODO vectorize mixed product          DESIGN   
13            TODO: if p2p isn't supported...          DESIGN   
14              TODO(b/73

In [None]:
# Ground-truth label for each example comment, in the same order as llm_examples.
ground_truth = [
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "ALGORITHM",
    "COMPATIBILITY",
    "COMPATIBILITY",
    "DESIGN",
    "DESIGN",
    "DESIGN",
    "DESIGN",
    "DESIGN",
    "DESIGN",
    "DESIGN",
    "DOCUMENTATION",
    "DOCUMENTATION",
    "DOCUMENTATION",
    "IMPLEMENTATION",
    "IMPLEMENTATION",
    "IMPLEMENTATION",
    "IMPLEMENTATION"
]

# Hand-recorded LLM labels for the same comments. Note that this reuses the name
# that earlier held the model's own predictions.
llm_predictions = [
    "Algorithm Debt",
    "Algorithm Debt",
    "Algorithm Debt",
    "Design Debt",
    "Design Debt",
    "Design Debt",
    "Algorithm Debt",
    "Algorithm Debt",
    "Compatibility Debt",
    "Compatibility Debt",
    "Design Debt",
    "Design Debt",
    "Algorithm Debt",
    "Requirement Debt",
    "Design Debt",
    "Design Debt",
    "Design Debt",
    "Requirement Debt",
    "Documentation Debt",
    "Documentation Debt",
    "Test Debt",
    "Requirement Debt",
    "Requirement Debt",
    "Requirement Debt"
]

# Recompute the model's predictions here: `pred` left over from the printing
# loop is a single label (the last one) and would be repeated on every row.
model_predictions = best_model.predict(llm_examples_clean)

# Fail early, with the actual sizes, when the columns do not line up row by row.
column_lengths = {
    "Comment": len(llm_examples),
    "Ground Truth": len(ground_truth),
    "LLM Prediction": len(llm_predictions),
    "Your Model Prediction": len(model_predictions),
}
if len(set(column_lengths.values())) != 1:
    raise ValueError(f"Comparison columns differ in length: {column_lengths}")

comparison_df = pd.DataFrame({
    "Comment": llm_examples,
    "Ground Truth": ground_truth,
    "LLM Prediction": llm_predictions,
    "Your Model Prediction": model_predictions
})

print(comparison_df)


                                              Comment    Ground Truth  \
0   Adjust learning per minibatches at very beginn...       ALGORITHM   
1   FIXME NEON has 16 quad registers, but since th...       ALGORITHM   
2   We keep track of the pending count and dead in...       ALGORITHM   
3   TODO: We should be able to move instead of cop...       ALGORITHM   
4   note: the temp variable here gets completely e...       ALGORITHM   
5   Hack for the tracer that allows us to represen...       ALGORITHM   
6   TODO we could do this much more efficiently, w...       ALGORITHM   
7   #pragma omp parallel for TODO: Depending in ci...       ALGORITHM   
8   Linux gcc barfs on this ^^ for 'us = (double)(...   COMPATIBILITY   
9   TODO: fix libname for OSX / WindowsTODO: just ...   COMPATIBILITY   
10  /* TODO: remove the extra copies of the input....          DESIGN   
11  /*TODO: merge with call site*/ void BackpropTo...          DESIGN   
12                       TODO vectorize mixed produ

## Step 1: Prepare the LLM Examples as a “Test Set”

In [None]:
# Example comments from LLM paper
# Example comments taken from the LLM paper, kept in their original casing.
llm_examples = [
    "Either a tensor pointer (pass-by-reference) or a tensor (pass-by-value). TODO(yuanbyu): A better way to do has_value?",
    "/*! \\brief path to the csv file */",
    "Declare node, internal data structure.",
    "TODO: actually, as long as the type is floating point, we can",
    "Reorder Cast and Transpose if beneficial. A common pattern after the layout optimizer is casting an uint8 NHWC image to float before transposing it to NCHW.",
    # ... add the rest of the examples you want to compare
]

# Same normalisation as the training text: lower-case first, then trim whitespace.
llm_examples_clean = list(map(str.strip, map(str.lower, llm_examples)))


## Step 2: Predict Using Your Trained Model

In [None]:
# Predict on LLM examples
llm_predictions = best_model.predict(llm_examples_clean)

# Print results
for comment, pred in zip(llm_examples, llm_predictions):
    print(f"Comment: {comment}\nYour Model Prediction: {pred}\n")


Comment: Either a tensor pointer (pass-by-reference) or a tensor (pass-by-value). TODO(yuanbyu): A better way to do has_value?
Your Model Prediction: DESIGN

Comment: /*! \brief path to the csv file */
Your Model Prediction: WITHOUT_CLASSIFICATION

Comment: Declare node, internal data structure.
Your Model Prediction: WITHOUT_CLASSIFICATION

Comment: TODO: actually, as long as the type is floating point, we can
Your Model Prediction: DESIGN

Comment: Reorder Cast and Transpose if beneficial. A common pattern after the layout optimizer is casting an uint8 NHWC image to float before transposing it to NCHW.
Your Model Prediction: WITHOUT_CLASSIFICATION



## Create Comparison Table

In [None]:
import pandas as pd

# Ground-truth label for each example comment, in the same order as llm_examples.
ground_truth = [
    "DESIGN",
    "WITHOUT_CLASSIFICATION",
    "WITHOUT_CLASSIFICATION",
    "DESIGN",
    "DESIGN"
    # ... corresponding ground truths for each example
]

# Hand-recorded LLM labels. This reuses the name that held the model's own
# predictions in the prediction cell, so those are recomputed below.
llm_predictions = [
    "Design Debt",
    "Documentation Debt",
    "Documentation Debt",
    "Requirement Debt",
    "Algorithm Debt"
    # ... corresponding LLM predictions
]

# The model column must come from the model itself; filling it from
# llm_predictions would only show the LLM labels a second time.
model_predictions = best_model.predict(llm_examples_clean)

comparison_df = pd.DataFrame({
    "Comment": llm_examples,
    "Ground Truth": ground_truth,
    "LLM Prediction": llm_predictions,
    "Your Model Prediction": model_predictions
})

print(comparison_df)


                                             Comment            Ground Truth  \
0  Either a tensor pointer (pass-by-reference) or...                  DESIGN   
1                 /*! \brief path to the csv file */  WITHOUT_CLASSIFICATION   
2             Declare node, internal data structure.  WITHOUT_CLASSIFICATION   
3  TODO: actually, as long as the type is floatin...                  DESIGN   
4  Reorder Cast and Transpose if beneficial. A co...                  DESIGN   

       LLM Prediction Your Model Prediction  
0         Design Debt           Design Debt  
1  Documentation Debt    Documentation Debt  
2  Documentation Debt    Documentation Debt  
3    Requirement Debt      Requirement Debt  
4      Algorithm Debt        Algorithm Debt  


## Evaluate Accuracy

In [None]:
from sklearn.metrics import accuracy_score, f1_score


def _to_model_label(label):
    """Convert an LLM-style label such as 'Design Debt' to the 'DESIGN' format.

    Labels that are already in the upper-case format are returned unchanged.
    LLM categories with no counterpart in the ground truth (for example
    'Requirement Debt' -> 'REQUIREMENT') are kept and simply never match.
    """
    text = label.strip()
    suffix = ' debt'
    if text.lower().endswith(suffix):
        text = text[:-len(suffix)]
    return text.strip().upper().replace(' ', '_')


# Map text labels to the same format as your model before scoring: comparing
# 'Design Debt' with 'DESIGN' directly can never match, which forces both
# metrics to 0.
llm_predictions_normalised = [_to_model_label(label) for label in llm_predictions]

f1 = f1_score(ground_truth, llm_predictions_normalised, average='macro')
acc = accuracy_score(ground_truth, llm_predictions_normalised)

print(f"F1 Score (Macro) on LLM Examples: {f1:.2f}")
print(f"Accuracy on LLM Examples: {acc:.2f}")


F1 Score (Macro) on LLM Examples: 0.00
Accuracy on LLM Examples: 0.00
