In [None]:
 import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount the user's Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder on the mounted drive
# (%cd is an IPython magic, so this cell only runs inside IPython/Colab).
%cd '/content/drive/My Drive/AD Identification using SATD'

Mounted at /content/drive
/content/drive/My Drive/AD Identification using SATD


In [None]:
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

import pandas as pd


# Read the dataset


In [None]:
# Load the processed Liu dataset from the project folder on Drive.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference on columns.
liu_ = pd.read_csv(
    '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv',
    low_memory=False,
)

In [None]:
# Labels that are folded into the catch-all class. 'nan' is the string that
# astype(str) produces for a missing label.
values_to_remove = ['MULTITHREAD', 'nan', 'removeType']
replacement_value = 'WITHOUT_CLASSIFICATION'

# Assign the results back instead of calling fillna/replace with inplace=True
# on a column selection: that form operates on an intermediate object, is
# deprecated in pandas 2.x, and leaves the DataFrame untouched when
# Copy-on-Write is enabled.
liu_['Comments'] = liu_['Comments'].fillna('')
liu_['TDType'] = (
    liu_['TDType']
    .astype(str)
    .replace(values_to_remove, replacement_value)
)

# Strip the literal 'content=' marker and double quotes from the comment text.
liu_['Comments'] = (
    liu_['Comments']
    .str.replace('content=', '', regex=False)
    .str.replace('"', '', regex=False)
)

# Keep one row per (comment, label) pair. 'removeType' has already been
# remapped through values_to_remove, so no second replace is needed here.
liu_ = liu_.drop_duplicates(subset=['Comments', 'TDType'])

In [None]:
# Flag rows that repeat an earlier row exactly (every column is compared and
# the first occurrence is not flagged), then tally the flags.
duplicate_mask = liu_.duplicated()
num_duplicates = duplicate_mask.sum()

print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


## Hashing vectorizer features (HashingVectorizer + SVM)

In [None]:
data = liu_

# Hold out 20% of the rows as the test set, then carve 20% of the remaining
# rows off as a validation set (64% / 16% / 20% overall). Both splits are
# stratified on the label: the TDType distribution is heavily skewed, and an
# unstratified draw can leave the rarest classes under-represented in a split.
# This also matches the StratifiedKFold used for cross-validation below.
X_train, X_test, y_train, y_test = train_test_split(
    data['Comments'], data['TDType'],
    test_size=0.2, random_state=42, stratify=data['TDType'],
)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=42, stratify=y_train,
)

# Hashed bag-of-words -> variance scaling -> polynomial-kernel SVM.
# with_mean=False keeps the hashed matrix sparse (centering would densify it).
# NOTE(review): probability=True is kept because cells outside this one may
# call predict_proba; only predict() is used here, and the flag adds an
# internal cross-validation to every fit. Drop it if nothing depends on it.
pipeline = Pipeline([
    ('HASH', HashingVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', SVC(random_state=42, kernel='poly', class_weight='balanced', probability=True)),
])

# Only the SVM regularization strength is tuned.
param_grid = {
    'clf__C': [0.01, 1],
}

# NOTE(review): candidates are ranked by plain accuracy, which the majority
# class dominates on this data; consider 'f1_macro' if minority-class recall
# is the goal.
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)

# Cross-validated search on the training portion only; validation and test
# rows are never seen during fitting.
grid_search.fit(X_train_final, y_train_final)

print("Best parameters found:", grid_search.best_params_)


def _report_split(model, X, y_true, split_name):
    """Predict ``X`` with ``model`` and print its confusion matrix and report.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : inputs to classify.
    y_true : ground-truth labels aligned with ``X``.
    split_name : label used in the printed headings (e.g. "Validation").

    Returns
    -------
    tuple
        ``(y_pred, conf_matrix, report)`` - the predictions, the confusion
        matrix and the classification-report text.
    """
    y_pred = model.predict(X)
    conf_matrix = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(f"\n{split_name} Confusion Matrix:")
    print(conf_matrix)
    print(f"\n{split_name} Classification Report:")
    print(report)
    return y_pred, conf_matrix, report


# best_estimator_ is the pipeline refit on all of X_train_final with the
# winning parameters.
best_model = grid_search.best_estimator_

# Evaluate the best model on the validation set
y_val_pred, conf_matrix_val, classification_rep_val = _report_split(
    best_model, X_val, y_val, "Validation"
)

# Evaluate on the test set
y_test_pred, conf_matrix_test, classification_rep_test = _report_split(
    best_model, X_test, y_test, "Test"
)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best parameters found: {'clf__C': 1}

Validation Confusion Matrix:
[[  24    0    0    3    0    0    0  116]
 [   0    6    2    2    0    0    0   57]
 [   0    0   16    4    1    0    1   77]
 [  10    4    3  161    1    9    5 1523]
 [   0    0    0    0    6    0    0   14]
 [   0    0    1    5    0   41    3  298]
 [   1    0    0    2    0    2   14   86]
 [   8    3    2   26    1    4    0 3678]]

Validation Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.56      0.17      0.26       143
         COMPATIBILITY       0.46      0.09      0.15        67
                DEFECT       0.67      0.16      0.26        99
                DESIGN       0.79      0.09      0.17      1716
         DOCUMENTATION       0.67      0.30      0.41        20
        IMPLEMENTATION       0.73      0.12      0.20       348
                  TEST       0.61      0.13 