In [None]:
 import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive, then make the
# project folder the working directory for the rest of the notebook.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/AD Identification using SATD'

Mounted at /content/drive
/content/drive/My Drive/AD Identification using SATD


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.svm import SVC

## Read and preprocess the dataset

In [None]:
# Load the dataset
liu_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
liu_ = pd.read_csv(liu_path, low_memory=False)

# Replace missing comments with an empty string and force labels to str.
# The results are assigned back instead of calling `.fillna(..., inplace=True)`
# on `liu_['Comments']`: an in-place call on a column selection is chained
# assignment, which pandas deprecates and which no longer updates the parent
# frame under Copy-on-Write (the warning would be hidden by the global filter).
liu_['Comments'] = liu_['Comments'].fillna('')
liu_['TDType'] = liu_['TDType'].astype(str)

# Fold the labels that are not modelled separately into the catch-all class.
# 'nan' is presumably the string that astype(str) yields for missing labels.
values_to_remove = ['MULTITHREAD', 'nan', 'removeType']
liu_['TDType'] = liu_['TDType'].replace(values_to_remove, 'WITHOUT_CLASSIFICATION')

# Remove duplicates based on 'Comments' and 'TDType'
liu_ = liu_.drop_duplicates(subset=['Comments', 'TDType'])

# Clean up the 'Comments' column: strip the literal 'content=' marker and all
# double quotes.
# NOTE(review): this runs after de-duplication, so comments that only differed
# by these tokens can become duplicates again - confirm whether that matters.
liu_['Comments'] = liu_['Comments'].str.replace('content=', '', regex=False)
liu_['Comments'] = liu_['Comments'].str.replace('"', '', regex=False)


In [None]:
# Sanity check: flag rows that are identical to an earlier row across *all*
# columns (first occurrence is treated as the original), then report the count.
duplicate_mask = liu_.duplicated(keep='first')
num_duplicates = duplicate_mask.sum()
print(f"Number of duplicate rows: {num_duplicates}")


Number of duplicate rows: 0


# Logistic Regression with HashingVectorizer (LR HASH)

In [None]:
data = liu_

# Hold out 20% of the data as the test set, then split a further 20% of the
# remaining training data off as a validation set.
# NOTE(review): neither split is stratified, so rare TDType classes may be
# unevenly represented across train/validation/test.
X_train, X_test, y_train, y_test = train_test_split(
    data['Comments'], data['TDType'], test_size=0.2, random_state=42)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

# Pipeline: hashed token features -> variance scaling -> logistic regression.
# StandardScaler uses with_mean=False because the vectorizer output is sparse
# and centering is not supported on sparse matrices.
pipeline = Pipeline([
    ('Hash', HashingVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'))
])

# Hyperparameter grid for the logistic regression step.
# Only 'l2' is searched: the default 'lbfgs' solver rejects 'elasticnet', so
# every elastic-net candidate failed to fit and was scored NaN. Searching
# elastic-net would need solver='saga' plus an 'l1_ratio' grid.
# 'clf__max_iter' overrides the max_iter=1000 set on the estimator above.
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],  # Inverse regularisation strength
    'clf__penalty': ['l2'],  # Regularisation penalty type
    'clf__max_iter': [100, 200, 500]  # Number of iterations for convergence
}

# Set up GridSearchCV with StratifiedKFold and the parameter grid
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
                           scoring='accuracy',
                           n_jobs=-1,
                           verbose=2)

# Model selection only ever sees the inner training split.
grid_search.fit(X_train_final, y_train_final)

print("Best parameters found:", grid_search.best_params_)

# Evaluate the best model (refit on the full inner training split) on the
# validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
conf_matrix_val = confusion_matrix(y_val, y_val_pred)
classification_rep_val = classification_report(y_val, y_val_pred)

print("\nValidation Confusion Matrix:")
print(conf_matrix_val)
print("\nValidation Classification Report:")
print(classification_rep_val)

# Evaluate the same model on the held-out test set
y_test_pred = best_model.predict(X_test)
conf_matrix_test = confusion_matrix(y_test, y_test_pred)
classification_rep_test = classification_report(y_test, y_test_pred)

print("\nTest Confusion Matrix:")
print(conf_matrix_test)
print("\nTest Classification Report:")
print(classification_rep_test)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters found: {'clf__C': 10, 'clf__max_iter': 200, 'clf__penalty': 'l2'}

Validation Confusion Matrix:
[[  70    1    0   49    0    4    0   19]
 [   3   37    1   19    0    3    0    4]
 [   3    5   58   15    1    4    6    7]
 [  69   38   50 1325    4  120   18   92]
 [   2    0    0    3   12    1    1    1]
 [   5    3    5   72    4  240    3   16]
 [   1    3    1   13    1    2   80    4]
 [  49   17    9   94    1   12   21 3519]]

Validation Classification Report:
                        precision    recall  f1-score   support

             ALGORITHM       0.35      0.49      0.41       143
         COMPATIBILITY       0.36      0.55      0.43        67
                DEFECT       0.47      0.59      0.52        99
                DESIGN       0.83      0.77      0.80      1716
         DOCUMENTATION       0.52      0.60      0.56        20
        IMPLEMENTATION       0.62      0.69      0.65       3