In [None]:
import warnings

# Ignore all warnings
# NOTE(review): this blanket filter silences every warning category for the
# whole session, so any deprecation or chained-assignment warnings that later
# cells may raise are hidden too; consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive inside the Colab VM, then switch the working directory to
# the project folder so that relative paths used later in the notebook
# (e.g. 'roberta_embeddings.csv') resolve inside it.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/AD Identification using SATD'

Mounted at /content/drive
/content/drive/My Drive/AD Identification using SATD


# Import basic libraries

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import numpy as np


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression


# Read the datasets

In [None]:
# Load the pre-processed Liu dataset; the location gets its own name so that
# `liu_` only ever refers to the DataFrame.
liu_csv_path = '/content/drive/My Drive/AD Identification using SATD/liu_datset_processed.csv'
liu_ = pd.read_csv(liu_csv_path, low_memory=False)

In [None]:
# Normalise missing comments and force labels to strings.
# The results are assigned back to the column rather than calling
# `liu_[col].method(..., inplace=True)`: that chained in-place form operates on
# a temporary Series under pandas Copy-on-Write and leaves `liu_` unchanged
# (pandas 2.x only emits a FutureWarning, which this notebook suppresses).
liu_['Comments'] = liu_['Comments'].fillna('')
liu_['TDType'] = liu_['TDType'].astype(str)

# Replace values with 'WITHOUT_CLASSIFICATION'
# ('nan' is the text that astype(str) leaves behind for missing labels).
values_to_remove = ['MULTITHREAD', 'nan', 'removeType']
replacement_value = 'WITHOUT_CLASSIFICATION'
liu_['TDType'] = liu_['TDType'].replace(values_to_remove, replacement_value)

# Remove every literal 'content=' occurrence and every double quote from the
# comment text (plain substring replacement, not regex).
liu_['Comments'] = liu_['Comments'].str.replace('content=', '', regex=False)
liu_['Comments'] = liu_['Comments'].str.replace('"', '', regex=False)

In [None]:
# Keep the first row of every (Comments, TDType) pair, then map any remaining
# 'removeType' label onto the catch-all class.
liu_ = (
    liu_
    .drop_duplicates(subset=['Comments', 'TDType'])
    .assign(TDType=lambda frame: frame['TDType'].replace('removeType', 'WITHOUT_CLASSIFICATION'))
)

In [None]:
# Tally rows that repeat an earlier row across *all* columns. This is stricter
# than the (Comments, TDType) subset used when de-duplicating.
duplicate_mask = liu_.duplicated()
num_duplicates = duplicate_mask.sum()

print(f"Number of duplicate rows: {num_duplicates}")

Number of duplicate rows: 0


RoBERTa


In [None]:
pip install transformers




In [None]:
import wandb

# Disable wandb logging
# The run is initialised in "disabled" mode before the Trainer is created,
# presumably so the Trainer's Weights & Biases integration neither prompts for
# a login nor uploads metrics — TODO confirm against the installed versions.
wandb.init(mode="disabled")

In [None]:
data = liu_  # alias, not a copy: the 'label' column added below lands on liu_ too

# Give every distinct TDType an integer id, numbered in order of first appearance
td_types = data['TDType'].unique()
class_mapping = dict(zip(td_types, range(len(td_types))))
data['label'] = data['TDType'].map(class_mapping)

# Stratified hold-out: 80% for training, 20% kept aside as a temporary pool
X_train, X_temp, y_train, y_temp = train_test_split(
    data['Comments'],
    data['label'],
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# NOTE(review): test_size=0.2 splits the 20% pool into roughly 16% validation
# and 4% test of the full data; use 0.5 here if an 80/10/10 split was intended.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    random_state=42,
    stratify=y_temp,
)

# Pretrained RoBERTa tokenizer plus a classification head sized to the label set
roberta_checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
model = RobertaForSequenceClassification.from_pretrained(
    roberta_checkpoint,
    num_labels=len(class_mapping),
)


class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per access.

    Args:
        texts: pandas Series (anything supporting ``len()`` and ``.iloc``)
            holding the raw texts.
        labels: pandas Series of integer class ids, positionally aligned with
            ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: length every sample is padded/truncated to.

    Raises:
        ValueError: if ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError (or
        # silently ignored labels) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position ``idx`` as a dict of tensors."""
        # Positional access: the inputs may carry arbitrary index labels.
        text = str(self.texts.iloc[idx])
        label = self.labels.iloc[idx]

        # Tokenize the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Drop only the leading batch axis that return_tensors='pt' adds. A bare
        # .squeeze() would also collapse the sequence axis when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Wrap each split in a CustomDataset that shares the same tokenizer
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the checkpoint with the best validation accuracy when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_options = {
    'output_dir': './results',
    'num_train_epochs': 30,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 16,
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
    'logging_dir': './logs',
    'logging_steps': 10,
    'load_best_model_at_end': True,
    'metric_for_best_model': "accuracy",
    'greater_is_better': True,
}
training_args = TrainingArguments(**training_options)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference class ids) and
            ``predictions`` (scores that are arg-maxed over the last axis).

    Returns:
        dict with 'accuracy' and the weighted-average 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    scores = {'accuracy': accuracy_score(y_true, y_hat)}
    scores['f1'] = f1_score(y_true, y_hat, average='weighted')
    return scores

# Initialise the Trainer; training stops early after 3 evaluations without
# improvement of the tracked metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


trainer.train()

# Evaluate the model on the test set
test_results = trainer.predict(test_dataset)

# Classification report
y_test_pred = np.argmax(test_results.predictions, axis=1)
print("\nClassification Report on Test Set:")
# `labels` is passed explicitly so each name stays tied to its id. Without it
# the report raises a length-mismatch error whenever a class is absent from
# both y_test and the predictions (possible with the small test split).
print(classification_report(
    y_test,
    y_test_pred,
    labels=list(class_mapping.values()),
    target_names=list(class_mapping.keys()),
))

# Extract and save embeddings
def extract_embeddings(model, dataset, tokenizer, batch_size=16):
    """Return the first-token encoder hidden state for every sample.

    Args:
        model: model exposing its encoder as ``model.roberta`` and its
            placement as ``model.device``.
        dataset: map-style dataset whose items are dicts with 'input_ids' and
            'attention_mask' tensors.
        tokenizer: unused; kept so existing call sites keep working.
        batch_size: samples per forward pass. Defaults to 16, the value that
            was previously hard-coded.

    Returns:
        2-D numpy array with one row per sample, in dataset order.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    embeddings = []
    dataloader = DataLoader(dataset, batch_size=batch_size)

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # inference only, no gradients needed
        for batch in dataloader:
            inputs = {
                'input_ids': batch['input_ids'].to(model.device),
                'attention_mask': batch['attention_mask'].to(model.device),
            }
            outputs = model.roberta(**inputs)
            # outputs[0] is indexed as (batch, position, hidden); position 0 is
            # the leading special token (the [CLS] equivalent).
            hidden_states = outputs[0][:, 0, :]
            embeddings.append(hidden_states.cpu().numpy())

    return np.concatenate(embeddings, axis=0)


# Stitch the splits back together (train, val, test order) with a fresh index
split_texts = [X_train, X_val, X_test]
split_labels = [y_train, y_val, y_test]
all_data = pd.concat(split_texts).reset_index(drop=True)
all_labels = pd.concat(split_labels).reset_index(drop=True)

# TODO: extract embeddings from the training split (X_train) only instead of
# concatenating all three splits.
full_dataset = CustomDataset(all_data, all_labels, tokenizer)
embeddings = extract_embeddings(model, full_dataset, tokenizer)

# Persist one row per sample: the embedding dimensions followed by the class id
embeddings_df = pd.DataFrame(embeddings).assign(label=all_labels.values)
embeddings_df.to_csv('roberta_embeddings.csv', index=False)

print("Embeddings saved to 'roberta_embeddings.csv'.")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4652,0.454162,0.864677,0.842237
2,0.5197,0.437923,0.878312,0.864192
3,0.2713,0.421992,0.886545,0.880445
4,0.2773,0.407822,0.884744,0.886527
5,0.2361,0.438587,0.896064,0.89256
6,0.0827,0.560608,0.897093,0.89258
7,0.2486,0.548787,0.882943,0.886715
8,0.2096,0.542746,0.902495,0.895973
9,0.0399,0.559649,0.909442,0.907226
10,0.0903,0.622436,0.901724,0.899232



Classification Report on Test Set:
                        precision    recall  f1-score   support

             ALGORITHM       0.48      0.40      0.44        94
         COMPATIBILITY       0.63      0.42      0.51        45
                DEFECT       0.63      0.56      0.59        66
                DESIGN       0.84      0.89      0.87      1089
         DOCUMENTATION       0.63      0.75      0.69        16
        IMPLEMENTATION       0.79      0.68      0.73       197
                  TEST       0.76      0.74      0.75        65
WITHOUT_CLASSIFICATION       0.97      0.97      0.97      2316

              accuracy                           0.90      3888
             macro avg       0.72      0.68      0.69      3888
          weighted avg       0.90      0.90      0.90      3888

Embeddings saved to 'roberta_embeddings.csv'.


In [None]:
data = pd.read_csv('roberta_embeddings.csv')