In [None]:
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from contract_cleaner_faster import SourceCodeCleanerAndFormatter
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import torch
from torch import nn
import numpy as np
import re
import pandas as pd

In [None]:
# Build the list of extra tokenizer tokens from the single-quoted literals
# found in the Solidity lexer grammar file.
cleaner = SourceCodeCleanerAndFormatter("SolidityLexer.g4")
cleaner.read_input_file()
cleaner.remove_comments()
file_content = cleaner.source_code
# Use a regular expression to extract the values enclosed in single quotes
quoted_values = re.findall(r"'([^']*)'", file_content)
# Discard matches that span a line break
# (presumably mismatched quotes rather than real literals -- TODO confirm)
filtered_list = [element for element in quoted_values if '\n' not in element]
# Deduplicate in sorted order. list(set(...)) depends on string hashing, which
# varies between interpreter sessions, so the ids assigned by add_tokens()
# would not be reproducible when a checkpoint is reloaded in a new kernel.
unique_list = sorted(set(filtered_list))

In [None]:
# Load the labelled dataset; only the "Text" and "label" columns are used.
df = pd.read_csv("output2.csv")
# Drop a row only when one of the columns we actually use is missing, so a
# NaN in some unrelated column does not silently shrink the dataset.
df = df.dropna(subset=["Text", "label"])
# The tokenizer only accepts strings; coerce in case pandas parsed a cell as
# a number.
texts = df["Text"].astype(str).tolist()
labels = df["label"].astype(int).tolist()

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its
            result must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened ``input_ids``/``attention_mask`` and the ``label`` tensor for ``idx``."""
        text, label = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        encoded = self.tokenizer(text, **tokenizer_options)

        # flatten() collapses the tokenizer output to one dimension per item
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(label)
        return sample


In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Name passed to ``from_pretrained`` for both the model
            and its tokenizer.
        num_classes: Number of output logits.
        new_vocab: Tokens added to the tokenizer; the embedding matrix is
            resized to the resulting tokenizer length.
    """

    def __init__(self, bert_model_name, num_classes, new_vocab):
        super().__init__()

        # Pretrained encoder and its matching tokenizer
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_name)

        # Grow the tokenizer vocabulary, then make the embedding matrix match
        self.tokenizer.add_tokens(new_vocab)
        extended_vocab_size = len(self.tokenizer)
        self.bert.resize_token_embeddings(extended_vocab_size)

        # Classification head
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the encoder's pooled output."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoder_out.pooler_output))

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning class logits.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        float: Mean cross-entropy loss over the batches (0.0 if
        ``data_loader`` yields no batches).
    """
    model.train()
    # Build the loss module once instead of on every batch
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    num_batches = 0
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        num_batches += 1
    return running_loss / num_batches if num_batches else 0.0

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify ``text`` and return ``"positive"`` or ``"negative"``.

    The model is switched to eval mode, the text is tokenized with
    padding/truncation to ``max_length``, and the index of the largest logit
    is taken as the prediction: index 1 maps to ``"positive"``, anything else
    to ``"negative"``.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }

    with torch.no_grad():
        logits = model(**model_inputs)
        # torch.max over dim=1 returns (values, indices); keep the indices
        predicted_class = torch.max(logits, dim=1)[1]

    if predicted_class.item() == 1:
        return "positive"
    return "negative"

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Returns:
        tuple: ``(accuracy_score, classification_report)`` computed from the
        collected true labels and the arg-max predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=ids, attention_mask=mask)
            # torch.max over dim=1 returns (values, indices); keep the indices
            batch_preds = torch.max(logits, dim=1)[1]
            predicted += batch_preds.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
# Hold out 20% of the samples for validation. The split is stratified on the
# labels so both subsets keep the same class proportions; the classes are
# imbalanced, and an unstratified split can skew the validation metrics.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [None]:
# Set up parameters
bert_model_name = 'bert-base-uncased'  # pretrained checkpoint for model and tokenizer
num_classes = 2                        # number of output logits
max_length = 128                       # tokens per sample after padding/truncation
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

# Seed the random number generators so that batch shuffling, dropout and the
# classification-head initialisation are repeatable between runs.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

In [None]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# Register the extra tokens on the tokenizer that actually encodes the data.
# BERTClassifier resizes its embedding matrix for these tokens, but without
# this call the datasets would never emit their ids and the new embedding rows
# would stay unused. Sorting fixes the token -> id assignment so it is the
# same in every session (needed when a saved checkpoint is reloaded).
tokenizer.add_tokens(sorted(unique_list))

train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
# Only the training data is shuffled; validation keeps its original order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier with the extended vocabulary and place it on that device.
model = BERTClassifier(bert_model_name, num_classes, unique_list)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to
# the values the transformers implementation used by default (1e-6 and 0.0),
# because torch's own defaults differ (1e-8 and 0.01).
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# The scheduler is stepped once per batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
import time

# Wall-clock reference for the cumulative timing printed after each epoch
start = time.time()
for epoch in range(num_epochs):
    epoch_number = epoch + 1
    print(f"Epoch {epoch_number}/{num_epochs}")

    # One optimisation pass over the training data
    train(model, train_dataloader, optimizer, scheduler, device)

    # Score the updated model on the validation data and report the result
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

    # Checkpoint the weights after every epoch
    model_save_path = f"model_epoch_{epoch_number}.pth"
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved at: {model_save_path}")

    # Seconds elapsed since training started
    elapsed = time.time() - start
    print(elapsed)

Epoch 1/4


100%|██████████| 541/541 [3:17:45<00:00, 21.93s/it]


Validation Accuracy: 0.7833
              precision    recall  f1-score   support

           0       0.90      0.14      0.24       536
           1       0.78      1.00      0.87      1628

    accuracy                           0.78      2164
   macro avg       0.84      0.57      0.56      2164
weighted avg       0.81      0.78      0.72      2164

Model saved at: model_epoch_1.pth
12839.026667118073
Epoch 2/4


 96%|█████████▌| 520/541 [3:08:00<07:25, 21.22s/it]

In [None]:
# Resolve the target device here as well, so this cell does not depend on the
# training cells having been run in the current session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the checkpoint. map_location remaps the stored tensors onto the current
# device; without it, a checkpoint saved from a GPU run cannot be loaded on a
# CPU-only machine.
checkpoint = torch.load('model_epoch_1.pth', map_location=device)

# Rebuild the architecture with the same extra vocabulary, so the resized
# embedding matrix has the shape stored in the checkpoint.
model = BERTClassifier(bert_model_name, num_classes, unique_list)

# Load the state_dict into the model
model.load_state_dict(checkpoint)

# Move the model to the appropriate device
model = model.to(device)


In [None]:
import time

# Wall-clock reference for the timing printed at the end of the cell
x = time.time()

# Read, clean and format the contract source before classifying it
source = SourceCodeCleanerAndFormatter("test.sol")
source.read_input_file()
source.clean_source_code()
source.format_source_code()
test_text = source.source_code
print(test_text)

# Classify the prepared source and report the predicted label
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print(f"Predicted : {sentiment}")

# Seconds spent on preprocessing plus inference
elapsed_seconds = time.time() - x
print(elapsed_seconds)