In [1]:
import re
import os
import difflib
import random, pickle
import numpy as np
import pandas as pd
from tika import parser
import torch
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split

In [2]:
def remove_punc(pdf_content):
    punc = ['• ', '· ', '&', '~', ' o ', '\uf0a7', '\uf03c', '\uf0b7', 
            '–', '()', '[…]', '| ', '© ', '(Insert Scale)', '_', '%', '[', ']', 'Ü ']
    for p in punc:
        pdf_content = pdf_content.replace(p, '')
    return pdf_content

def remove_bulleted_points(pdf_content):
    pdf_content = re.sub(r'\.+ [0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+[0-9]+', '.', pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)

    pdf_content = re.sub(r'\([0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+\)', '', pdf_content)
    pdf_content = re.sub(r'[0-9]+.', '', pdf_content)
    pdf_content = re.sub(r'\([a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r' [a-zA-Z]\)', '', pdf_content)
    pdf_content = re.sub(r'\(i+\)', '', pdf_content)
    pdf_content = re.sub(r' i+\)', '', pdf_content)

    pdf_content = re.sub('\s\s+', ' ', pdf_content)
    return pdf_content

def remove_url(pdf_content):
    url = re.findall('http[s]?://\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    url = re.findall('www.\S+', pdf_content)
    for u in url:
        pdf_content = pdf_content.replace(u, '')
    pdf_content = re.sub(r'http[s]?://', '', pdf_content)
    return pdf_content

def filter_sentences_by_length(pdf_sentence):
    return [s for s in pdf_sentence if len(word_tokenize(s)) > 4 and len(word_tokenize(s)) < 200]

In [2]:
recs = pd.read_csv('cleaned_recs.csv')[['Document File Name ', 'Recommendation text']].dropna(0, 'all')
file_mapping = pd.read_csv("file_mapping.csv")
merged = pd.merge(recs, file_mapping, left_on="Document File Name ", right_on="original_name", how='inner')
test = merged.loc[(merged.indexed_name == '12.pdf') | (merged.indexed_name == '9.pdf')]
train = merged.loc[~((merged.indexed_name == '12.pdf') | (merged.indexed_name == '9.pdf'))]

# sentences = []
# indexed_corpus = os.path.join("..", "indexed_corpus")
# for i in range(1, 16):
    
#     pdf_path = os.path.join(indexed_corpus, f"{i}.pdf")
#     parsed_pdf = parser.from_file(pdf_path)
#     pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
#     pdf_content = remove_punc(pdf_content)
#     pdf_content = remove_bulleted_points(pdf_content)
#     pdf_content = remove_url(pdf_content)
#     pdf_content = remove_punc(pdf_content)
#     pdf_content = re.sub(r'\.+', '.', pdf_content)
#     pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    
#     pdf_sentence = sent_tokenize(pdf_content)
#     filtered_sentence = filter_sentences_by_length(pdf_sentence)
#     sentences += filtered_sentence

# len(sentences)

  recs = pd.read_csv('cleaned_recs.csv')[['Document File Name ', 'Recommendation text']].dropna(0, 'all')


In [4]:
# def save_list_to_pickle(data, filename):
#     with open(filename, 'wb') as file:
#         pickle.dump(data, file)

# save_list_to_pickle(merged['Recommendation text'].to_list(), "recs.pkl")

In [14]:
# train.iloc[:, 1].to_list()

In [3]:
def load_list_from_pickle(filename):
    with open(filename, 'rb') as file:
        data = pickle.load(file)
    return data

# Example usage:
filename = os.path.join('pkl_files', "sentences.pkl")
sentences = load_list_from_pickle(filename)

In [4]:
recs_list = load_list_from_pickle(os.path.join('pkl_files', 'recs.pkl'))
recs_train_idx = []

i = 0
for train_rec in train.iloc[:, 1].to_list():
    
    while train_rec != recs_list[i]:
        i += 1
    
    recs_train_idx.append(i)

In [18]:
russian_recs = load_list_from_pickle(os.path.join("pkl_files", "russian_recs.pkl"))
train_russian = [russian_recs[i] for i in recs_train_idx]

german_recs = load_list_from_pickle(os.path.join("pkl_files", "german_recs.pkl"))
train_german = [german_recs[i] for i in recs_train_idx]

In [5]:
def retrieve_sentence_index(sentence, sentence_list):
    # Tokenize the sentences
    sentence_tokens = sentence.split()
    sentence_list_tokens = [s.split() for s in sentence_list]
    
    # Calculate the similarity between the sentences
    similarity_scores = [difflib.SequenceMatcher(None, sentence_tokens, s).ratio() for s in sentence_list_tokens]
    
    # Find the index of the most similar sentence
    max_similarity_index = similarity_scores.index(max(similarity_scores))
    
    return max_similarity_index

In [6]:
train_indices = [retrieve_sentence_index(sentence, sentences) for sentence in train.iloc[:, 1]]
test_indices = [retrieve_sentence_index(sentence, sentences) for sentence in test.iloc[:, 1]]

In [7]:
len(train_indices), len(test_indices)

(100, 9)

In [8]:
train_recs = [sentences[idx] for idx in set(train_indices)]
test_recs = [sentences[idx] for idx in set(test_indices)]

non_recs = []
while len(non_recs) != 100:
    samp_idx = np.random.choice(len(sentences))
    if (samp_idx not in train_indices + test_indices) and (len(sentences[samp_idx].split()) > 10):
        non_recs.append(sentences[samp_idx])

In [9]:
train_non_recs = non_recs[:100]
test_non_recs = non_recs[100:]

train_texts = train_non_recs + train_recs #+ train_russian + train_german
test_texts = test_non_recs + test_recs

train_labels = [0] * len(train_non_recs) + [1] * len(train_recs)
test_labels = [0] * len(test_non_recs) + [1] * len(test_recs)

train_combined = list(zip(train_texts, train_labels))
test_combined = list(zip(test_texts, test_labels))

# # Shuffle the combined lists
# random.shuffle(train_combined)
# random.shuffle(test_combined)

In [10]:
class RecsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Unzip the shuffled combined lists
texts, labels = zip(*train_combined)
test_texts, test_labels = zip(*test_combined)

# Split the data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Define the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Define the dataset and data loaders
train_dataset = RecsDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = RecsDataset(val_texts, val_labels, tokenizer, max_length=128)
test_dataset = RecsDataset(test_texts, test_labels, tokenizer, max_length=128)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)
test_dataloader = DataLoader(test_dataset, batch_size=8)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [67]:
# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-6)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
epochs = 30

class_weights = torch.tensor([1.0, 2.0]).to(device)
loss_fn = torch.nn.CrossEntropyLoss(class_weights)

model_name = "original"

# Create logger

f = open(os.path.join("logs", f"{model_name}_logger.txt"), 'w')

best_val_loss = 100

for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    train_correct = 0
    
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        loss = loss_fn(logits, labels)
        train_loss += loss.item()
        train_correct += (predicted == labels).sum().item()
        
        loss.backward()
        optimizer.step()
    
    train_accuracy = 100.0 * train_correct / len(train_dataset)
    train_loss /= len(train_dataloader)

    model.eval()
    val_loss = 0.0
    val_correct = 0
    
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            
            loss = loss_fn(logits, labels)
            val_loss += loss.item()
            val_correct += (predicted == labels).sum().item()

    val_accuracy = 100.0 * val_correct / len(val_dataset)
    val_loss /= len(val_dataloader)

    if val_loss < best_val_loss:
        torch.save(model.state_dict(), os.path.join("weights", f'{model_name}.pt'))
        best_val_loss = val_loss    

    f.write(f'Epoch {epoch + 1}/{epochs}\n')
    f.write(f'Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}%\n')
    f.write(f'Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%\n')
    f.write('-------------------------------------------\n')


# Load the best model weights
model.load_state_dict(torch.load(os.path.join("weights", f'{model_name}.pt')))
model.eval()

test_loss = 0.0
# test_correct = 0

all_predicted_labels = []
all_true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        loss = loss_fn(logits, labels)
        test_loss += loss.item()
        # test_correct += (predicted == labels).sum().item()

        all_predicted_labels.extend(predicted.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(all_true_labels, all_predicted_labels)
test_precision = precision_score(all_true_labels, all_predicted_labels)
test_recall = recall_score(all_true_labels, all_predicted_labels)

test_loss /= len(test_dataloader)

f.write(f'TESTING\n')
f.write(f'Test Loss: {test_loss:.4f} | Test Accuracy: {test_accuracy:.2%}\n')
f.write(f'Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.2f}\n')
f.write('-------------------------------------------\n')


f.close()



### Testing on Train-Val

In [13]:
from torch.utils.data import ConcatDataset

model_name = "mark2"

train_val_dataset = ConcatDataset([train_dataset, val_dataset])
train_val_dataloader = DataLoader(train_val_dataset)

# Define the optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-6)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Load the best model weights
model.load_state_dict(torch.load(os.path.join("weights", f'{model_name}.pt')))
model.eval()

all_predicted_labels = []
all_true_labels = []

with torch.no_grad():
    for batch in train_val_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        all_predicted_labels.extend(predicted.cpu().numpy())
        all_true_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(all_true_labels, all_predicted_labels) * 100
test_precision = precision_score(all_true_labels, all_predicted_labels)
test_recall = recall_score(all_true_labels, all_predicted_labels)

RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [100]:
test_accuracy, test_precision, test_recall

(93.36734693877551, 0.9278350515463918, 0.9375)

In [101]:
TP = sum(1 for true, pred in zip(all_true_labels, all_predicted_labels) if true == 1 and pred == 1)
FP = sum(1 for true, pred in zip(all_true_labels, all_predicted_labels) if true == 0 and pred == 1)
TN = sum(1 for true, pred in zip(all_true_labels, all_predicted_labels) if true == 0 and pred == 0)
FN = sum(1 for true, pred in zip(all_true_labels, all_predicted_labels) if true == 1 and pred == 0)

type_1_error = FP / (FP + TN)
type_2_error = FN / (FN + TP)

print("Type 1 Error:", type_1_error)
print("Type 2 Error:", type_2_error)

Type 1 Error: 0.07
Type 2 Error: 0.0625


In [71]:
TP, FP, TN, FN

(95, 19, 81, 1)

## Data Augmentation (Back-translation // TF-IDF Replacement)

In [126]:
i = 7

print(recs_list[i])
print(train_russian[i])
print(train_german[i])

Software developers should participate in the CVE issuance processes to help ensure that CVEs provide the necessary information and are created in a timely manner, such as while planning a public vulnerability announcement or immediately after a vulnerability has been publicly announced.
Software developers should be involved in CVE release processes to ensure that CVEs provide required information and are created in a timely manner, such as when planning a public announcement of a vulnerability or immediately after a public announcement of a vulnerability.
Software developers should participate in the CVE issuance processes to ensure that CVEs provide the necessary information and are created in a timely manner, such as when planning a public disclosure of a vulnerability or immediately following a public disclosure of a vulnerability.


### Extractions

In [76]:
all_recs_idx = train_indices + test_indices

In [80]:
# sentences not in all_recs

other_sentences = [sentences[i] for i in range(len(sentences)) if i not in set(all_recs_idx)]

In [83]:
len(other_sentences)

13542

In [89]:
extract_dataset = RecsDataset(other_sentences, [0]*len(other_sentences), tokenizer, max_length=128)
extract_dataloader = DataLoader(extract_dataset, batch_size=8)

In [96]:
# Define the tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [103]:
# Load the best model weights
model_name = "mark2"

model.load_state_dict(torch.load(os.path.join("weights", f'{model_name}.pt')))
model.eval()

predicted_recs_samples = []

with torch.no_grad():
    for batch in extract_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        # Collect samples where the model predicts recs
        class_1_mask = (predicted == 1)
        
        # Get the input texts corresponding to class 1 predictions
        class_1_samples = [text for text, is_class_1 in zip(batch['input_ids'], class_1_mask) if is_class_1]
        
        # Decode the input texts from token IDs
        class_1_samples = [tokenizer.decode(text, skip_special_tokens=True) for text in class_1_samples]
        
        predicted_recs_samples.extend(class_1_samples)

In [105]:
# total 13,542
len(predicted_recs_samples) # 5862 for model w/o augmentation, 1179 for model w/ augmentation

1179

In [106]:
predicted_recs_samples

['establishing a holistic security strategy.',
 'preventive measures against attacks.',
 'working toward learning continuity.',
 'with these expanded horizons comes a responsibility for educators to provide environments where students are empowered to achieve academic and personal goals, be well prepared for success in college and career, and be productive, responsible citizens in our fast - paced and interconnected world.',
 'equipping our children with the t century skills they need for our digital age requires turning traditional classrooms into a digital - learning ecosystem and ensuring teachers have the professional skills and unfettered access to the tools they need for t century teaching and learning.',
 'based on these recommendations, large school districts ( more than students ) should ramp up internet service to gbps and wide area network ( wan ) services to at least gbps per users by to stay ahead of the burgeoning demand for broadband access.',
 'as recommended in the whi

In [110]:
import nltk
nltk.download('punkt')

sentences_9_12 = []
indexed_corpus = os.path.join("..", "indexed_corpus")
for i in [9, 12]:
    
    pdf_path = os.path.join(indexed_corpus, f"{i}.pdf")
    parsed_pdf = parser.from_file(pdf_path)
    pdf_content = parsed_pdf['content'].replace('\n', ' ').replace(';', '.').strip()
    pdf_content = remove_punc(pdf_content)
    pdf_content = remove_bulleted_points(pdf_content)
    pdf_content = remove_url(pdf_content)
    pdf_content = remove_punc(pdf_content)
    pdf_content = re.sub(r'\.+', '.', pdf_content)
    pdf_content = re.sub(r'\s\s+', ' ', pdf_content)
    
    pdf_sentence = sent_tokenize(pdf_content)
    filtered_sentence = filter_sentences_by_length(pdf_sentence)
    sentences_9_12 += filtered_sentence

len(sentences_9_12)

[nltk_data] Downloading package punkt to /home/shubham/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


919

In [117]:
extract_dataset = RecsDataset(sentences_9_12, [0]*len(sentences_9_12), tokenizer, max_length=128)
extract_dataloader = DataLoader(extract_dataset, batch_size=8)

# Load the best model weights
model_name = "mark2"

model.load_state_dict(torch.load(os.path.join("weights", f'{model_name}.pt')))
model.eval()

predicted_recs_samples = []

with torch.no_grad():
    for batch in extract_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits
        _, predicted = torch.max(logits, 1)
        
        # Collect samples where the model predicts recs
        class_1_mask = (predicted == 1)
        
        # Get the input texts corresponding to class 1 predictions
        class_1_samples = [text for text, is_class_1 in zip(batch['input_ids'], class_1_mask) if is_class_1]
        
        # Decode the input texts from token IDs
        class_1_samples = [tokenizer.decode(text, skip_special_tokens=True) for text in class_1_samples]
        
        predicted_recs_samples.extend(class_1_samples)

In [116]:
# total 919
len(predicted_recs_samples) # 200 for model w/o augmentation, 33 for model w/ augmentation

200

In [123]:
output_file_path = 'predicted_samples_9-12.txt'

# Save the text to the specified file
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write('\n\n'.join(predicted_recs_samples)
)

print(f"Predicted samples saved to {output_file_path}")

Predicted samples saved to predicted_samples_9-12.txt


In [125]:
output_file_path = 'actual_samples_9-12.txt'

# Save the text to the specified file
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write('\n\n'.join(test_recs)
)

print(f"Actual samples saved to {output_file_path}")

Actual samples saved to actual_samples_9-12.txt
