In [1]:
import pandas as pd

In [6]:
df = pd.read_csv('../combined.csv')
# get rid of unamed column
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df

Unnamed: 0,document,clause,severity,a,ch,cr,j,law,ltd,ter,use
0,Spotify,Spotify,0,0,0,0,0,0,0,0,0
1,Spotify,•\tPremium,0,0,0,0,0,0,0,0,0
2,Spotify,•\tHelp,0,0,0,0,0,0,0,0,0
3,Spotify,•\tDownload,0,0,0,0,0,0,0,0,0
4,Spotify,•,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
12312,Vivino,Where Vivino has provided you with a translati...,0,0,0,0,0,0,0,0,0
12313,Vivino,If there is any contradiction between what the...,0,0,0,0,0,0,0,0,0
12314,Vivino,Contact,0,0,0,0,0,0,0,0,0
12315,Vivino,You may contact Vivino at the following addres...,0,0,0,0,0,0,0,0,0


In [20]:
data_cleaned = df
X = data_cleaned['clause']
y_tags = data_cleaned[['a', 'ch', 'cr', 'j', 'law', 'ltd', 'ter', 'use']]
y_severity = data_cleaned['severity']


In [21]:
# model to predict severity of the clause
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier


# Splitting the data into training and testing sets for severity
X_train_sev, X_test_sev, y_train_sev, y_test_sev = train_test_split(X, y_severity, test_size=0.2, random_state=42)

# Text Preprocessing: Reusing the earlier vectorizer
X_train_sev_tfidf = vectorizer.transform(X_train_sev)
X_test_sev_tfidf = vectorizer.transform(X_test_sev)

# Model for Severity: Random Forest Classifier
severity_classifier = RandomForestClassifier(n_estimators=100)

# Training the model for severity
severity_classifier.fit(X_train_sev_tfidf, y_train_sev)

# Predicting on the test set for severity
y_pred_sev = severity_classifier.predict(X_test_sev_tfidf)

# Evaluation for Severity
accuracy_sev = accuracy_score(y_test_sev, y_pred_sev)

accuracy_sev


0.9387175324675324

In [24]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Define a dataset class
class TOSDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.float)
        }

# Define the maximum length for tokenization
MAX_LEN = 128
X = df['clause']
y_tags = df[['a', 'ch', 'cr', 'j', 'law', 'ltd', 'ter', 'use']]

# Splitting the data into training and testing sets for tags
X_train_tags, X_test_tags, y_train_tags, y_test_tags = train_test_split(X, y_tags, test_size=0.2, random_state=42)


# Create the dataset
train_dataset = TOSDataset(X_train_tags.to_numpy(), y_train_tags.to_numpy(), tokenizer, MAX_LEN)
test_dataset = TOSDataset(X_test_tags.to_numpy(), y_test_tags.to_numpy(), tokenizer, MAX_LEN)

# Check if the datasets are prepared correctly
train_dataset[0], device

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 43.1kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 598kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 633kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 1.88MB/s]


({'text': 'your netflix membership may start with a free trial.',
  'input_ids': tensor([  101,  2115, 20907,  5779,  2089,  2707,  2007,  1037,  2489,  3979,
           1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,    

In [42]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
import torch
# from torch.optim import AdamW
from torch.utils.data.dataloader import default_collate
from tqdm import tqdm 

# Assuming the TOSDataset class and the data split are already defined as per your previous code

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(y_tags.columns)  # The number of output labels
)
model.to(device)

def custom_collate_fn(batch):
    """
    Custom collate function to handle batches of dictionaries.
    """
    # Stacking the values of each key from a list of dictionaries
    batch = {key: default_collate([d[key] for d in batch]) for key in batch[0]}
    return batch

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=custom_collate_fn)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-5)
loss_function = BCEWithLogitsLoss()

num_epochs = 1
# Training Loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in train_loader:
        
        # Move only the tensors to the device, ignore 'text' or any other non-tensor data
        batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        
        # Forward pass
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        logits = outputs.logits
        
        # Compute loss
        loss = loss_function(logits.view(-1, len(y_tags.columns)), batch['labels'].type_as(logits))
        
        # Backward pass
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        # Zero the gradients after updating
        optimizer.zero_grad()
        
        # Accumulate the loss
        total_loss += loss.item()
    
    # Print the average loss per epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs} - Loss: {avg_train_loss}')

# Evaluation Loop
model.eval()
predictions, true_labels = [], []
for batch in tqdm(test_loader, desc="Evaluating"):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
    logits = outputs.logits
    predictions.append(logits.cpu().numpy())
    true_labels.append(batch['labels'].cpu().numpy())

# Convert predictions to binary (you may choose a threshold, e.g., 0.5)
# Calculate the accuracy or other metrics using sklearn


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

# Concatenate the predictions and true labels
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# Apply sigmoid to the logits to get probabilities and then apply the threshold
probabilities = torch.sigmoid(torch.from_numpy(predictions)).numpy()
threshold = 0.5
binary_predictions = (probabilities > threshold).astype(int)

# Calculate the accuracy for each label
label_accuracies = []
for i in range(len(y_tags.columns)):
    label_accuracy = accuracy_score(true_labels[:, i], binary_predictions[:, i])
    label_accuracies.append(label_accuracy)

# Calculate overall accuracy
overall_accuracy = accuracy_score(true_labels, binary_predictions)

# Calculate the F1 score for each label
label_f1_scores = []
for i in range(len(y_tags.columns)):
    label_f1_score = f1_score(true_labels[:, i], binary_predictions[:, i], average='macro')
    label_f1_scores.append(label_f1_score)

# Calculate the overall F1 score
overall_f1_score = f1_score(true_labels, binary_predictions, average='macro')

# Classification report for each label
detailed_report = classification_report(true_labels, binary_predictions, target_names=y_tags.columns)

print("Accuracy for each label: ", label_accuracies)
print("Overall accuracy: ", overall_accuracy)
print("F1 Score for each label: ", label_f1_scores)
print("Overall F1 Score: ", overall_f1_score)
print("Detailed classification report: \n", detailed_report)