#### Installing and importing libraries

In [1]:
%%capture
!pip install transformers
!pip install torch
!pip install datasets

In [2]:
%%capture
#Required Libraries

#import os
import torch
import string
import transformers
import pandas as pd
import numpy as np
import datasets
import tensorflow as tf
from google.colab import drive
from datasets import load_metric
from transformers import AutoTokenizer
from transformers import create_optimizer
from transformers import DataCollatorWithPadding
from datasets import load_dataset, Dataset, load_from_disk
from transformers.keras_callbacks import KerasMetricCallback
from transformers import TFAutoModelForSequenceClassification


In [3]:
%%capture

# Mount Google Drive at /content/drive (without forcing a remount) so that the
# dataset and output paths used in the rest of the notebook are reachable.
drive.mount('/content/drive', force_remount=False)

#### Main code to train the model

In [4]:
# Locations of the input CSV files and of the pickled DataFrames written later.

# Input CSVs: change these to wherever the dataset files are stored.
trainPath = "/content/drive/MyDrive/train_dataset.csv"
valPath = "/content/drive/MyDrive/validation_dataset.csv"

# Output pickles: change these to wherever the DataFrames may be saved.
tDFPath = "/content/drive/MyDrive/OPPtrainDataframe"
valDFPath = "/content/drive/MyDrive/OPPvalidationDataFrame"

# Uncomment if a test dataset is used.
# testPath = "/content/drive/MyDrive/test_dataset.csv"
# testDFPath = '/content/drive/MyDrive/OPPtestDataFrame'

In [5]:
# Read the training and validation CSV files into pandas DataFrames.
trainData, valData = (pd.read_csv(csvPath) for csvPath in (trainPath, valPath))

# Uncomment if a test dataset is used
# testData = pd.read_csv(testPath)

In [6]:
# Rename the two columns of the training and validation DataFrames to the
# names used by the rest of the notebook: "Sentence" and "label".
trainData.columns = valData.columns = ["Sentence", "label"]

# Uncomment if a test dataset is used
# testData.columns = ["Sentence", "label"]

In [None]:
# Display the distinct values found in the "label" column of the training data.
trainData['label'].unique()

In [None]:
# Print the number of rows in the training and validation DataFrames.

print(f"Training Dataset = {len(trainData)}  Validation Dataset = {len(valData)}")

In [None]:
# Show a single row (position 490) of the training data as a sanity check.
trainData.iloc[490:491]

In [10]:
# Save both DataFrames as pickle files. These files are what the
# load_dataset("pandas", ...) call further down reads in order to build one
# dataset holding the training and validation data as separate splits.
trainData.to_pickle(tDFPath)
valData.to_pickle(valDFPath)

# testData.to_pickle(testDFPath)

In [11]:
# Map each split name to the path of its pickled DataFrame.

fDataset = dict(train=tDFPath, val=valDFPath)

In [None]:
# Load the pickled training and validation DataFrames as one dataset with two
# splits ("train" and "val"), the form the Hugging Face libraries work with.

transformerDataset = load_dataset("pandas", data_files=fDataset)

In [None]:
# Encode the string values of the "label" column as integer class ids.
# NOTE(review): the hard-coded id2label mapping used when the model is loaded
# presumably mirrors the ids assigned here, and both splits presumably receive
# the same encoding - confirm against the dataset's "label" feature.

transformerDataset = transformerDataset.class_encode_column("label")

In [None]:
# Preprocessing steps applied to the "Sentence" column.

def toLowercase(example):
    """Return a one-key dict holding the example's "Sentence" text in lower case."""
    lowered = example["Sentence"].lower()
    return {"Sentence": lowered}

# Apply the lower-casing function to every example of the dataset.
transformerDataset = transformerDataset.map(toLowercase)

# Translation table that deletes every character in string.punctuation. It is
# built once here rather than on every call, since it never changes.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def removePunctuations(example):
    """Return a one-key dict holding the "Sentence" text without punctuation.

    Every character listed in `string.punctuation` is deleted (not replaced by
    a space), so "opt-out" becomes "optout".
    """
    return {"Sentence": example["Sentence"].translate(_PUNCTUATION_TABLE)}

# Apply the punctuation-stripping function to every example of the dataset.
transformerDataset = transformerDataset.map(removePunctuations)

In [None]:
# View one validation example (index 201) after the preprocessing steps.
transformerDataset["val"][201:202]

In [None]:
# Load the tokenizer that belongs to the PrivBERT checkpoint on the Hugging Face
# Hub. `checkpoint` is reused later when the model itself is loaded.

checkpoint = "mukund/privbert"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [17]:
# Tokenization callback, applied to the dataset through its map() method.

def tokenizeFunction(example):
    """Tokenize the "Sentence" entry of an example or of a batch of examples.

    Each sentence is encoded on its own. Supplying the same text a second time
    as the tokenizer's pair argument would encode every sentence twice, which
    doubles the sequence length and leaves less room before truncation.

    Args:
        example: mapping with a "Sentence" entry; the entry is a list of
            strings when the function is mapped with batched=True.

    Returns:
        The tokenizer output for the sentence(s), with truncation enabled.
    """
    return tokenizer(example["Sentence"], truncation=True)

In [None]:
# Tokenize the whole dataset by mapping tokenizeFunction over it in batches.

encodedDataset = transformerDataset.map(tokenizeFunction, batched=True)

In [None]:
# View the features of the training split, including those added by the
# tokenizer (the input_ids and attention_mask).

encodedDataset["train"].features

In [20]:
# Drop the raw text column, which is no longer needed after tokenization, and
# rename "label" to "labels", the column name the model accepts.

encodedDataset = (
    encodedDataset
    .remove_columns(["Sentence"])
    .rename_column("label", "labels")
)

#### Training

In [None]:
# Load the pre-trained PrivBERT checkpoint from the Hugging Face Hub as a
# sequence-classification model.

# Mapping from encoded label id to the actual label name, for cleaner outputs.
# NOTE(review): presumably these ids match the integer encoding produced when
# the "label" column was class-encoded - confirm against the dataset features.
id2label = {
    0: 'Data Retention',
    1: 'Data Security',
    2: 'Do Not Track',
    3: 'First Party Collection/Use',
    4: 'International and Specific Audiences',
    5: 'Introductory/Generic',
    6: 'Policy Change',
    7: 'Practice not covered',
    8: 'Privacy contact information',
    9: 'Third Party Sharing/Collection',
    10: 'User Access, Edit and Deletion',
    11: 'User Choice/Control',
}

# Inverse mapping, from label name back to its id.
label2id = {name: idx for idx, name in id2label.items()}

# One class per privacy aspect of the OPP-115 dataset (the 12 entries above).
numLabels = len(id2label)

model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=numLabels,
    id2label=id2label,
    label2id=label2id,
)

In [22]:
# Collator that pads the tokenized examples of each batch, so the tensors built
# in the next step do not have mismatched dimensions.

dataCollator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [23]:
# Build the batched TensorFlow datasets (batch size 12) that are fed to the
# model, using the padding collator to assemble each batch.

tf_train_dataset = model.prepare_tf_dataset(
    encodedDataset["train"],
    shuffle=True,
    batch_size=12,
    collate_fn=dataCollator,
    tokenizer=tokenizer
)

# The validation split is not shuffled. Evaluation gains nothing from a random
# order, and prepare_tf_dataset defaults drop_remainder to the value of
# shuffle, so shuffling would also drop the final partial batch and leave
# those examples out of every evaluation.
tf_validation_dataset = model.prepare_tf_dataset(
    encodedDataset["val"],
    shuffle=False,
    batch_size=12,
    collate_fn=dataCollator,
    tokenizer=tokenizer,
)

In [24]:
# Training hyperparameters.
num_epochs = 10
batch_size = 12

# Total number of training steps over the whole run, passed to the optimizer
# factory below.
batches_per_epoch = len(encodedDataset["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# Optimizer and its learning-rate schedule: initial rate 2.5e-5, no warm-up.
optimizer, schedule = create_optimizer(
    init_lr=2.5e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

# No loss is passed here - presumably the model's built-in loss is used; confirm.
model.compile(optimizer=optimizer)

In [None]:
# Load the accuracy metric through load_metric, which is imported from the
# `datasets` library at the top of the notebook.
metric = load_metric("accuracy")

In [26]:
# Metric function handed to the Keras metric callback.

def computeMetrics(eval_pred):
    """Score predictions against reference labels with the loaded metric.

    Args:
        eval_pred: pair of (predictions, labels). The predictions hold one
            score per class for every example; the index of the highest score
            along axis 1 is taken as the predicted class.

    Returns:
        Whatever `metric.compute` returns for the predicted classes and labels.
    """
    scores, references = eval_pred
    predictedClasses = np.argmax(scores, axis=1)
    return metric.compute(predictions=predictedClasses, references=references)

In [27]:
# Keras callback that runs computeMetrics on the validation dataset, so the
# metric is reported for the validation set after each epoch.

metricCallback = KerasMetricCallback(
    metric_fn=computeMetrics, eval_dataset=tf_validation_dataset
)

In [None]:
# Train the model, evaluating on the validation dataset during training.
# Keras documents `callbacks` as a list of callbacks, so the metric callback
# is wrapped in one.

model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=num_epochs,
    callbacks=[metricCallback],
)

#### Saving the Fine-Tuned Model

In [None]:
# Save the trained model to Google Drive.

# Change this path to the folder where the model should be saved.
model_dir = '/content/drive/My Drive/Colab Notebooks/models/'

model.save_pretrained(f"{model_dir}PolicyInterpreterFullSample")