In [2]:
# Importing required libraries
!pip install transformers
import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, TrainingArguments, Trainer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial
import pandas as pd
import os
import nltk
nltk.download("stopwords")
nltk.download("punkt")
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# # Importing data
# # Data is a collection of case reports from cases in the USA with each case classified into its type.
# # Data is extracted and stored into a new directory called Cases/
# !unzip drive/MyDrive/'Colab Notebooks/US Court Cases.zip'


In [4]:
# # Extracting label information for each case type (all 78 case types):
# labels = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Map.txt", sep="->", names = ["Label Number", "Case Type"])
# labels.head(10)

# # Creating a blank data frame to store upcoming data
# dataset = pd.DataFrame(columns = ["label", "text"])

# # Extracting Case Information
# for i in range(1, 79):
#   if i != 48 and i != 46:
#     file_names = os.listdir("/content/Cases/" + str(i))
#     for file_name in file_names:
#       with open("/content/Cases/" + str(i) + "/" + file_name, 'r') as file:
#         case_text = file.read().replace('\n', ' ')
#       case_info = {
#           "label": i,
#           "text": case_text,
#       }
#       dataset = dataset.append(case_info, ignore_index = True)


In [5]:
# Exporting the dataset as a csv to avoid unzipping and extracting every time
# dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/US_Court_Cases.csv")

In [6]:
# Load the pre-extracted cases and split them at random (fixed seed):
# 20% is held out as the test set, and the remaining 80% is split again
# 80-20 into train and validation.
dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/US_Court_Cases.csv")
training_data = dataset.sample(frac = 0.8, random_state = 42)
test_data = dataset.drop(index = training_data.index)
train_data = training_data.sample(frac = 0.8, random_state = 42)
validation_data = training_data.drop(index = train_data.index)

# Keep only the leading rows of every split (500 train, 150 test, 150 validation).
train_data = train_data.iloc[:500]
test_data = test_data.iloc[:150]
validation_data = validation_data.iloc[:150]

# Unpack every split into parallel Python lists of labels and raw texts.
train_labels, train_text = train_data["label"].tolist(), train_data["text"].tolist()
test_labels, test_text = test_data["label"].tolist(), test_data["text"].tolist()
validation_labels, validation_text = validation_data["label"].tolist(), validation_data["text"].tolist()

In [7]:
def pre_process(corpus):
  """Tokenize every document and drop stop words and non-alphabetic tokens.

  Args:
    corpus: iterable of documents. Each document is expected to be a string;
      anything else (for example a NaN read from an empty CSV cell) is treated
      as an empty document.

  Returns:
    A list with exactly one entry per input document, in the same order. Each
    entry is the list of kept tokens (original casing preserved). Keeping one
    entry per document is what lets the result stay aligned with a parallel
    list of labels.
  """
  # Set membership is O(1); the module-level stop word collection is a list.
  stop_word_set = set(stop_words)
  corpus_tokenized = []
  for document in corpus:
    if not isinstance(document, str):
      # Emit a placeholder instead of skipping the document, otherwise every
      # later document would be shifted onto the wrong label.
      corpus_tokenized.append([])
      continue
    corpus_tokenized.append([
        token
        for token in nltk.word_tokenize(document)
        # Compare in lowercase so capitalised stop words ("The", "And") are
        # removed as well.
        if token.isalpha() and token.lower() not in stop_word_set
    ])
  return corpus_tokenized

# Run the same cleaning over the three splits, in train / validation / test order.
training_corpus, validation_corpus, test_corpus = (
    pre_process(split_text)
    for split_text in (train_text, validation_text, test_text)
)

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# The case-type ids in the "label" column are passed to the model unchanged as
# class indices, so the classification head needs max(label) + 1 outputs.
# Without num_labels the head is built with the library default of 2 outputs,
# which is too small for these labels.
num_labels = int(dataset["label"].max()) + 1
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = num_labels)

# Every split is encoded with identical settings: the corpora are already
# split into words, sequences are padded to the longest one in the split and
# truncated to the model's maximum input length.
encode = partial(tokenizer, is_split_into_words = True, padding = True, truncation = True)
train_encodings = encode(training_corpus)
validation_encodings = encode(validation_corpus)
test_encodings = encode(test_corpus)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [9]:
class CourtCasesDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with their labels.

  `encodings` is a mapping from field name to a per-example sequence, and
  `labels` is a sequence with one label per example.
  """

  def __init__(self, encodings, labels):
    self.labels = labels
    self.encodings = encodings

  def __len__(self):
    # The dataset size is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    # Build one example: a tensor per encoding field, plus the label under "labels".
    example = {}
    for field, rows in self.encodings.items():
      example[field] = torch.tensor(rows[idx])
    example["labels"] = torch.tensor(self.labels[idx])
    return example

# Wrap each split's encodings and labels in a dataset, in train / validation / test order.
train_dataset, validation_dataset, test_dataset = (
    CourtCasesDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (validation_encodings, validation_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# del dataset, training_data, train_data, test_data, validation_data, train_text, test_text, validation_text, train_labels, test_labels, validation_labels,
# del training_corpus, test_corpus, validation_corpus, train_encodings, test_encodings, validation_encodings

# Fine-tuning hyper-parameters.
NUM_TRAIN_EPOCHS = 3
TRAIN_BATCH_SIZE = 64
EVAL_BATCH_SIZE = 256
MAX_WARMUP_STEPS = 150

# Size the learning-rate warmup from the real length of the run. A fixed
# 150-step warmup outlasts the 24 optimisation steps of a 500-example run,
# so the learning rate would never reach its peak value.
# Ceil-division without importing math; assumes a single device and no
# gradient accumulation.
steps_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)
total_training_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
# Warm up over roughly 10% of the run, at least one step, never above the cap.
warmup_steps = min(MAX_WARMUP_STEPS, max(1, total_training_steps // 10))

training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/Colab Notebooks",
    num_train_epochs = NUM_TRAIN_EPOCHS,
    per_device_train_batch_size = TRAIN_BATCH_SIZE,
    per_device_eval_batch_size = EVAL_BATCH_SIZE,
    warmup_steps = warmup_steps,
    weight_decay = 0.1,
    logging_dir = "/content/drive/MyDrive/Colab Notebooks/logs",
    logging_steps = 10,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = validation_dataset
)

trainer.train()

***** Running training *****
  Num examples = 500
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 24
