I used this notebook early on to investigate the data and try to figure out why I was running out of memory in my training loop.

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live under that mount point.
drive.mount('/content/drive')

In [None]:
# Training-set CSV on the mounted Drive.
file_path = '/content/drive/MyDrive/Lang Gen Project/LegalBert Parentheticals Cross Reference Datasets/train_dataset.csv'

# Read the CSV file into a DataFrame; later cells filter and re-save `df`.
df = pd.read_csv(file_path)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

# Make sure a small set of human-annotated reference cases is removed from train,
# if they are present, so they cannot leak into training.
case_ids_to_remove = [
    8581821, 4399046, 1202523, 9958144, 8315794, 8558219, 8309170, 314558,
    1788870, 2332888, 2837624, 492660, 10415553, 7157160, 374874, 2617136
]

# Keep only rows whose case_id is NOT in the removal list.
df = df[~df['case_id'].isin(case_ids_to_remove)]

# `df` was loaded from train_dataset.csv, so save it under the matching *train* name.
# (The previous target, cleaned_val_dataset.csv, would have overwritten the cleaned
# validation file with training rows.)
cleaned_file_path = '/content/drive/MyDrive/Lang Gen Project/LegalBert Parentheticals Cross Reference Datasets/cleaned_train_dataset.csv'
df.to_csv(cleaned_file_path, index=False)

In [None]:
!pip install transformers

In [None]:
# NOTE(review): file_path is assigned here but not used by the read below, which loads a
# different, hard-coded file. Confirm which dataset this analysis was meant to cover.
file_path = '/content/drive/MyDrive/Lang Gen Project/LegalBert Parentheticals Cross Reference Datasets/cleaned_train_dataset.csv'

# NOTE(review): a file named test_set_for_jesse_clean.csv is also written by a later
# df.to_csv(...) cell, so on a fresh runtime it presumably has to exist already
# (e.g. be uploaded) before this cell can run -- confirm.
df = pd.read_csv('/content/test_set_for_jesse_clean.csv')

In [None]:
# No tokenizer at this stage: the number of whitespace-separated words per input
# serves as a cheap length estimate.
texts = df['input']
word_counts = list(map(lambda text: len(text.split()), texts))


In [None]:
# Display a single input to eyeball what the raw text looks like.
texts[2]

In [None]:
import matplotlib.pyplot as plt

# One bin per integer word count. The bin *edges* have to run up to max + 1 (hence the
# "+ 2" stop value for range): the right-most histogram bin is closed on both sides, so
# ending the edges at max would merge the counts for (max - 1) and max into one bar.
plt.hist(word_counts, bins=range(min(word_counts), max(word_counts) + 2))
plt.title('Distribution of Word Counts in Inputs')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
# Restrict the visible x-range to 0-2000 words; anything longer is not shown.
plt.xlim(0, 2000)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Recompute the whitespace word count of every input text.
word_counts = list(map(lambda text: len(text.split()), texts))

# Horizontal, filled box plot with outlier points drawn.
plt.boxplot(
    word_counts,
    vert=False,
    patch_artist=True,
    showfliers=True,
)
plt.xlabel('Number of Words')
plt.title('Box-and-Whisker Plot of Word Counts')
plt.show()


In [None]:
from transformers import LongformerTokenizer

# Initialize the tokenizer from the pretrained 'allenai/longformer-base-4096' checkpoint.
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

# Text column that the next cell tokenizes.
inputs = df['input']

In [None]:
# Encode every input (special tokens included), then record how many token ids each produced.
tokenized_inputs = list(
    map(lambda text: tokenizer.encode(text, add_special_tokens=True), inputs)
)
token_lengths = list(map(len, tokenized_inputs))


In [None]:
# Number of inputs that were tokenized.
len(tokenized_inputs)

In [None]:
import matplotlib.pyplot as plt

# One bin per integer token length. The bin *edges* have to run up to max + 1 (hence the
# "+ 2" stop value for range): the right-most histogram bin is closed on both sides, so
# ending the edges at max would merge the counts for (max - 1) and max into one bar.
plt.hist(token_lengths, bins=range(min(token_lengths), max(token_lengths) + 2))
plt.title('Distribution of Token Lengths')
plt.xlabel('Token Length')
plt.ylabel('Frequency')
plt.show()


In [None]:
# df = pd.read_csv("test_set_for_jesse.csv")

In [None]:
# Per-column count of missing values, followed by each column's dtype.
print(df.isnull().sum())
print(df.dtypes)

In [None]:
# Retain a row only if both its 'input' and its 'output' value are Python strings.
# The columns are filtered one after the other, in this order.
for column in ('input', 'output'):
    df = df[df[column].apply(lambda x: isinstance(x, str))]

In [None]:
# Drop rows where either 'input' or 'output' is NaN.
# NOTE(review): when run after the string-type filter in the preceding cell this presumably
# removes nothing, since missing values are not str instances -- confirm.
df = df.dropna(subset=['input', 'output'])

In [None]:
# Re-run the same report after cleaning: remaining missing values per column,
# then the data type of each column.
print(df.isnull().sum())
print(df.dtypes)

In [None]:
# Add the whitespace-delimited word count of each text column.
# DataFrame.assign returns a new frame instead of writing into `df` in place: at this
# point `df` may be a filtered subset of an earlier frame, and plain column assignment
# on such a frame can emit SettingWithCopyWarning.
# .str.split() with no arguments splits on runs of whitespace, like str.split().
df = df.assign(
    input_word_count=df['input'].str.split().str.len(),
    output_word_count=df['output'].str.split().str.len(),
)


In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-row input word counts, using 50 bins.
plt.figure(figsize=(12, 6))
plt.hist(
    df['input_word_count'],
    bins=50,
    color='blue',
    alpha=0.7,
    label='Input Word Count',
)
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Distribution of Word Counts in Input')
plt.legend()
plt.show()


In [None]:
# Histogram of the per-row output word counts, using 50 bins.
plt.figure(figsize=(12, 6))
plt.hist(
    df['output_word_count'],
    bins=50,
    color='green',
    alpha=0.7,
    label='Output Word Count',
)
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.title('Distribution of Word Counts in Output')
plt.legend()
plt.show()

In [None]:
# Write the current frame to CSV without the index column.
# NOTE(review): the path is relative, so the file lands in the kernel's working directory; an
# earlier cell reads '/content/test_set_for_jesse_clean.csv' -- presumably the same file, confirm.
# If the word-count cell has run, its two added columns are written out as well.
df.to_csv('test_set_for_jesse_clean.csv', index=False)