# Data Preparation

In [9]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import json
from datasets import load_dataset
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from tqdm import tqdm
# Seed NumPy and PyTorch once, up front. RANDOM_SEED is also passed as
# `random_state` to the train/val/test splits later in the notebook.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)


# Choose the compute backend by availability instead of hard-coding "mps":
# torch.device("mps") can be constructed anywhere, but moving tensors/models
# to it fails on machines without Apple's Metal backend. MPS is checked first
# so the result is unchanged on the machine this notebook was authored on.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='mps')

## Data Exploration

In [2]:
# Load the 'glue' dataset in its 'sst2' configuration through the `datasets` library.
dataset = load_dataset('glue', 'sst2')

In [10]:
## Extract the train and validation splits as DataFrames
# Build each DataFrame directly from its split so every name is bound to a
# single kind of object.
raw_sst2_train = pd.DataFrame(dataset['train'])
raw_sst2_val = pd.DataFrame(dataset['validation'])

# Pool both splits. ignore_index=True gives the combined frame a fresh, unique
# index instead of repeating the row labels of the two inputs, so label-based
# access on the pooled frame stays unambiguous.
raw_sst2_trainval = pd.concat([raw_sst2_train, raw_sst2_val], ignore_index=True)

# NOTE(review): keep=False removes *every* copy of a repeated sentence rather
# than keeping one representative. Confirm this is intended; keep='first'
# would deduplicate without discarding those sentences entirely.
raw_sst2_trainval = raw_sst2_trainval.drop_duplicates(subset='sentence', keep=False)

## print the size of the original training split (before pooling/deduplication)
print(len(raw_sst2_train))

67349


## Data Preprocessing

In [4]:
# Checkpoint name handed to BertTokenizer.from_pretrained in the next cell.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [5]:
# Instantiate the BERT tokenizer for the checkpoint named by PRE_TRAINED_MODEL_NAME.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [15]:
# Encoded length of every pooled sentence; encoding is truncated at 512 ids,
# so no recorded length can exceed that cap.
token_lens = [
    len(tokenizer.encode(sentence, max_length=512, truncation=True))
    for sentence in raw_sst2_trainval.sentence
]

# Longest encoded sentence in the pooled data.
max(token_lens)

72

In [11]:
# Step 1: set aside 30% of the pooled data as a holdout; the rest is training data.
df_train, df_holdout = train_test_split(
    raw_sst2_trainval,
    test_size=0.3,
    random_state=RANDOM_SEED,
)
# Step 2: divide the holdout so that 66% of it becomes the test set and the
# remainder becomes the validation set.
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.66,
    random_state=RANDOM_SEED,
)

In [12]:
# NOTE: this tuple is not the last expression of the cell, so Jupyter does not
# display it; only the printed fractions below appear in the output.
df_train.shape, df_val.shape, df_test.shape

## Share of the pooled data that landed in train, val and test (in that order)
total_rows = len(raw_sst2_trainval)
for split_frame in (df_train, df_val, df_test):
    print(len(split_frame) / total_rows)

0.6999955541724092
0.1019872849330903
0.1980171608945005


In [17]:
## Write each split to disk as JSON Lines (one record per line)
split_outputs = (
    (df_train, 'train.json'),
    (df_val, 'val.json'),
    (df_test, 'test.json'),
)
for split_frame, out_path in split_outputs:
    split_frame.to_json(out_path, orient='records', lines=True)