In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer

In [2]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
MODEL_NAME = 'bert-base-uncased'
# Value passed as `max_length` when texts are encoded.
MAX_LEN = 128

# Built once here and reused by the cells below.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [3]:
def preprocess_and_tokenize(df, text_column, tokenizer_fn=None, max_length=None):
    """Normalize one text column of `df` and encode it with the tokenizer.

    Each value is cast to ``str``, lowercased, and stripped of leading and
    trailing whitespace. Missing values (None/NaN) become the empty string
    instead of being dropped, so the encoded rows stay aligned one-to-one
    with the rows of `df`.

    Args:
        df: DataFrame that contains the texts.
        text_column: Name of the column to encode.
        tokenizer_fn: Optional callable used in place of the notebook-level
            `tokenizer`. It is called as
            ``tokenizer_fn(texts, padding=..., truncation=..., max_length=...,
            return_tensors=...)`` and must return a mapping with the keys
            'input_ids' and 'attention_mask'.
        max_length: Optional override for the notebook-level `MAX_LEN`.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the encoding.

    Raises:
        KeyError: If `text_column` is not a column of `df`.
    """
    column = df[text_column]

    # astype(str) lets non-string columns through the .str accessor; the
    # final .where() blanks out cells that were missing in the original
    # column (astype would otherwise leave text such as 'nan' behind).
    texts = (
        column.astype(str)
        .str.lower()
        .str.strip()
        .where(column.notna(), "")
        .tolist()
    )

    # The notebook globals are only looked up when no override is supplied.
    encode = tokenizer if tokenizer_fn is None else tokenizer_fn
    limit = MAX_LEN if max_length is None else max_length

    encoding = encode(
        texts,
        padding='max_length',
        truncation=True,
        max_length=limit,
        return_tensors='pt'
    )
    return encoding['input_ids'], encoding['attention_mask']

In [4]:
def process_and_save(csv_path, save_prefix, output_dir='../data/processed', text_column='text'):
    """Tokenize the text column of a CSV file and save the tensors to disk.

    Reads `csv_path` with pandas, prints the row count and column names,
    encodes `text_column` through `preprocess_and_tokenize`, and writes
    ``<save_prefix>_input_ids.pt`` and ``<save_prefix>_attention_mask.pt``
    into `output_dir` with ``torch.save``.

    Args:
        csv_path: Path of the CSV file to read.
        save_prefix: Prefix used for the two output file names.
        output_dir: Directory the ``.pt`` files are written to; it is
            created if it does not exist yet.
        text_column: Name of the CSV column that holds the texts.

    Raises:
        KeyError: If `text_column` is not a column of the CSV file.
    """
    from pathlib import Path

    print(f"Processing {csv_path}...")
    df = pd.read_csv(csv_path)

    print(f"Number of samples: {len(df)}")
    print(f"Columns: {df.columns.tolist()}")

    # Fail early with a message that names the file and the available columns.
    if text_column not in df.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {csv_path}; "
            f"available columns: {df.columns.tolist()}"
        )

    input_ids, attention_mask = preprocess_and_tokenize(df, text_column)

    # torch.save does not create missing parent directories on its own.
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    torch.save(input_ids, target_dir / f'{save_prefix}_input_ids.pt')
    torch.save(attention_mask, target_dir / f'{save_prefix}_attention_mask.pt')

    print(f"Saved {save_prefix} input_ids and attention_mask!\n")

In [5]:
# Run the same tokenize-and-save step for each dataset split; the split
# name selects the input CSV and is also used as the output file prefix.
for split_name in ('train', 'val', 'test'):
    process_and_save(f'../data/processed/{split_name}.csv', split_name)

Processing ../data/processed/train.csv...
Number of samples: 1200
Columns: ['text', 'label']
Saved train input_ids and attention_mask!

Processing ../data/processed/val.csv...
Number of samples: 150
Columns: ['text', 'label']
Saved val input_ids and attention_mask!

Processing ../data/processed/test.csv...
Number of samples: 150
Columns: ['text', 'label']
Saved test input_ids and attention_mask!

