# Text Classifier

In [1]:
pip install transformers evaluate accelerate datasets scikit-learn torch tf-keras

Collecting transformers
  Downloading transformers-4.41.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.8 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your dataset
train_df = pd.read_csv('ALLtraincompiled.csv')
test_df = pd.read_csv('ALLtestcompiled.csv')

train_df = train_df[['Question','Type']]
test_df = test_df[['Question','Type']]

# Split the dataset into training and test sets
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Define the label mapping
label_mapping = {'counting_and_probability': 0, 'intermediate_algebra': 1, 'number_theory': 2, 'precalculus':3, 
                'prealgebra':4, 'geometry':5, 'algebra':6}
train_df['Type'] = train_df['Type'].map(label_mapping)
test_df['Type'] = test_df['Type'].map(label_mapping)

In [2]:
train_df['Type'].value_counts()

Type
6    1744
1    1295
4    1205
5     870
2     869
0     771
3     746
Name: count, dtype: int64

In [3]:
test_df['Type'].value_counts()

Type
6    1187
1     903
4     871
3     546
2     540
5     479
0     474
Name: count, dtype: int64

In [33]:
from datasets import Dataset
from transformers import BertTokenizer

# Convert DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', add_special_tokens=True)


latex_pattern = r'\$([^$]*)\$'

# Tokenization function with LaTeX support
def tokenize_function(examples):
    # Tokenize each problem while preserving LaTeX
    problem_text = examples['Question']
    # Find LaTeX expressions and tokenize them separately
    matches = re.finditer(latex_pattern, problem_text)
    tokenized_problem = []
    last_end = 0
    for match in matches:
        start, end = match.span()
        # Tokenize text before LaTeX expression
        tokenized_problem.extend(tokenizer.tokenize(problem_text[last_end:start]))
        # Tokenize LaTeX expression and add each token separately
        latex_expression = match.group(1)
        tokenized_latex = tokenizer.tokenize(latex_expression)
        tokenized_problem.extend(tokenized_latex)
        last_end = end
    # Tokenize the remaining text after the last LaTeX expression
    tokenized_problem.extend(tokenizer.tokenize(problem_text[last_end:]))
    return tokenizer.pad({'input_ids': tokenized_problem}, return_tensors='pt')

# Tokenization function
# Tokenization function with debugging
# def tokenize_function(examples):
#     tokenized_inputs = tokenizer(examples['Question'], padding='max_length', truncation=True)
#     # print("Example:", examples['Question'])
#     # print("Tokenized inputs:", tokenized_inputs)
#     return tokenized_inputs


# Apply tokenization
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


# Set format for PyTorch tensors
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'Type'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'Type'])


Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

TypeError: expected string or bytes-like object, got 'list'

In [14]:
pip install peft

Collecting peft
  Downloading peft-0.11.1-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.11.1
Note: you may need to restart the kernel to use updated packages.


In [16]:
import torch
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model

# Load BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Apply LoRA to the BERT model
config = LoraConfig(
    r=8,  # Rank of the low-rank matrices
    lora_alpha=32,  # Scaling factor for the LoRA updates
    target_modules=["query", "key", "value"],  # Apply LoRA to these layers
    lora_dropout=0.1,  # Dropout rate
)

model = get_peft_model(model, config)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: Invalid key: 7278 is out of bounds for size 0