In [1]:
%%capture
%pip install -U bitsandbytes
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U transformer

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
import pandas as pd
maths_with_category_df = pd.read_csv('/kaggle/input/d/jagathapugazhendhi/maths-with-category/MATH_WITH_CATHEGORY.csv')

In [4]:
maths_with_category_df.head(2)

Unnamed: 0,problem,type
0,Kevin Kangaroo begins hopping on a number line...,Algebra
1,The ratio of the areas of two squares is $\fra...,Algebra


In [5]:
maths_with_category_df = maths_with_category_df.rename(columns={"type": "category"})

In [6]:
maths_with_category_df.head(2)

Unnamed: 0,problem,category
0,Kevin Kangaroo begins hopping on a number line...,Algebra
1,The ratio of the areas of two squares is $\fra...,Algebra


In [7]:
print(len(maths_with_category_df))

12500


In [8]:
# Get counts of each category
category_counts = maths_with_category_df['category'].value_counts()

print(category_counts)

category
Algebra                   2931
Intermediate Algebra      2198
Prealgebra                2076
Number Theory             1409
Geometry                  1349
Precalculus               1292
Counting & Probability    1245
Name: count, dtype: int64


In [9]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig,PeftConfig,prepare_model_for_kbit_training,get_peft_model
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM,
                          AutoModelForSequenceClassification,
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
import wandb

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

wb_token = user_secrets.get_secret("wandb")

wandb.login(key=wb_token)
run = wandb.init(
    project='Categorizing maths problems using Llama3.1', 
    job_type="training", 
    anonymous="allow"
)

**Stratified Sampling**

In [10]:
# Count the number of samples per class
label_counts = maths_with_category_df['category'].value_counts()
print("Class distribution:\n", label_counts)

num_samples = 3000

# Find the number of samples per class
samples_per_class = num_samples // len(label_counts)
print('samples_per_class',samples_per_class)

# Perform stratified sampling
stratified_sample = maths_with_category_df.groupby('category').apply(lambda x: x.sample(n=samples_per_class, random_state=42))

# Flatten the result
stratified_sample = stratified_sample.reset_index(drop=True)

# Check the distribution of the selected samples
print("Stratified sample class distribution:\n", stratified_sample['category'].value_counts())

Class distribution:
 category
Algebra                   2931
Intermediate Algebra      2198
Prealgebra                2076
Number Theory             1409
Geometry                  1349
Precalculus               1292
Counting & Probability    1245
Name: count, dtype: int64
samples_per_class 428
Stratified sample class distribution:
 category
Algebra                   428
Counting & Probability    428
Geometry                  428
Intermediate Algebra      428
Number Theory             428
Prealgebra                428
Precalculus               428
Name: count, dtype: int64


  stratified_sample = maths_with_category_df.groupby('category').apply(lambda x: x.sample(n=samples_per_class, random_state=42))


In [11]:
print(type(stratified_sample))

<class 'pandas.core.frame.DataFrame'>


In [12]:
stratified_sample = stratified_sample.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
stratified_sample.head()

Unnamed: 0,problem,category
0,"If $a$, $b$, $c$, $d$, $e$, and $f$ are intege...",Intermediate Algebra
1,A circular sheet of paper with radius of $6$ c...,Geometry
2,Suppose $\sqrt{1 + \sqrt{2y-3}} = \sqrt{6}$; f...,Algebra
3,Find the least common multiple of 36 and 132.,Prealgebra
4,"If $f(x) = x^3 - 6x^2 + 3x - 4$, $g(x) = x^3 +...",Algebra


In [102]:
# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(stratified_sample))
eval_end = train_end + int(eval_size * len(stratified_sample))

# Split the data
X_train = stratified_sample[:train_end]
X_eval = stratified_sample[train_end:eval_end]
X_test = stratified_sample[eval_end:]

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            You are a mathematics expert.Choose from: Algebra, Intermediate Algebra, Prealgebra, Number Theory, Geometry, Precalculus, or Counting & Probability.
Return only the category label of the problem.
problem: {data_point["problem"]}
labels: {data_point["category"]}""".strip()


def generate_test_prompt(data_point):
    return f"""
            You are a mathematics expert.Choose from: Algebra, Intermediate Algebra, Prealgebra, Number Theory, Geometry, Precalculus, or Counting & Probability.
Return only the category label of the problem.
problem: {data_point["problem"]}
labels: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'category']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [56]:
y_true

2695                Geometry
2696                Geometry
2697    Intermediate Algebra
2698             Precalculus
2699    Intermediate Algebra
                ...         
2991    Intermediate Algebra
2992                Geometry
2993                Geometry
2994    Intermediate Algebra
2995                Geometry
Name: category, Length: 301, dtype: object

In [57]:
X_test.head()

Unnamed: 0,text
2695,You are a mathematics expert. Classify the fol...
2696,You are a mathematics expert. Classify the fol...
2697,You are a mathematics expert. Classify the fol...
2698,You are a mathematics expert. Classify the fol...
2699,You are a mathematics expert. Classify the fol...


In [103]:
original_max_colwidth = pd.get_option('display.max_colwidth')

pd.set_option('display.max_colwidth', None)

print(X_test['text'].loc[2697])

pd.set_option('display.max_colwidth', original_max_colwidth)

You are a mathematics expert.Choose from: Algebra, Intermediate Algebra, Prealgebra, Number Theory, Geometry, Precalculus, or Counting & Probability.
Return only the category label of the problem.
problem: Find all values of the real number $a$ so that the four complex roots of
\[z^4 - 6z^3 + 11az^2 - 3(2a^2 + 3a - 3) z + 1 = 0\]form the vertices of a parallelogram in the complex plane.  Enter all the values, separated by commas.
labels:


In [19]:
base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=False
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype="float16",
    quantization_config=bnb_config,
    max_memory={0: "15GiB", "cpu": "30GiB"} 
)
# VERY IMPORTANT: prepare model for 4bit training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [104]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["Algebra","Number Theory","Geometry","Precalculus", 
        "Counting & Probability"]
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=5, 
                        temperature=0.1)
        
        result = pipe(prompt)
        # print("resulttttttttttttttt",result)
        answer = result[0]['generated_text'].split("labels:")[-1].strip()
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

In [105]:
y_pred = predict(X_test, model, tokenizer)

  0%|          | 0/301 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/301 [00:00<03:27,  1.44it/s]Device set to use cuda:0
  1%|          | 2/301 [00:01<04:01,  1.24it/s]Device set to use cuda:0
  1%|          | 3/301 [00:02<04:05,  1.21it/s]Device set to use cuda:0
  1%|▏         | 4/301 [00:03<04:06,  1.20it/s]Device set to use cuda:0
  2%|▏         | 5/301 [00:04<04:07,  1.20it/s]Device set to use cuda:0
  2%|▏         | 6/301 [00:04<03:41,  1.33it/s]Device set to use cuda:0
  2%|▏         | 7/301 [00:05<03:34,  1.37it/s]Device set to use cuda:0
  3%|▎         | 8/301 [00:06<04:09,  1.18it/s]Device set to use cuda:0
  3%|▎         | 9/301 [00:07<03:53,  1.25it/s]Device set to use cuda:0
  3%|▎         | 10/301 [00:07<03:43,  1.30it/s]Device set to use cuda:0
  4%|▎         | 11/301 [00:08<03:49,  1.26it/s]Device set to use cuda:0
  4%|▍         | 12/301 [00:09<04:15,  1.13it/s]Device set to use cuda:0
  4%|▍         | 13/301 [00:10<04:31,  1.06it/s]Device set to use cud

In [106]:
print(y_pred[:20])

['Geometry', 'Geometry', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Algebra', 'Geometry', 'none', 'Geometry', 'none', 'none', 'Algebra', 'Algebra', 'Counting & Probability', 'Geometry']


In [107]:
print(type(y_pred))

<class 'list'>


In [108]:
print(type(y_true))

<class 'pandas.core.series.Series'>


In [109]:
y_true = y_true.tolist()

In [110]:
def evaluate(y_true, y_pred):
    # Ensure y_true is a list (in case it's a pandas Series)
    if hasattr(y_true, "tolist"):
        y_true = y_true.tolist()

    labels = [
        "Algebra", "Intermediate Algebra", "Prealgebra", 
        "Number Theory", "Geometry", "Precalculus", 
        "Counting & Probability"
    ]
    
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if label not found

    # Map both y_true and y_pred to integers
    y_true_mapped = [map_func(label) for label in y_true]
    y_pred_mapped = [map_func(label) for label in y_pred]

    # Filter out any -1 (invalid labels)
    valid_indices = [
        i for i in range(len(y_true_mapped))
        if y_true_mapped[i] != -1 and y_pred_mapped[i] != -1
    ]
    y_true_mapped = [y_true_mapped[i] for i in valid_indices]
    y_pred_mapped = [y_pred_mapped[i] for i in valid_indices]

    # ✅ Overall accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f"\n✅ Overall Accuracy: {accuracy:.3f}")

    # 📊 Per-class accuracy
    for label, idx in mapping.items():
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == idx]
        if label_indices:
            label_y_true = [y_true_mapped[i] for i in label_indices]
            label_y_pred = [y_pred_mapped[i] for i in label_indices]
            label_accuracy = accuracy_score(label_y_true, label_y_pred)
            print(f'📚 Accuracy for label "{label}": {label_accuracy:.3f}')

    # 📝 Classification report
    print("\n📋 Classification Report:")
    print(classification_report(
        y_true=y_true_mapped,
        y_pred=y_pred_mapped,
        target_names=labels,
        labels=list(mapping.values()),
        zero_division=0  # Avoid warnings for zero division
    ))

    # 📉 Confusion matrix
    print("\n🔢 Confusion Matrix:")
    print(confusion_matrix(
        y_true=y_true_mapped,
        y_pred=y_pred_mapped,
        labels=list(mapping.values())
    ))


In [111]:
evaluate(y_true, y_pred)


✅ Overall Accuracy: 0.391
📚 Accuracy for label "Algebra": 0.912
📚 Accuracy for label "Intermediate Algebra": 0.000
📚 Accuracy for label "Prealgebra": 0.000
📚 Accuracy for label "Number Theory": 0.211
📚 Accuracy for label "Geometry": 0.684
📚 Accuracy for label "Precalculus": 0.283
📚 Accuracy for label "Counting & Probability": 0.921

📋 Classification Report:
                        precision    recall  f1-score   support

               Algebra       0.21      0.91      0.34        34
  Intermediate Algebra       0.00      0.00      0.00        56
            Prealgebra       0.00      0.00      0.00        39
         Number Theory       0.57      0.21      0.31        38
              Geometry       0.50      0.68      0.58        38
           Precalculus       0.72      0.28      0.41        46
Counting & Probability       0.61      0.92      0.74        38

              accuracy                           0.39       289
             macro avg       0.37      0.43      0.34       2

**Building the model**
When building the model, we start by extracting the linear module names from the model using the bits and bytes library. 

We then configure LoRA using the target modules, task type, and other arguments before setting up training arguments. These training arguments are optimized for the Kaggle notebook. You might need to change them if you are using them locally. 

We will then create the model trainer using training arguments, a model, a tokenizer, a LoRA configuration, and a dataset. 

In [16]:
base_model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    llm_int8_enable_fp32_cpu_offload=False
)

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="sequential",
    torch_dtype="float16",
    quantization_config=bnb_config,
    max_memory={0: "15GiB", "cpu": "30GiB"} 
)
# VERY IMPORTANT: prepare model for 4bit training
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

tokenizer.pad_token_id = tokenizer.eos_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
modules = find_all_linear_names(model)
modules

['gate_proj', 'k_proj', 'up_proj', 'q_proj', 'o_proj', 'v_proj', 'down_proj']

In [18]:
!nvidia-smi

Mon Apr 28 04:54:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P0             32W /  250W |   10169MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

**GPU 0 is already using 12.3 GB — it's pretty full!**

**GPU 1 is using 8.7 GB — still has about 7 GB free.**

 So both GPUs are already quite busy (probably because the model is  partially loaded on each)


In [19]:
X_train.head()

Unnamed: 0,problem,category,text
0,"If $a$, $b$, $c$, $d$, $e$, and $f$ are intege...",Intermediate Algebra,You are Mathematical expert.Classify the text ...
1,A circular sheet of paper with radius of $6$ c...,Geometry,You are Mathematical expert.Classify the text ...
2,Suppose $\sqrt{1 + \sqrt{2y-3}} = \sqrt{6}$; f...,Algebra,You are Mathematical expert.Classify the text ...
3,Find the least common multiple of 36 and 132.,Prealgebra,You are Mathematical expert.Classify the text ...
4,"If $f(x) = x^3 - 6x^2 + 3x - 4$, $g(x) = x^3 +...",Algebra,You are Mathematical expert.Classify the text ...


In [20]:
print(len(X_train))
print(len(X_eval))

2396
299


In [21]:
# Convert to datasets
train_data = Dataset.from_pandas(X_train[["text"]])
eval_data = Dataset.from_pandas(X_eval[["text"]])

In [None]:
train_data['text'][3]

In [24]:
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq
from transformers import DataCollatorForLanguageModeling
from peft import LoraConfig

output_dir = "llama-3.1-fine-tuned-model"

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules,
)
print("Target LoRA modules:", modules)

print("Trainable params:")
trainable_params = [n for n, p in model.named_parameters() if p.requires_grad]
print(trainable_params)
print(f"Total trainable parameters: {sum(p.numel() for n, p in model.named_parameters() if p.requires_grad)}")

print("Preparing trainer...")


trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False),  
    args=SFTConfig(
        output_dir=output_dir,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        optim="paged_adamw_32bit",
        logging_steps=1,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=True,
        bf16=False,
        max_grad_norm=0.3,
        max_steps=-1,
        warmup_ratio=0.03,
        group_by_length=False,
        lr_scheduler_type="cosine",
        report_to="wandb",
        evaluation_strategy="steps",
        eval_steps=50,
        max_seq_length=128,
        dataset_text_field="text",
        dataset_num_proc=4,
        packing=False,
        seed=42,
    )
)


Target LoRA modules: ['gate_proj', 'k_proj', 'up_proj', 'q_proj', 'o_proj', 'v_proj', 'down_proj']
Trainable params:
['model.layers.0.self_attn.q_proj.lora_A.default.weight', 'model.layers.0.self_attn.q_proj.lora_B.default.weight', 'model.layers.0.self_attn.k_proj.lora_A.default.weight', 'model.layers.0.self_attn.k_proj.lora_B.default.weight', 'model.layers.0.self_attn.v_proj.lora_A.default.weight', 'model.layers.0.self_attn.v_proj.lora_B.default.weight', 'model.layers.0.self_attn.o_proj.lora_A.default.weight', 'model.layers.0.self_attn.o_proj.lora_B.default.weight', 'model.layers.0.mlp.gate_proj.lora_A.default.weight', 'model.layers.0.mlp.gate_proj.lora_B.default.weight', 'model.layers.0.mlp.up_proj.lora_A.default.weight', 'model.layers.0.mlp.up_proj.lora_B.default.weight', 'model.layers.0.mlp.down_proj.lora_A.default.weight', 'model.layers.0.mlp.down_proj.lora_B.default.weight', 'model.layers.1.self_attn.q_proj.lora_A.default.weight', 'model.layers.1.self_attn.q_proj.lora_B.default.w



Converting train dataset to ChatML (num_proc=4):   0%|          | 0/2396 [00:00<?, ? examples/s]

Adding EOS to train dataset (num_proc=4):   0%|          | 0/2396 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=4):   0%|          | 0/2396 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=4):   0%|          | 0/2396 [00:00<?, ? examples/s]

Converting eval dataset to ChatML (num_proc=4):   0%|          | 0/299 [00:00<?, ? examples/s]

Adding EOS to eval dataset (num_proc=4):   0%|          | 0/299 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=4):   0%|          | 0/299 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=4):   0%|          | 0/299 [00:00<?, ? examples/s]

In [25]:
print("Trainer ready. Starting training...")
trainer.train()

Trainer ready. Starting training...




Step,Training Loss,Validation Loss
50,2.764,0.689762
100,2.5447,0.681543
150,3.1299,0.671158
200,2.6189,0.664955
250,1.8624,0.657494
300,2.2377,0.654815
350,2.2614,0.651625
400,3.3667,0.646816
450,2.287,0.643178
500,2.3942,0.639786


TrainOutput(global_step=599, training_loss=2.5978967886735282, metrics={'train_runtime': 5461.1009, 'train_samples_per_second': 0.439, 'train_steps_per_second': 0.11, 'total_flos': 1.1033399845060608e+16, 'train_loss': 2.5978967886735282})

In [28]:
y_pred = predict(X_test, model, tokenizer)  

  0%|          | 0/301 [00:00<?, ?it/s]Device set to use cuda:0
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  0%|          | 1/301 [00:02<14:34,  2.92s/it]Device set to use cuda:0
  1%|          | 2/301 [00:06<16:36,  3.33s/it]Device set to use cuda:0
  1%|          | 3/301 [00:09<16:46,  3.38s/it]Device set to use cuda:0
  1%|▏         | 4/301 [00:13<16:50,  3.40s/it]Device set to use cuda:0
  2%|▏         | 5/301 [00:16<16:52,  3.42s/it]Device set to use cuda:0
  2%|▏         | 6/301 [00:19<15:30,  3.15s/it]Device set to use cuda:0
  2%|▏         | 7/301 [00:22<14:48,  3.02s/it]Device set to use cuda:0
  3%|▎         | 8/301 [00:27<17:38,  3.61s/it]Device set to use cuda:0
  3%|▎         | 9/301 [00:29<16:16,  3.34s/it]Device set to use cuda:0
  3%|▎         | 10/301 [00:32<15:20,  3.16s/it]Device set to use cuda:0
  4%|▎         | 11/301 [00:36<15:42,  3.25s/it]Device set to use cuda:0
  4%|▍         | 12/301 [00:40<17:57,  3.73s/it]Devic

In [40]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('llama-3.1-fine-tuned-model/tokenizer_config.json',
 'llama-3.1-fine-tuned-model/special_tokens_map.json',
 'llama-3.1-fine-tuned-model/tokenizer.json')

In [41]:
import shutil

# Create a zip file of the saved model
shutil.make_archive(output_dir, 'zip', output_dir)


'/kaggle/working/llama-3.1-fine-tuned-model.zip'

**RELOAD THE MODEL IN FUTURE**

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Unzip the model (if it's not unzipped already)
shutil.unpack_archive("llama-3.1-fine-tuned-model.zip", "path/to/extract")

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("path/to/extract/llama-3.1-fine-tuned-model")
tokenizer = AutoTokenizer.from_pretrained("path/to/extract/llama-3.1-fine-tuned-model")


In [34]:
evaluate(y_true, y_pred)


✅ Overall Accuracy: 0.557
📚 Accuracy for label "Algebra": 1.000
📚 Accuracy for label "Intermediate Algebra": 0.000
📚 Accuracy for label "Prealgebra": 0.000
📚 Accuracy for label "Number Theory": 0.905
📚 Accuracy for label "Geometry": 0.750
📚 Accuracy for label "Precalculus": 0.673
📚 Accuracy for label "Counting & Probability": 0.842

📋 Classification Report:
                        precision    recall  f1-score   support

               Algebra       0.26      1.00      0.41        34
  Intermediate Algebra       0.00      0.00      0.00        56
            Prealgebra       0.00      0.00      0.00        41
         Number Theory       0.86      0.90      0.88        42
              Geometry       0.61      0.75      0.67        40
           Precalculus       0.92      0.67      0.78        49
Counting & Probability       0.84      0.84      0.84        38

              accuracy                           0.56       300
             macro avg       0.50      0.60      0.51       3

In [35]:
stratified_sample.head()

Unnamed: 0,problem,category
0,"If $a$, $b$, $c$, $d$, $e$, and $f$ are intege...",Intermediate Algebra
1,A circular sheet of paper with radius of $6$ c...,Geometry
2,Suppose $\sqrt{1 + \sqrt{2y-3}} = \sqrt{6}$; f...,Algebra
3,Find the least common multiple of 36 and 132.,Prealgebra
4,"If $f(x) = x^3 - 6x^2 + 3x - 4$, $g(x) = x^3 +...",Algebra


In [38]:
stratified_sample["category"] = stratified_sample["category"].replace({
    "Intermediate Algebra": "Algebra",
    "Prealgebra": "Algebra"
})


In [39]:
# Split the DataFrame
train_size = 0.8
eval_size = 0.1

# Calculate sizes
train_end = int(train_size * len(stratified_sample))
eval_end = train_end + int(eval_size * len(stratified_sample))

# Split the data
X_train = stratified_sample[:train_end]
X_eval = stratified_sample[train_end:eval_end]
X_test = stratified_sample[eval_end:]

# Define the prompt generation functions
def generate_prompt(data_point):
    return f"""
            You are Mathematical expert.Classify the text into Algebra, Intermediate Algebra, Prealgebra, Number Theory, Geometry, Precalculus, Counting & Probability
            and return the answer as the corresponding maths problem label.
text: {data_point["problem"]}
labels: {data_point["category"]}""".strip()

def generate_test_prompt(data_point):
    return f"""
            You are Mathematical expert.Classify the text into Algebra, Intermediate Algebra, Prealgebra, Number Theory, Geometry, Precalculus, Counting & Probability
            and return the answer as the corresponding maths problem label.
text: {data_point["problem"]}
labels: """.strip()

# Generate prompts for training and evaluation data
X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)

# Generate test prompts and extract true labels
y_true = X_test.loc[:,'category']
X_test = pd.DataFrame(X_test.apply(generate_test_prompt, axis=1), columns=["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.loc[:,'text'] = X_train.apply(generate_prompt, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_eval.loc[:,'text'] = X_eval.apply(generate_prompt, axis=1)


In [44]:
y_pred = predict(X_test, model, tokenizer)  

  0%|          | 0/301 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/301 [00:02<13:23,  2.68s/it]Device set to use cuda:0
  1%|          | 2/301 [00:06<16:07,  3.23s/it]Device set to use cuda:0
  1%|          | 3/301 [00:09<16:30,  3.32s/it]Device set to use cuda:0
  1%|▏         | 4/301 [00:13<16:40,  3.37s/it]Device set to use cuda:0
  2%|▏         | 5/301 [00:16<16:45,  3.40s/it]Device set to use cuda:0
  2%|▏         | 6/301 [00:19<15:26,  3.14s/it]Device set to use cuda:0
  2%|▏         | 7/301 [00:22<14:45,  3.01s/it]Device set to use cuda:0
  3%|▎         | 8/301 [00:26<17:35,  3.60s/it]Device set to use cuda:0
  3%|▎         | 9/301 [00:29<16:14,  3.34s/it]Device set to use cuda:0
  3%|▎         | 10/301 [00:32<15:19,  3.16s/it]Device set to use cuda:0
  4%|▎         | 11/301 [00:35<15:41,  3.25s/it]Device set to use cuda:0
  4%|▍         | 12/301 [00:40<17:56,  3.73s/it]Device set to use cuda:0
  4%|▍         | 13/301 [00:45<19:13,  4.01s/it]Device set to use cud

In [45]:
evaluate(y_true, y_pred)


✅ Overall Accuracy: 0.827
📚 Accuracy for label "Algebra": 0.870
📚 Accuracy for label "Number Theory": 0.929
📚 Accuracy for label "Geometry": 0.750
📚 Accuracy for label "Precalculus": 0.653
📚 Accuracy for label "Counting & Probability": 0.868

📋 Classification Report:
                        precision    recall  f1-score   support

               Algebra       0.84      0.87      0.86       131
  Intermediate Algebra       0.00      0.00      0.00         0
            Prealgebra       0.00      0.00      0.00         0
         Number Theory       0.89      0.93      0.91        42
              Geometry       0.61      0.75      0.67        40
           Precalculus       0.97      0.65      0.78        49
Counting & Probability       0.85      0.87      0.86        38

             micro avg       0.83      0.83      0.83       300
             macro avg       0.59      0.58      0.58       300
          weighted avg       0.84      0.83      0.83       300


🔢 Confusion Matrix:
[[1

In [46]:
def predict(test, model, tokenizer):
    y_pred = []
    categories = ["Algebra","Number Theory","Geometry","Precalculus", 
        "Counting & Probability"]
    
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens=5, 
                        temperature=0.1)
        
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("labels:")[-1].strip()
        
        # Determine the predicted category
        for category in categories:
            if category.lower() in answer.lower():
                y_pred.append(category)
                break
        else:
            y_pred.append("none")
    
    return y_pred

In [47]:
y_pred = predict(X_test, model, tokenizer)  

  0%|          | 0/301 [00:00<?, ?it/s]Device set to use cuda:0
  0%|          | 1/301 [00:02<13:20,  2.67s/it]Device set to use cuda:0
  1%|          | 2/301 [00:06<16:05,  3.23s/it]Device set to use cuda:0
  1%|          | 3/301 [00:09<16:29,  3.32s/it]Device set to use cuda:0
  1%|▏         | 4/301 [00:13<16:39,  3.37s/it]Device set to use cuda:0
  2%|▏         | 5/301 [00:16<16:45,  3.40s/it]Device set to use cuda:0
  2%|▏         | 6/301 [00:19<15:25,  3.14s/it]Device set to use cuda:0
  2%|▏         | 7/301 [00:21<14:44,  3.01s/it]Device set to use cuda:0
  3%|▎         | 8/301 [00:26<17:35,  3.60s/it]Device set to use cuda:0
  3%|▎         | 9/301 [00:29<16:14,  3.34s/it]Device set to use cuda:0
  3%|▎         | 10/301 [00:32<15:19,  3.16s/it]Device set to use cuda:0
  4%|▎         | 11/301 [00:35<15:41,  3.25s/it]Device set to use cuda:0
  4%|▍         | 12/301 [00:40<17:56,  3.73s/it]Device set to use cuda:0
  4%|▍         | 13/301 [00:45<19:13,  4.01s/it]Device set to use cud

In [48]:
def evaluate(y_true, y_pred):
    # Ensure y_true is a list (in case it's a pandas Series)
    if hasattr(y_true, "tolist"):
        y_true = y_true.tolist()

    labels = [
        "Algebra","Number Theory", "Geometry", "Precalculus", 
        "Counting & Probability"
    ]
    
    mapping = {label: idx for idx, label in enumerate(labels)}

    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if label not found

    # Map both y_true and y_pred to integers
    y_true_mapped = [map_func(label) for label in y_true]
    y_pred_mapped = [map_func(label) for label in y_pred]

    # Filter out any -1 (invalid labels)
    valid_indices = [
        i for i in range(len(y_true_mapped))
        if y_true_mapped[i] != -1 and y_pred_mapped[i] != -1
    ]
    y_true_mapped = [y_true_mapped[i] for i in valid_indices]
    y_pred_mapped = [y_pred_mapped[i] for i in valid_indices]

    # ✅ Overall accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f"\n✅ Overall Accuracy: {accuracy:.3f}")

    # 📊 Per-class accuracy
    for label, idx in mapping.items():
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == idx]
        if label_indices:
            label_y_true = [y_true_mapped[i] for i in label_indices]
            label_y_pred = [y_pred_mapped[i] for i in label_indices]
            label_accuracy = accuracy_score(label_y_true, label_y_pred)
            print(f'📚 Accuracy for label "{label}": {label_accuracy:.3f}')

    # 📝 Classification report
    print("\n📋 Classification Report:")
    print(classification_report(
        y_true=y_true_mapped,
        y_pred=y_pred_mapped,
        target_names=labels,
        labels=list(mapping.values()),
        zero_division=0  # Avoid warnings for zero division
    ))

    # 📉 Confusion matrix
    print("\n🔢 Confusion Matrix:")
    print(confusion_matrix(
        y_true=y_true_mapped,
        y_pred=y_pred_mapped,
        labels=list(mapping.values())
    ))


In [49]:
evaluate(y_true, y_pred)


✅ Overall Accuracy: 0.827
📚 Accuracy for label "Algebra": 0.863
📚 Accuracy for label "Number Theory": 0.905
📚 Accuracy for label "Geometry": 0.825
📚 Accuracy for label "Precalculus": 0.653
📚 Accuracy for label "Counting & Probability": 0.842

📋 Classification Report:
                        precision    recall  f1-score   support

               Algebra       0.86      0.86      0.86       131
         Number Theory       0.86      0.90      0.88        42
              Geometry       0.63      0.82      0.72        40
           Precalculus       0.94      0.65      0.77        49
Counting & Probability       0.84      0.84      0.84        38

              accuracy                           0.83       300
             macro avg       0.83      0.82      0.81       300
          weighted avg       0.84      0.83      0.83       300


🔢 Confusion Matrix:
[[113   3   8   1   6]
 [  3  38   1   0   0]
 [  6   0  33   1   0]
 [  7   0  10  32   0]
 [  3   3   0   0  32]]
