<a href="https://colab.research.google.com/github/jullazarovych/DL_math_misunderstandings/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade transformers peft accelerate bitsandbytes

In [None]:
!pip install -q transformers[torch] peft accelerate bitsandbytes scikit-learn pandas sentencepiece


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from torch import nn
from peft import PeftModel
from sklearn.metrics import accuracy_score, f1_score
import re
from google.colab import drive
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset as TorchDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import gc
from peft import PeftModel
import torch
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, balanced_accuracy_score, matthews_corrcoef


In [None]:
# Mount Google Drive at /content/drive; the data and weight paths used by later cells live under it.
drive.mount('/content/drive')

In [None]:
# Drive folder that later cells write to: holdout CSV, per-fold adapters, results JSON, confusion matrix.
output_dir = "/content/drive/MyDrive/nlp_math_misunderstanding/weights"

In [None]:
# Path to the processed training CSV, variant v1.
# NOTE(review): not read by any visible cell; the "llm" naming presumably marks LLM-assisted preprocessing — confirm.
data_llm = "/content/drive/MyDrive/nlp_math_misunderstanding/data/processed/train_v1.csv"

In [None]:
# Path to the processed training CSV, variant v2; this is the file actually loaded into `df` below.
data_no_llm = "/content/drive/MyDrive/nlp_math_misunderstanding/data/processed/train_v2.csv"

In [None]:
# Load the v2 processed training set into the working DataFrame.
df = pd.read_csv(data_no_llm)

In [None]:
# Keep only rows that have both the input text ('Combined') and the target ('Misconception').
df = df.dropna(subset=['Combined', 'Misconception'])

In [None]:
# Inputs are the 'Combined' text column; targets are the raw 'Misconception' labels.
X_text = df['Combined'].astype(str).tolist()
y_labels = df['Misconception'].tolist()

# 80/20 split with a fixed seed, stratified on the label.
X_train_text, X_val_text, y_train_labels, y_val_labels = train_test_split(
    X_text, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

print(f"len of train: {len(X_train_text)}")
# The message says "test", but the value printed is the size of the validation split above.
print(f"len of test: {len(X_val_text)}")


# **Baseline model**
---



In [None]:
# Baseline classifier: TF-IDF features (at most 5000 terms, English stop words removed)
# feeding a logistic regression.
baseline_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, C=1.0))
])

In [None]:
# Fit the baseline on the training split only.
baseline_model.fit(X_train_text, y_train_labels)

In [None]:
# Score the baseline on the validation split; only plain accuracy is reported here.
preds = baseline_model.predict(X_val_text)
print(f"Baseline Accuracy: {accuracy_score(y_val_labels, preds)}")

# **DNN**

In [None]:
# Encode misconception names as integer ids in a new 'label' column.
le = LabelEncoder()
df['label'] = le.fit_transform(df['Misconception'])
num_labels = len(le.classes_)
# Both maps follow the order of le.classes_, so id i corresponds to le.classes_[i].
id2label = {i: label for i, label in enumerate(le.classes_)}
label2id = {label: i for i, label in enumerate(le.classes_)}

In [None]:
# Tokenizer for the roberta-base checkpoint; reused by the training and evaluation cells.
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def preprocess_function(examples):
    """Tokenize the 'Combined' field of a batch, truncating to 256 tokens.

    Relies on the notebook-level ``tokenizer``. No padding is applied here.
    """
    combined_texts = examples["Combined"]
    encoded = tokenizer(combined_texts, truncation=True, max_length=256)
    return encoded

In [None]:
# 4-bit NF4 quantization settings with double quantization and float16 compute.
# NOTE(review): bnb_config is not passed to any from_pretrained call in the visible cells,
# so the models below appear to load unquantized — confirm whether that is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [None]:
# LoRA adapter configuration for sequence classification: rank 16, alpha 32, dropout 0.1,
# applied to the modules named "query" and "value"; bias terms are left untouched.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"]
)

In [None]:
# Shared training configuration; the same object is reused for every cross-validation fold,
# so all folds write checkpoints under the same ./results directory.
# Evaluation and checkpointing both happen once per epoch, which load_best_model_at_end needs.
# NOTE(review): the training loop uses a Trainer subclass with its own compute_loss —
# confirm that label_smoothing_factor is actually applied by that custom loss.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir='./logs',
    fp16=True,
    report_to="none",
    label_smoothing_factor=0.1
)

In [None]:
# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along axis 1.

    Returns:
        dict with accuracy, macro/weighted F1, macro precision and recall,
        balanced accuracy and Matthews correlation coefficient.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, predictions)

    # zero_division=0 on every averaged metric: classes that are never predicted
    # (or never present) score 0 silently instead of emitting a warning per call.
    precision_macro = precision_score(labels, predictions, average='macro', zero_division=0)
    recall_macro = recall_score(labels, predictions, average='macro', zero_division=0)
    f1_macro = f1_score(labels, predictions, average='macro', zero_division=0)

    f1_weighted = f1_score(labels, predictions, average='weighted', zero_division=0)

    balanced_acc = balanced_accuracy_score(labels, predictions)

    mcc = matthews_corrcoef(labels, predictions)

    return {
        "accuracy": acc,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "balanced_accuracy": balanced_acc,
        "mcc": mcc
    }

In [None]:
class TextDataset(TorchDataset):
    """Map-style dataset that pairs tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one example: every encoding field plus its label, each as a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model_checkpoint = "roberta-base"

# Full-dataset texts and integer labels; the holdout-split cell later rebinds both names
# to the cross-validation subset.
texts = df['Combined'].tolist()
labels = df['label'].tolist()
# Number of distinct label ids present in the data.
num_labels = len(np.unique(labels))

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy.

    Args:
        class_weights: optional 1-D tensor with one weight per class. When it
            is omitted, the notebook-level ``weights_tensor`` is looked up at
            loss time, which is how the cross-validation loop supplies the
            weights of the current fold.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted cross-entropy loss (and the model outputs if requested)."""
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Explicit weights win; otherwise fall back to the global set by the training loop.
        class_weights = self.class_weights if self.class_weights is not None else weights_tensor

        # Overriding compute_loss bypasses the Trainer's built-in label smoother, so the
        # configured label_smoothing_factor has to be applied here or it is silently ignored.
        loss_fct = nn.CrossEntropyLoss(
            weight=class_weights.to(logits.device),
            label_smoothing=self.args.label_smoothing_factor,
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [None]:
# Stratified 5-fold splitter with a fixed seed, plus containers filled by the training loop.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []
# Predictions and true labels of the final fold, kept for the confusion-matrix cell.
last_fold_preds_ids = None
last_fold_true_labels = None
print(f"Starting {n_splits}-Fold Cross-Validation...")

In [None]:
import os

from sklearn.model_selection import train_test_split

# Set aside a stratified 300-row holdout set; the remainder is used for cross-validation.
df_train_cv, df_holdout = train_test_split(
    df,
    test_size=300,
    random_state=42,
    stratify=df['label']
)

print(f"Full Train size: {len(df_train_cv)}")
print(f"Holdout Test size: {len(df_holdout)}")

# The weights folder may not exist on a fresh Drive; to_csv does not create directories.
os.makedirs(output_dir, exist_ok=True)
df_holdout.to_csv(f"{output_dir}/holdout_test_300.csv", index=False)
print("Holdout dataset saved to drive.")

# Rebind the training texts/labels to the CV subset so the fold indices refer to it.
texts = df_train_cv['Combined'].tolist()
labels = df_train_cv['label'].tolist()

df_train_cv = df_train_cv.reset_index(drop=True)

In [None]:
# Trainer log entries from every fold, each tagged with its fold number.
all_fold_histories = []

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f"\n--- Fold {fold + 1}/{n_splits} ---")

    train_texts_fold = [texts[i] for i in train_idx]
    val_texts_fold = [texts[i] for i in val_idx]
    train_labels_fold = [labels[i] for i in train_idx]
    val_labels_fold = [labels[i] for i in val_idx]

    # Balanced class weights from this fold's training labels only. Classes absent
    # from the fold keep weight 1.0 so the weight vector always has num_labels entries.
    classes_in_fold = np.unique(train_labels_fold)
    fold_weights = compute_class_weight('balanced', classes=classes_in_fold, y=train_labels_fold)
    weights_full = np.ones(num_labels)
    for cls, weight in zip(classes_in_fold, fold_weights):
        weights_full[cls] = weight
    # Read as a global by WeightedLossTrainer.compute_loss.
    weights_tensor = torch.tensor(weights_full, dtype=torch.float).to(device)

    train_encodings = tokenizer(train_texts_fold, truncation=True, padding=True, max_length=256)
    val_encodings = tokenizer(val_texts_fold, truncation=True, padding=True, max_length=256)

    train_dataset = TextDataset(train_encodings, train_labels_fold)
    val_dataset = TextDataset(val_encodings, val_labels_fold)

    # Fresh base model and fresh LoRA adapter for every fold.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )
    model.resize_token_embeddings(len(tokenizer))
    model = get_peft_model(model, peft_config)
    model.to(device)

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    history = trainer.state.log_history
    for entry in history:
        entry['fold'] = fold + 1
    all_fold_histories.extend(history)
    eval_result = trainer.evaluate()
    print(f"Fold {fold+1} Result: Accuracy: {eval_result['eval_accuracy']:.4f}, F1-Macro: {eval_result['eval_f1_macro']:.4f}")
    fold_results.append(eval_result)

    fold_save_path = f"{output_dir}/fold_{fold+1}"
    trainer.save_model(fold_save_path)
    print(f"Model for Fold {fold+1} saved to Drive.")

    # Rewrite the results file after every fold so an interrupted run keeps the finished folds.
    with open(f"{output_dir}/all_results.json", 'w') as f:
        json.dump(fold_results, f)

    # Only the last fold's predictions are kept, for the confusion matrix cell.
    if fold == n_splits - 1:
        print("Generating predictions for Confusion Matrix...")
        predictions = trainer.predict(val_dataset)
        if isinstance(predictions.predictions, tuple):
            logits = predictions.predictions[0]
        else:
            logits = predictions.predictions
        last_fold_preds_ids = np.argmax(logits, axis=1)
        last_fold_true_labels = predictions.label_ids

    # Collect garbage first so objects kept alive only by reference cycles release
    # their tensors, then return the freed blocks from the CUDA caching allocator.
    del model, trainer, train_dataset, val_dataset
    gc.collect()
    torch.cuda.empty_cache()

In [None]:

# Evaluate one saved fold adapter on the 300-row holdout set written by the split cell.
TEST_FILE = f"{output_dir}/holdout_test_300.csv"
FOLD_TO_TEST = 1
MODEL_PATH = f"{output_dir}/fold_{FOLD_TO_TEST}"

print(f"Loading Holdout Test Data from: {TEST_FILE}")
df_test = pd.read_csv(TEST_FILE)

test_texts = df_test['Combined'].tolist()
test_labels = df_test['label'].tolist()

# Pad to the longest text up front; the Trainer below is created without a padding collator.
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=256)

test_dataset = TextDataset(test_encodings, test_labels)

print(f"\n--- Loading Model from Fold {FOLD_TO_TEST} ---")
# Rebuild the base classifier the same way as in training, then attach the saved LoRA adapter.
base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
base_model.resize_token_embeddings(len(tokenizer))

inference_model = PeftModel.from_pretrained(base_model, MODEL_PATH)
inference_model.to(device)
inference_model.eval()

print("Running prediction...")
# NOTE(review): no TrainingArguments are passed, so library defaults apply here — confirm acceptable.
tester = Trainer(model=inference_model)
preds_output = tester.predict(test_dataset)

y_pred = np.argmax(preds_output.predictions, axis=1)
y_true = test_labels

acc = accuracy_score(y_true, y_pred)
print(f"\n{'='*30}")
print(f"FINAL HOLDOUT RESULT (Fold {FOLD_TO_TEST})")
print(f"Accuracy: {acc:.4f}")
print(f"{'='*30}\n")
# Report over every known label id (in id order), including classes absent from the holdout set.
all_label_ids = sorted(label2id.values())
target_names = [k for k, v in sorted(label2id.items(), key=lambda item: item[1])]

print(classification_report(
    y_true,
    y_pred,
    labels=all_label_ids,
    target_names=target_names,
    digits=4,
    zero_division=0
))

print("\n--- error examples ---")
df_test['pred'] = y_pred
# Show the first five misclassified rows with their true and predicted class names.
errors = df_test[df_test['label'] != df_test['pred']].head(5)
for i, row in errors.iterrows():
    true_label_name = id2label.get(row['label'], "Unknown")
    pred_label_name = id2label.get(row['pred'], "Unknown")

    print(f"\nText: {str(row['Combined'])[:100]}...")
    print(f"True: {true_label_name} | Pred: {pred_label_name}")

In [None]:
from sklearn.metrics import classification_report

def visualize_classification_results(y_true, y_pred, labels, target_names):
    """Plot per-class metrics as a heatmap and class support as a coloured bar chart.

    Args:
        y_true: true label ids.
        y_pred: predicted label ids.
        labels: label ids to include in the report.
        target_names: display names, in the same order as ``labels``.

    Two figures are shown: a precision/recall/F1 heatmap sorted by ascending F1,
    and a horizontal bar chart of support for classes that have at least one
    example, with each bar coloured by that class's F1.
    """
    report_dict = classification_report(
        y_true,
        y_pred,
        labels=labels,
        target_names=target_names,
        output_dict=True,
        zero_division=0
    )

    df_report = pd.DataFrame(report_dict).transpose()
    # Drop the aggregate rows so only per-class rows remain.
    classes_df = df_report.drop(['accuracy', 'macro avg', 'weighted avg'], errors='ignore')
    classes_df = classes_df.sort_values(by='f1-score', ascending=True)

    # Height scales with the number of rows; the floor keeps the figure valid
    # (non-zero height) when there are few or no rows.
    min_height = 2
    plt.figure(figsize=(10, max(min_height, len(classes_df) * 0.4)))
    sns.heatmap(
        classes_df[['precision', 'recall', 'f1-score']],
        annot=True,
        cmap='RdYlGn',
        fmt='.2f',
        linewidths=.5,
        vmin=0, vmax=1
    )
    plt.title('Detailed metrics by class (Heatmap)', fontsize=15)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Only classes that actually occur in y_true are shown in the support chart.
    plot_df = classes_df[classes_df['support'] > 0]

    fig, ax = plt.subplots(figsize=(12, max(min_height, len(plot_df) * 0.45)))

    norm = plt.Normalize(0, 1)
    colors = plt.cm.viridis(norm(plot_df['f1-score'].values))

    bars = ax.barh(plot_df.index, plot_df['support'], color=colors)

    # Stand-alone mappable so the colourbar reflects the F1 scale used for the bars.
    sm = plt.cm.ScalarMappable(cmap='viridis', norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax)
    cbar.set_label('F1-Score', rotation=270, labelpad=15)

    ax.set_xlabel('Number of examples in the test (support)')
    ax.set_title('Class distribution and their quality (Color = F1)', fontsize=15)

    # Annotate each bar with its support count.
    for bar in bars:
        width = bar.get_width()
        ax.text(width + 0.5, bar.get_y() + bar.get_height()/2,
                f'{int(width)}', va='center', fontsize=9)

    plt.tight_layout()
    plt.show()

In [None]:
# Label ids in ascending order and their names in the same order, then plot the holdout results.
all_label_ids = sorted(label2id.values())
target_names = [k for k, v in sorted(label2id.items(), key=lambda item: item[1])]

visualize_classification_results(y_true, y_pred, all_label_ids, target_names)

In [None]:
# One row per Trainer log entry across all folds; training rows carry 'loss',
# evaluation rows carry 'eval_loss' and the eval_* metrics.
df_history = pd.DataFrame(all_fold_histories)

# Either column is missing when no entry of that kind was logged; fall back to an
# empty frame instead of letting dropna raise KeyError.
empty_history = df_history.iloc[0:0]
train_loss = df_history.dropna(subset=['loss']) if 'loss' in df_history.columns else empty_history

val_metrics = df_history.dropna(subset=['eval_loss']) if 'eval_loss' in df_history.columns else empty_history

plt.figure(figsize=(14, 6))

# Left panel: loss curves. Rows from different folds share epoch values and are aggregated by seaborn.
plt.subplot(1, 2, 1)
if not train_loss.empty:
    sns.lineplot(data=train_loss, x='epoch', y='loss', label='Train Loss', color='blue')
if not val_metrics.empty:
    sns.lineplot(data=val_metrics, x='epoch', y='eval_loss', label='Val Loss', color='red')
plt.title('Training & Validation Loss (Average over Folds)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True, alpha=0.3)

# Right panel: validation metrics, each drawn only if it was logged.
plt.subplot(1, 2, 2)
if not val_metrics.empty and 'eval_f1_macro' in val_metrics.columns:
    sns.lineplot(data=val_metrics, x='epoch', y='eval_f1_macro', label='Val F1 Macro', marker='o', color='orange')

if not val_metrics.empty and 'eval_accuracy' in val_metrics.columns:
    sns.lineplot(data=val_metrics, x='epoch', y='eval_accuracy', label='Val Accuracy', linestyle='--', color='green')

plt.title('Validation Metrics (Average over Folds)')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Smoke test: load one saved fold adapter and classify a single hand-written explanation.
# NOTE(review): this path spells the Drive folder "My Drive" while every other path in the
# notebook uses "MyDrive" — confirm both resolve in the runtime.
SAVED_MODEL_PATH = "/content/drive/My Drive/nlp_math_misunderstanding/weights/fold_2"
MODEL_CHECKPOINT = "roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Loading base model...")
# NOTE(review): num_labels is hard-coded to 33 here, whereas other cells use the computed
# `num_labels`; it must match the classifier head saved with the adapter — confirm.
base_model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=33,
    problem_type="single_label_classification"
)

# Rebinds the notebook-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
base_model.resize_token_embeddings(len(tokenizer))

print(f"Loading LoRA adapters from {SAVED_MODEL_PATH}...")
# Rebinds the notebook-level `model` to the adapter-wrapped classifier.
model = PeftModel.from_pretrained(base_model, SAVED_MODEL_PATH)

model.to(device)
model.eval()

text = "The answer is 5/8 because I added top and bottom."
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # argmax over the whole logits tensor; with a single input this is the predicted class id.
    predicted_class_id = logits.argmax().item()

print(f"Test Prediction Class ID: {predicted_class_id}")

In [None]:
# Print the module structure of whichever model is currently bound to `model`.
print(model)

In [None]:
# Persist the per-fold evaluation dicts again (the training loop writes the same file after each fold).
with open(f"{output_dir}/all_results.json", 'w') as f:
    json.dump(fold_results, f)

# Unweighted mean of accuracy and macro-F1 across the folds recorded in fold_results.
avg_acc = np.mean([res['eval_accuracy'] for res in fold_results])
avg_f1 = np.mean([res['eval_f1_macro'] for res in fold_results])
print(f"\n=== Final Results: Avg Acc: {avg_acc:.4f}, Avg F1: {avg_f1:.4f} ===")

In [None]:
# Row-normalized confusion matrix for the validation split of the last CV fold.
if last_fold_preds_ids is not None:
    print("Building Confusion Matrix for the last fold...")

    # Fix the matrix to every known class, in id order, so its shape always matches
    # the tick labels even when a class is missing from this fold's labels or predictions.
    ordered_classes = sorted(label2id.items(), key=lambda item: item[1])
    class_names = [name for name, _ in ordered_classes]
    class_ids = [class_id for _, class_id in ordered_classes]

    cm = confusion_matrix(last_fold_true_labels, last_fold_preds_ids, labels=class_ids)

    # Normalize each row by its number of true examples; rows with no examples stay
    # all-zero instead of producing NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cmn = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(12, 12))
    sns.heatmap(cmn, annot=True, fmt='.1f',
                xticklabels=class_names,
                yticklabels=class_names,
                cmap='Blues')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('Normalized Confusion Matrix (Last Fold)')
    plt.xticks(rotation=45, ha='right')

    # Save before show(): once the figure has been shown, savefig writes an empty canvas.
    plt.savefig(f"{output_dir}/confusion_matrix.png")
    plt.show()
else:
    print("Error: Predictions were not saved inside the loop.")

In [None]:
# Path to the unlabeled test CSV used by the inference pipeline (the next cell assigns the same value again).
test_path = "/content/drive/MyDrive/nlp_math_misunderstanding/data/test.csv"

In [None]:
# End-to-end inference: load the fold-5 adapter, rebuild the 'Combined' input text for the
# test CSV, predict a misconception per row, and write the predictions to submission.csv.
saved_model_path = f"{output_dir}/fold_5"
test_path = "/content/drive/MyDrive/nlp_math_misunderstanding/data/test.csv"
model_checkpoint = "roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("--- STARTING INFERENCE PIPELINE ---")

print("1. Loading Tokenizer and Model...")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
# Size the embeddings from the tokenizer, exactly as the training and holdout cells do,
# instead of a hard-coded vocabulary size that does not match this tokenizer.
base_model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base_model, saved_model_path)
model.to(device)

trainer = Trainer(model=model, tokenizer=tokenizer)


print(f"2. Loading and processing data from {test_path}...")
try:
    df_test = pd.read_csv(test_path)
    print(f"   Loaded {len(df_test)} rows.")
except FileNotFoundError:
    print("   ERROR: File not found. Please check the path.")
    raise

# Missing fields become empty strings so the concatenation below never yields NaN.
df_test['StudentExplanation'] = df_test['StudentExplanation'].fillna("")
df_test['QuestionText'] = df_test['QuestionText'].fillna("")
df_test['MC_Answer'] = df_test['MC_Answer'].fillna("")

# NOTE(review): presumably this must match how 'Combined' was built for the training CSV — confirm.
df_test['Combined'] = (
    df_test['QuestionText'] + " || " +
    df_test['MC_Answer'] + " || " +
    df_test['StudentExplanation']
)

test_dataset = Dataset.from_pandas(df_test)

def preprocess_function(examples):
    """Tokenize the 'Combined' field of a batch, truncating to 256 tokens."""
    return tokenizer(examples["Combined"], truncation=True, max_length=256)

tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Keep only the model inputs; every other column is dropped before prediction.
columns_to_keep = ['input_ids', 'attention_mask']
columns_to_remove = [col for col in tokenized_test.column_names if col not in columns_to_keep]
tokenized_test_clean = tokenized_test.remove_columns(columns_to_remove)


print("3. Running prediction...")
predictions_output = trainer.predict(tokenized_test_clean)
y_pred_ids = np.argmax(predictions_output.predictions, axis=1)

# id2label is already required above to build the model, so it is always defined here.
labels_map = id2label
y_pred_labels = [labels_map[int(i)] for i in y_pred_ids]

df_test['Predicted_Misconception'] = y_pred_labels
output_filename = "submission.csv"
df_test.to_csv(output_filename, index=False)

print(f"DONE! Results saved to '{output_filename}'")
print(df_test[['Combined', 'Predicted_Misconception']].head(3))

In [None]:
# Paths to the two processed training CSVs (v1 and v2).
# NOTE(review): these repeat data_llm / data_no_llm and are not read by any visible cell.
path_llm = "/content/drive/MyDrive/nlp_math_misunderstanding/data/processed/train_v1.csv"
path_no_llm = "/content/drive/MyDrive/nlp_math_misunderstanding/data/processed/train_v2.csv"

# **Visualization**

In [None]:
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# 2-D t-SNE projection of TF-IDF vectors for a random sample of the training texts.
# Cap the sample at the frame size: DataFrame.sample raises when asked for more rows
# than exist (without replacement).
sample_df = df.sample(min(2000, len(df)), random_state=42)
tfidf = TfidfVectorizer(max_features=500).fit_transform(sample_df['Combined'])

# t-SNE requires perplexity to be smaller than the number of samples.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(sample_df) - 1))
embedding = tsne.fit_transform(tfidf.toarray())

plt.figure(figsize=(10, 8))
# Points are coloured by misconception; the legend is hidden.
sns.scatterplot(
    x=embedding[:, 0],
    y=embedding[:, 1],
    hue=sample_df['Misconception'],
    legend=False,
    palette='tab10',
    s=60, alpha=0.7
)
plt.title('t-SNE Projection of Misconceptions (Semantic Clustering)')
plt.show()

In [None]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from tqdm import tqdm

# 2-D t-SNE projection of first-token hidden states for the holdout texts.
TEST_FILE = f"{output_dir}/holdout_test_300.csv"
FOLD_TO_TEST = 1
# NOTE(review): MODEL_PATH is defined but never loaded in this cell; the embeddings come
# from whichever model is currently bound to the notebook-level `model` — confirm intended.
MODEL_PATH = f"{output_dir}/fold_{FOLD_TO_TEST}"

print(f"Loading Holdout Test Data from: {TEST_FILE}")
df_test = pd.read_csv(TEST_FILE)
# Cap at the file size so sampling cannot fail if the holdout file has fewer rows.
sample_size = min(300, len(df_test))
df_viz = df_test.sample(sample_size, random_state=42).copy()
texts = df_viz['Combined'].tolist()
labels = df_viz['label'].tolist()

labels_names = [id2label[l] for l in labels]

embeddings = []

model.eval()
print("Extracting embeddings from RoBERTa...")
with torch.no_grad():
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256).to(device)
        outputs = model.base_model(**inputs, output_hidden_states=True)

        # Last hidden layer, first sequence in the batch, first token position.
        cls_embedding = outputs.hidden_states[-1][0, 0, :].cpu().numpy()
        embeddings.append(cls_embedding)

embeddings = np.array(embeddings)

print("Running t-SNE...")
# The iteration count is left at the library default (1000, the value previously passed):
# the keyword was renamed from n_iter to max_iter in newer scikit-learn releases, so naming
# it explicitly breaks on one side of that change. Perplexity must stay below the sample count.
tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(embeddings) - 1))
tsne_results = tsne.fit_transform(embeddings)

df_viz['x'] = tsne_results[:, 0]
df_viz['y'] = tsne_results[:, 1]
df_viz['Misconception'] = labels_names

plt.figure(figsize=(16, 10))
sns.scatterplot(
    data=df_viz,
    x='x', y='y',
    hue='Misconception',
    palette='tab20',
    s=70, alpha=0.8,
    legend='full'
)

# Place the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.title('Semantic Landscape of Misconceptions (RoBERTa Embeddings)', fontsize=20)
plt.tight_layout()
plt.show()

In [None]:
# Print the number of distinct misconception labels, then a 1-based alphabetical listing.
unique_misconceptions = df['Misconception'].dropna().unique()

sorted_misconceptions = sorted(unique_misconceptions)
print(f"{len(sorted_misconceptions)}")
for idx, misconception in enumerate(sorted_misconceptions, 1):
    print(f"{idx}. {misconception}")

In [None]:
# Horizontal bar chart of the 30 most frequent misconception labels.
plt.figure(figsize=(12, 6))
top_classes = df['Misconception'].value_counts().head(30)
# NOTE(review): recent seaborn releases warn when `palette` is given without `hue` —
# confirm against the installed version.
sns.barplot(x=top_classes.values, y=top_classes.index, palette='viridis')
plt.title('Top 30 Most Frequent Misconceptions (The "Long Tail" Problem)')
plt.xlabel('Number of Samples')
plt.show()

In [None]:
# Whitespace-separated word count per input, stored as a new column on df.
df['word_count'] = df['Combined'].apply(lambda x: len(str(x).split()))
plt.figure(figsize=(10, 5))
sns.histplot(df['word_count'], bins=50, kde=True, color='purple')
plt.title('Distribution of Text Length (Words per Input)')
plt.xlabel('Word Count')
# The x-axis is clipped at 300 words; longer inputs are not visible in the plot.
plt.xlim(0, 300)
plt.show()

In [None]:
from wordcloud import WordCloud
# Join every input text into one string and render a word cloud from it.
# Rebinds the notebook-level `text`.
text = " ".join(df['Combined'].astype(str).tolist())
wordcloud = WordCloud(width=1600, height=800, background_color='white').generate(text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('What the Model Sees: Common Terms')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-epoch values are typed in by hand in this cell rather than read from the trainer logs.
# The plot title attributes them to fold 2.
epochs = [1, 2, 3]
val_loss = [0.394353, 0.273984, 0.266884]
accuracy = [0.906442, 0.930982, 0.932515]
f1_macro = [0.759786, 0.870845, 0.865808]

plt.style.use('seaborn-v0_8-whitegrid')
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: validation loss.
color_loss = 'tab:red'
ax1.set_xlabel('Epochs', fontsize=12, fontweight='bold')
ax1.set_ylabel('Validation Loss', color=color_loss, fontsize=12, fontweight='bold')
ax1.plot(epochs, val_loss, marker='o', color=color_loss, linewidth=3, label='Validation Loss')
ax1.tick_params(axis='y', labelcolor=color_loss)
ax1.set_xticks(epochs)
ax1.grid(True, alpha=0.3)

# Right axis (shared x): macro-F1 and accuracy.
ax2 = ax1.twinx()
color_f1 = 'tab:blue'
color_acc = 'tab:green'
ax2.set_ylabel('Score (F1 / Accuracy)', color='black', fontsize=12, fontweight='bold')

ax2.plot(epochs, f1_macro, marker='s', color=color_f1, linewidth=3, label='F1-Macro')
ax2.plot(epochs, accuracy, marker='^', color=color_acc, linestyle='--', linewidth=2, label='Accuracy')

ax2.tick_params(axis='y', labelcolor='black')
ax2.set_ylim(0.7, 1.0)
# Merge the handles of both axes into a single legend.
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='center right', fontsize=11, frameon=True, shadow=True)

plt.title('Training Dynamics (Fold 2): Loss Drop vs Metrics Growth', fontsize=14, pad=15)
plt.tight_layout()

# Saved before show(), so the file contains the rendered figure.
plt.savefig("training_dynamics_fold2_manual.png", dpi=300)
plt.show()

In [None]:
# Show how much of the raw training data a length filter on StudentExplanation would drop.
# NOTE: this rebinds the notebook-level `df` to the RAW training CSV; re-run the loading
# cells before re-running any cell that expects the processed frame.
df = pd.read_csv("/content/drive/MyDrive/nlp_math_misunderstanding/data/raw/train.csv")
# Missing explanations count as length 0 rather than as the 3-character string "nan".
df['explanation_len'] = df['StudentExplanation'].fillna("").astype(str).str.len()

FILTER_LIMIT = 250

plt.figure(figsize=(12, 6))

sns.histplot(df['explanation_len'], bins=100, color='skyblue', label='Всі пояснення')

# Dashed line marks the cut-off; the shaded span covers everything the filter would remove.
plt.axvline(x=FILTER_LIMIT, color='red', linestyle='--', linewidth=3, label=f'Твій фільтр ({FILTER_LIMIT} символів)')
plt.axvspan(xmin=FILTER_LIMIT, xmax=df['explanation_len'].max(), color='red', alpha=0.3, label='Видалені дані (Dropped)')

plt.title(f'How much data is removed when filtering: len(StudentExplanation) > {FILTER_LIMIT}', fontsize=14)
plt.xlabel('Number of characters in the explanation', fontsize=12)
plt.ylabel('Number of examples', fontsize=12)
# The x-axis is clipped at 600 characters; longer explanations are off-plot but still counted below.
plt.xlim(0, 600)
plt.legend()
plt.tight_layout()
plt.show()

lost = (df['explanation_len'] > FILTER_LIMIT).sum()
total = len(df)
print(f"Total rows: {total}")
print(f"Will be deleted: {lost} рядків ({lost/total:.2%} від усіх даних)")