# Development of RoBERTa Model

## Supervised Fine Tuning

In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, classification_report, precision_score, recall_score
import gc

### Import Data 

In [2]:
dir_path = 'combined_500ms_whisper_diarization_stable_ts_aligned'

In [3]:
# Initialize an empty dictionary to store the data
aligned_dict = {}

# Iterate through files in the directory
for file_name in os.listdir(dir_path):
    # Check if the file is a CSV
    if file_name.endswith('.csv'):
        # Create the full file path
        file_path = os.path.join(dir_path, file_name)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Use the file name without the extension as the dictionary key
        key = os.path.splitext(file_name)[0]
        
        # Store the DataFrame in the dictionary
        aligned_dict[key] = df

In [4]:
keys = list(aligned_dict.keys())

In [5]:
# Group keys by their first three digits
grouped_keys = {}
for key in keys:
    prefix = key[:3]
    if prefix not in grouped_keys:
        grouped_keys[prefix] = []
    grouped_keys[prefix].append(key)

# Get list of prefixes
prefixes = list(grouped_keys.keys())

In [6]:
test_prefixes =  ['051', '134', '251', '027', '129', '038', '108', '107', '252']
all_train_prefixes = list(set(prefixes) - set(test_prefixes))
train_prefixes, val_prefixes = train_test_split(all_train_prefixes, test_size=0.10, random_state=99)

In [9]:
train_keys = [key for key in keys if key[:3] in train_prefixes]
val_keys = [key for key in keys if key[:3] in val_prefixes]
test_keys = [key for key in keys if key[:3] in test_prefixes]

In [10]:
train_dfs = [aligned_dict[key][['duration',"speaker", "label", "transcript"]] for key in train_keys]
val_dfs = [aligned_dict[key][['duration',"speaker", "label",  "transcript"]] for key in val_keys]
test_dfs = [aligned_dict[key][['duration',"speaker", "label", "transcript"]] for key in test_keys]

train_df = pd.concat(train_dfs, ignore_index=True)
val_df = pd.concat(val_dfs, ignore_index=True)
test_df = pd.concat(test_dfs, ignore_index=True)

### Data Preprocessing

In [11]:
train_df = train_df[train_df["speaker"] != "Student"]
val_df = val_df[val_df["speaker"] != "Student"]
test_df = test_df[test_df["speaker"] != "Student"]

train_df = train_df[train_df["label"] != "UNI"]
val_df = val_df[val_df["label"] != "UNI"]
test_df = test_df[test_df["label"] != "UNI"]

train_df = train_df[train_df["duration"] > 0.5]
val_df = val_df[val_df["duration"] > 0.5]
test_df = test_df[test_df["duration"] >0.5]

In [12]:
train_df['label'] = train_df['label'].apply(lambda x: ' '.join([label for label in x.split() if label not in ['UNI', 'SIL']]))
val_df['label'] = val_df['label'].apply(lambda x: ' '.join([label for label in x.split() if label not in ['UNI', 'SIL']]))
test_df['label'] = test_df['label'].apply(lambda x: ' '.join([label for label in x.split() if label not in ['UNI', 'SIL']]))

In [13]:
train_df = train_df[~train_df["transcript"].isnull()]
val_df = val_df[~val_df["transcript"].isnull()]
test_df = test_df[~test_df["transcript"].isnull()]

In [29]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(self.labels[idx])
        }

In [30]:
# Initialize tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [31]:
# Convert labels to multi-hot encoding
mlb = MultiLabelBinarizer()
# Fit on training data only to prevent data leakage
train_labels = train_df['label'].str.split()
mlb.fit(train_labels)

# Transform all datasets
train_encoded_labels = mlb.transform(train_labels)
val_encoded_labels = mlb.transform(val_df['label'].str.split())
test_encoded_labels = mlb.transform(test_df['label'].str.split())

In [21]:
pd.options.display.max_colwidth = 500

In [35]:
# Create datasets
train_dataset = TextClassificationDataset(
    train_df['transcript'].values, 
    train_encoded_labels,
    tokenizer
)

val_dataset = TextClassificationDataset(
    val_df['transcript'].values, 
    val_encoded_labels,
    tokenizer
)

test_dataset = TextClassificationDataset(
    test_df['transcript'].values, 
    test_encoded_labels,
    tokenizer
)

# Create data loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=8,
    shuffle=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False
)

# Initialize model
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training setup

In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

In [37]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            loss = criterion(outputs.logits, labels)
            total_loss += loss.item()
            
            predictions = torch.sigmoid(outputs.logits)
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    return total_loss / len(dataloader), np.array(all_predictions), np.array(all_labels)

In [None]:
# Early stopping parameters
patience = 3  # Number of epochs to wait before early stopping
min_delta = 0.001  # Minimum change in validation loss to qualify as an improvement

# Training loop
num_epochs = 10
best_val_loss = float('inf')
patience_counter = 0
early_stop = False

## Training Loop

In [38]:
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_predictions, val_true_labels = evaluate(model, val_loader, criterion, device)
    
    print(f'Epoch {epoch+1}:')
    print(f'Training Loss: {train_loss:.4f}')
    print(f'Validation Loss: {val_loss:.4f}')
    
    # Calculate and print metrics
    val_pred_binary = (val_predictions > 0.5).astype(int)
    
    macro_f1 = f1_score(val_true_labels, val_pred_binary, average='macro')
    micro_f1 = f1_score(val_true_labels, val_pred_binary, average='micro')
    
    print(f'Validation Macro F1: {macro_f1:.4f}')
    print(f'Validation Micro F1: {micro_f1:.4f}')
    
    # Check if validation loss improved
    if val_loss < best_val_loss - min_delta:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model_whisper_diarization.pt')
        print("Saved best model!")
    else:
        patience_counter += 1
        print(f"Validation loss didn't improve. Patience: {patience_counter}/{patience}")
    
    # Check early stopping condition
    if patience_counter >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs!")
        early_stop = True
        break
        
    print("-" * 50)

# Print training summary
print("\nTraining Summary:")
print(f"Best validation loss: {best_val_loss:.4f}")
print(f"Training stopped after {epoch + 1} epochs")
if early_stop:
    print("Reason: Early stopping")
else:
    print("Reason: Completed all epochs")

Epoch 1:
Training Loss: 0.3369
Validation Loss: 0.2807
Validation Macro F1: 0.2585
Validation Micro F1: 0.6969
Saved best model!
--------------------------------------------------
Epoch 2:
Training Loss: 0.2788
Validation Loss: 0.2597
Validation Macro F1: 0.3051
Validation Micro F1: 0.7254
Saved best model!
--------------------------------------------------
Epoch 3:
Training Loss: 0.2469
Validation Loss: 0.2649
Validation Macro F1: 0.3399
Validation Micro F1: 0.7380
Validation loss didn't improve. Patience: 1/3
--------------------------------------------------
Epoch 4:
Training Loss: 0.2197
Validation Loss: 0.2588
Validation Macro F1: 0.5225
Validation Micro F1: 0.7530
Validation loss didn't improve. Patience: 2/3
--------------------------------------------------
Epoch 5:
Training Loss: 0.1924
Validation Loss: 0.2537
Validation Macro F1: 0.5069
Validation Micro F1: 0.7572
Saved best model!
--------------------------------------------------
Epoch 6:
Training Loss: 0.1683
Validation Lo

## Evaluation

In [39]:
# Final evaluation on test set
model.load_state_dict(torch.load('best_model_whisper_diarization.pt'))
test_loss, test_predictions, test_true_labels = evaluate(model, test_loader, criterion, device)
test_pred_binary = (test_predictions > 0.5).astype(int)

# Calculate final test metrics
test_macro_f1 = f1_score(test_true_labels, test_pred_binary, average='macro')
test_micro_f1 = f1_score(test_true_labels, test_pred_binary, average='micro')
print("\nFinal Test Results:")
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Macro F1: {test_macro_f1:.4f}')
print(f'Test Micro F1: {test_micro_f1:.4f}')

# Print label-wise performance
for i, label in enumerate(mlb.classes_):
    label_f1 = f1_score(test_true_labels[:, i], test_pred_binary[:, i])
    print(f'{label}: F1 = {label_f1:.4f}')


Final Test Results:
Test Loss: 0.2828
Test Macro F1: 0.4849
Test Micro F1: 0.7038
BSP: F1 = 0.5799
GPRS: F1 = 0.3463
IST: F1 = 0.7608
NEU: F1 = 0.1619
RED: F1 = 0.2254
REP: F1 = 0.3027
ST: F1 = 0.4834
SV: F1 = 0.7946
aAFF: F1 = 0.6470
aCORR: F1 = 0.3293
aOTR: F1 = 0.7890
sOTR: F1 = 0.3990


## Broader Labels

In [40]:
# Create mapping dictionary from specific labels to broader concepts
# Using underscores to keep multi-word concepts together
label_to_concept = {
    'IST': 'Teacher_Talk',
    'ST': 'Teacher_Talk',
    'aOTR': 'Opportunity_to_Respond',
    'sOTR': 'Opportunity_to_Respond',
    'REP': 'Corrective_Behavioral_Feedback',
    'RED': 'Corrective_Behavioral_Feedback',
    'GPRS': 'Praise',
    'BSP': 'Praise',
    'aAFF': 'Academic_Feedback',
    'aCORR': 'Academic_Feedback',
    'SV': 'Other',
    'NEU': 'Other'
}

# Function to map detailed labels to broader concepts
def map_to_broader_concepts(label_string):
    specific_labels = label_string.split()
    broader_concepts = set(label_to_concept[label] for label in specific_labels)  # using set to remove duplicates
    return ' '.join(broader_concepts)

# Transform labels in all datasets
train_df['broader_label'] = train_df['label'].apply(map_to_broader_concepts)
val_df['broader_label'] = val_df['label'].apply(map_to_broader_concepts)
test_df['broader_label'] = test_df['label'].apply(map_to_broader_concepts)

# Create new MultiLabelBinarizer for broader concepts
broader_mlb = MultiLabelBinarizer()
train_broader_labels = train_df['broader_label'].str.split()
broader_mlb.fit(train_broader_labels)

# Transform all datasets with broader concepts
train_encoded_labels = broader_mlb.transform(train_broader_labels)
val_encoded_labels = broader_mlb.transform(val_df['broader_label'].str.split())
test_encoded_labels = broader_mlb.transform(test_df['broader_label'].str.split())

# Let's see the distribution of broader concepts
print("Broader concept frequencies in training set:")
label_frequencies = train_encoded_labels.sum(axis=0)
label_stats = pd.DataFrame({
    'Broader Concept': broader_mlb.classes_,
    'Count': label_frequencies,
    'Percentage': (label_frequencies / len(train_encoded_labels)) * 100
}).sort_values('Count', ascending=False)
print(label_stats.to_string(index=False))

print("\nDistribution of number of broader concepts per instance:")
label_counts_per_instance = train_encoded_labels.sum(axis=1)
unique_counts, counts = np.unique(label_counts_per_instance, return_counts=True)
for num_labels, count in zip(unique_counts, counts):
    percentage = (count / len(train_encoded_labels)) * 100
    print(f"{num_labels} concepts: {count} instances ({percentage:.2f}%)")

# Let's also check a few examples to make sure it's working correctly
print("\nSample mappings:")
for _, row in train_df[['label', 'broader_label']].head(10).iterrows():
    print(f"Original: {row['label']} -> Broader: {row['broader_label']}")# Create datasets with broader concepts
train_dataset = TextClassificationDataset(
    train_df['transcript'].values, 
    train_encoded_labels,
    tokenizer
)

val_dataset = TextClassificationDataset(
    val_df['transcript'].values, 
    val_encoded_labels,
    tokenizer
)

test_dataset = TextClassificationDataset(
    test_df['transcript'].values, 
    test_encoded_labels,
    tokenizer
)

Broader concept frequencies in training set:
               Broader Concept  Count  Percentage
                  Teacher_Talk   3586   68.252760
                         Other   3551   67.586601
        Opportunity_to_Respond   2910   55.386372
             Academic_Feedback    880   16.749144
                        Praise    533   10.144652
Corrective_Behavioral_Feedback    368    7.004187

Distribution of number of broader concepts per instance:
0 concepts: 2 instances (0.04%)
1 concepts: 1053 instances (20.04%)
2 concepts: 2310 instances (43.97%)
3 concepts: 1460 instances (27.79%)
4 concepts: 374 instances (7.12%)
5 concepts: 51 instances (0.97%)
6 concepts: 4 instances (0.08%)

Sample mappings:
Original: NEU -> Broader: Other
Original: BSP ST aOTR SV -> Broader: Other Praise Opportunity_to_Respond Teacher_Talk
Original: SV IST SV aAFF IST ST SV -> Broader: Academic_Feedback Other Teacher_Talk
Original: NEU -> Broader: Other
Original: NEU ST SV -> Broader: Other Teacher_Talk
Origi

## Data Setup

In [41]:
# Create data loaders with original batch size
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialize model with gradient checkpointing for memory efficiency
model = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=len(broader_mlb.classes_),
    problem_type="multi_label_classification"
)
model.gradient_checkpointing_enable()  # Enable gradient checkpointing to save memory

# Training setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = torch.nn.BCEWithLogitsLoss()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Setup

In [None]:
# Early stopping parameters
patience = 3
min_delta = 0.001
patience_counter = 0
early_stop = False

# Training loop
num_epochs = 5
best_val_loss = float('inf')
best_macro_f1 = 0


In [44]:
for epoch in range(num_epochs):
    # Training
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        optimizer.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        
        # Clear memory
        del outputs, loss
        torch.cuda.empty_cache()
    
    # Validation
    model.eval()
    val_loss = 0
    val_preds = []
    val_true = []
    
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            val_loss += loss.item()
            
            preds = torch.sigmoid(outputs.logits).cpu().numpy()
            val_preds.extend(preds)
            val_true.extend(labels.cpu().numpy())
            
            # Clear memory
            del outputs, loss
            torch.cuda.empty_cache()
    
    gc.collect()  # Garbage collection
    
    # Calculate metrics
    val_preds = np.array(val_preds) > 0.5
    val_true = np.array(val_true)
    
    avg_val_loss = val_loss/len(val_loader)
    macro_f1 = f1_score(val_true, val_preds, average='macro')
    micro_f1 = f1_score(val_true, val_preds, average='micro')
    
    print(f"\nEpoch {epoch + 1}")
    print(f"Training Loss: {total_loss/len(train_loader):.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Micro F1: {micro_f1:.4f}")
    
    # Print metrics for each broader concept
    print("\nPer-concept metrics:")
    for i, concept in enumerate(broader_mlb.classes_):
        concept_f1 = f1_score(val_true[:, i], val_preds[:, i])
        precision = precision_score(val_true[:, i], val_preds[:, i])
        recall = recall_score(val_true[:, i], val_preds[:, i])
        print(f"{concept:25} - F1: {concept_f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")

    # Save best model and check early stopping based on both metrics
    improved = False
    
    if macro_f1 > best_macro_f1 + min_delta:
        best_macro_f1 = macro_f1
        improved = True
        torch.save(model.state_dict(), 'best_model_whisper_diarization_broader_concepts.pt')
        print("Saved best model based on Macro F1!")
    
    if avg_val_loss < best_val_loss - min_delta:
        best_val_loss = avg_val_loss
        improved = True
    
    # Update early stopping counter
    if not improved:
        patience_counter += 1
        print(f"No improvement in either metric. Patience: {patience_counter}/{patience}")
    else:
        patience_counter = 0
        print("Metrics improved!")

    # Check early stopping condition
    if patience_counter >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs!")
        early_stop = True
        break

    # Clear memory after each epoch
    del val_preds, val_true
    gc.collect()
    torch.cuda.empty_cache()

# Print training summary
print("\nTraining Summary:")
print(f"Best validation loss: {best_val_loss:.4f}")
print(f"Best macro F1: {best_macro_f1:.4f}")
print(f"Training stopped after {epoch + 1} epochs")
if early_stop:
    print("Reason: Early stopping")
else:
    print("Reason: Completed all epochs")


Epoch 1
Training Loss: 0.4437
Validation Loss: 0.4140
Macro F1: 0.5744
Micro F1: 0.7665

Per-concept metrics:
Academic_Feedback         - F1: 0.6977, Precision: 0.8824, Recall: 0.5769
Corrective_Behavioral_Feedback - F1: 0.0000, Precision: 0.0000, Recall: 0.0000
Opportunity_to_Respond    - F1: 0.8871, Precision: 0.8652, Recall: 0.9102
Other                     - F1: 0.7529, Precision: 0.6037, Recall: 1.0000
Praise                    - F1: 0.3500, Precision: 0.3443, Recall: 0.3559
Teacher_Talk              - F1: 0.7589, Precision: 0.6825, Recall: 0.8546


  _warn_prf(average, modifier, msg_start, len(result))


Saved best model based on Macro F1!
Metrics improved!

Epoch 2
Training Loss: 0.3794
Validation Loss: 0.3839
Macro F1: 0.6357
Micro F1: 0.7866

Per-concept metrics:
Academic_Feedback         - F1: 0.7891, Precision: 0.8406, Recall: 0.7436
Corrective_Behavioral_Feedback - F1: 0.1538, Precision: 1.0000, Recall: 0.0833
Opportunity_to_Respond    - F1: 0.8878, Precision: 0.9169, Recall: 0.8605
Other                     - F1: 0.7529, Precision: 0.6037, Recall: 1.0000
Praise                    - F1: 0.4557, Precision: 0.9000, Recall: 0.3051
Teacher_Talk              - F1: 0.7745, Precision: 0.6786, Recall: 0.9021
Saved best model based on Macro F1!
Metrics improved!

Epoch 3
Training Loss: 0.3388
Validation Loss: 0.3685
Macro F1: 0.6822
Micro F1: 0.7949

Per-concept metrics:
Academic_Feedback         - F1: 0.8162, Precision: 0.7939, Recall: 0.8397
Corrective_Behavioral_Feedback - F1: 0.3529, Precision: 0.6000, Recall: 0.2500
Opportunity_to_Respond    - F1: 0.8940, Precision: 0.9221, Recall: 0

## Evaluation

In [45]:
# Load best model and evaluate on test set
model.load_state_dict(torch.load('best_model_whisper_diarization_broader_concepts.pt'))
model.eval()
test_preds = []
test_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.sigmoid(outputs.logits).cpu().numpy()
        test_preds.extend(preds)
        test_true.extend(labels.cpu().numpy())
        
        # Clear memory
        del outputs
        torch.cuda.empty_cache()

test_preds = np.array(test_preds) > 0.5
test_true = np.array(test_true)

print("\nFinal Test Set Results:")
print(f"Macro F1: {f1_score(test_true, test_preds, average='macro'):.4f}")
print(f"Micro F1: {f1_score(test_true, test_preds, average='micro'):.4f}")

print("\nPer-concept test metrics:")
for i, concept in enumerate(broader_mlb.classes_):
    concept_f1 = f1_score(test_true[:, i], test_preds[:, i])
    precision = precision_score(test_true[:, i], test_preds[:, i])
    recall = recall_score(test_true[:, i], test_preds[:, i])
    print(f"{concept:25} - F1: {concept_f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")


Final Test Set Results:
Macro F1: 0.6619
Micro F1: 0.7754

Per-concept test metrics:
Academic_Feedback         - F1: 0.5965, Precision: 0.6892, Recall: 0.5258
Corrective_Behavioral_Feedback - F1: 0.3844, Precision: 0.5074, Recall: 0.3094
Opportunity_to_Respond    - F1: 0.7894, Precision: 0.8205, Recall: 0.7606
Other                     - F1: 0.8119, Precision: 0.6871, Recall: 0.9919
Praise                    - F1: 0.5787, Precision: 0.6441, Recall: 0.5253
Teacher_Talk              - F1: 0.8105, Precision: 0.7385, Recall: 0.8980


In [46]:
from sklearn.metrics import classification_report

In [47]:
print(classification_report(test_true, test_preds))

              precision    recall  f1-score   support

           0       0.69      0.53      0.60       426
           1       0.51      0.31      0.38       223
           2       0.82      0.76      0.79      1412
           3       0.69      0.99      0.81      1975
           4       0.64      0.53      0.58       217
           5       0.74      0.90      0.81      1774

   micro avg       0.72      0.84      0.78      6027
   macro avg       0.68      0.67      0.66      6027
weighted avg       0.73      0.84      0.77      6027
 samples avg       0.72      0.84      0.75      6027

