# irt

> Some basic tooling in item response theory for analyzing the results of ML benchmarks

In [None]:
#| default_exp irt

In the following we will use item response theory (IRT) to analyze benchmark results. 

A common model in IRT is the Rasch model, which described the probability of a correct response ($P(X=1)$) as a function of the ability of the person ($\theta_n$) and the difficulty ($\beta_i$) of the item: 

$$
P(X_{ni} =1| \theta_n, \beta_i) = \frac{\exp\{\theta_n - \beta_i\}}{1 + \exp\{\theta_n - \beta_i\}}
$$

We can make this more general by adding a discrimination parameter $\alpha_i$: This parameter describes how well the item discriminates between models of different abilities. The probability of a correct response is then given by:

$$
P(X_{ni} =1| \theta_n, \beta_i, \alpha_i) = \frac{\exp\{\alpha_i(\theta_n - \beta_i)\}}{1 + \exp\{\alpha_i(\theta_n - \beta_i)\}}
$$

This is the 2PL model.

In [1]:
#| hide
from nbdev.showdoc import *

In [2]:
# | hide
import pandas as pd 
import pickle

In [29]:
# | export
import numpy as np 
from girth import twopl_mml, rasch_mml, ability_mle

In [4]:
with open('/Users/kevinmaikjablonka/Library/CloudStorage/Dropbox/fileshares/model_score_dicts.pkl', 'rb') as f:
    model_score_dicts = pickle.load(f)

In [5]:
model_score_dicts['overall']['Claude-2-Zero-T']

Unnamed: 0,canary_0,description_0,keywords_0,llm_extraction_count_0,metrics_correct_classes,metrics_extra_classes,metrics_f1,metrics_hamming,metrics_incorrect_classes,metrics_missed_classes,...,is_simple_safety,is_analytical_chemistry,is_periodic_table,is_general_chemistry_exam,is_biomolecular,is_xray,is_materials_science,is_molsim,requires_calculation,is_name
0,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,ghs pictograms,"[pubchem, pictograms, safety]",0,0.0,1.0,0.000000,2.0,1.0,1.0,...,False,False,False,False,False,False,False,False,,False
1,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,Predicting the appropriate stoichiometric coef...,"[stoichiometric coefficient, physical-chemistr...",0,,,,,,,...,False,False,False,False,False,False,False,False,,False
2,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,Chemical mixing compatibility test,"[chemicals, safety compatibility test, reactiv...",0,3.0,3.0,0.666667,1.0,3.0,0.0,...,False,False,False,False,False,False,False,False,,False
3,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,"Diazonium Salts characteristics questions, inc...","[chemistry, molecules, diazonium salts, organi...",0,1.0,0.0,1.000000,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,,False
4,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,"Amines, Aromatic characteristics questions, in...","[amines, aromatic, requires-knowledge, safety,...",0,1.0,0.0,1.000000,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2849,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,Solid state chemistry questions related to poi...,"[solid state chemistry, requires-reasoning, di...",0,1.0,0.0,1.000000,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,,False
2850,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,General questions on apatite,"[apatite, substitutions in apatite, chemical f...",0,1.0,0.0,1.000000,0.0,0.0,0.0,...,False,False,False,False,False,False,True,False,,False
2851,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,Chemical mixing compatibility test,"[chemicals, safety compatibility test, reactiv...",0,2.0,3.0,0.571429,1.5,3.0,0.0,...,False,False,False,False,False,False,False,False,,False
2852,BENCHMARK DATA SHOULD NEVER APPEAR IN TRAINING...,Preference for further drug development,"[preference, requires-intuition]",0,1.0,0.0,1.000000,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,,False


In [31]:
# | export
from enum import Enum
from dataclasses import dataclass
from typing import Dict

In [32]:
# | export
class ScoringType(Enum): 
    BINARY = 'binary'
    PARTIAL = 'partial'

We define a container to store meta information for each question (item). We will track some identifier and what type of scoring the question uses.

In [33]:
# | export

@dataclass
class QuestionMetadata:
    """Base class for question metadata"""
    question_id: str
    scoring_type: ScoringType

    def compute_score(self, response) -> float:
        """Base method for computing scores"""
        raise NotImplementedError

In [34]:
# | export

@dataclass
class BinaryQuestionMetadata(QuestionMetadata):
    """Metadata for binary scored questions"""
    def __init__(self, question_id: str):
        super().__init__(question_id, ScoringType.BINARY)

    def compute_score(self, row, column: str='all_correct_') -> float:
        """Use the all_correct column directly"""
        return float(row[column])

In [63]:
# | export

class BenchmarkAnalyzer:
    def __init__(self):
        self.model_dataframes: Dict[str, pd.DataFrame] = {}
        self.question_metadata: Dict[str, QuestionMetadata] = {}
        self.score_matrix = None
        self.model_ids = None

    def add_model_results(self, model_id: str, results_df: pd.DataFrame):
        """Add a model's results DataFrame"""
        self.model_dataframes[model_id] = results_df

    def add_question_metadata(self, metadata: QuestionMetadata):
        """Add metadata for a question"""
        self.question_metadata[metadata.question_id] = metadata

    def compute_score_matrix(self) -> np.ndarray:
        """Compute score matrix using metadata-specific scoring"""
        if not self.model_dataframes or not self.question_metadata:
            raise ValueError("Need both model results and question metadata")

        self.model_ids = list(self.model_dataframes.keys())
        question_ids = list(self.question_metadata.keys())

        # Initialize score matrix
        self.score_matrix = np.full(
            (len(self.model_ids), len(question_ids)), 
            np.nan
        )

        # Compute scores
        for model_idx, model_id in enumerate(self.model_ids):
            df = self.model_dataframes[model_id]
            for q_idx, q_id in enumerate(question_ids):
                if q_id not in df.index:
                    continue

                metadata = self.question_metadata[q_id]
                row = df.loc[q_id]

                self.score_matrix[model_idx, q_idx] = metadata.compute_score(row)


        return self.score_matrix

    def fit_irt(self, model='2pl') -> Dict[str, pd.DataFrame]:
        if self.score_matrix is None:
            self.compute_score_matrix()

        binary_matrix = (self.score_matrix >= 0.5).astype(int).T #  needs to be [n_items, n_participants]
        
        if model == '2pl':
            results = twopl_mml(binary_matrix)
            difficulties = results['Difficulty']
            discriminations = results['Discrimination']
        else:  # rasch
            results = rasch_mml(binary_matrix)
            difficulties = results['Difficulty']
            discriminations = np.ones_like(difficulties) 

        print(binary_matrix.T.shape)
        print(difficulties.shape)
        print(discriminations.shape)
        
        abilities = ability_mle(
            binary_matrix,  
            difficulties,
            discriminations,
            no_estimate=np.nan  
        )

        return {
            'difficulties': difficulties,
            'discriminations': discriminations,
            'abilities': abilities
        }


    def analyze_extreme_items(difficulties, discriminations, question_ids, threshold=0.95):
        """Identify items with extreme parameters"""
        extreme_items = pd.DataFrame({
            'question_id': question_ids,
            'difficulty': difficulties,
            'discrimination': discriminations
        })

        # Find items with extreme values
        extreme_items['is_extreme'] = (
            (discriminations > threshold * 5.0) |  # High discrimination
            (difficulties > threshold * 6.0) |     # Very difficult
            (difficulties < -4.0)                  # Very easy
        )

        return extreme_items[extreme_items['is_extreme']]



Now, let's the class with some of our data

In [59]:
analyzer = BenchmarkAnalyzer()

for model_id, df in model_score_dicts['overall'].items():
    analyzer.add_model_results(model_id, df)


# for now, let's just add the binary questions
for i, row in model_score_dicts['overall']['Claude-2-Zero-T'].iterrows():
    if row['metrics_hamming']:
        analyzer.add_question_metadata(BinaryQuestionMetadata(i))

In [60]:
results = analyzer.fit_irt(model='rasch')

(21, 1614)
(1614,)
(1614,)


  return -np.log(otpt).dot(counts)


In [61]:
results

{'difficulties': array([2.62365185, 0.3477837 , 5.99993935, ..., 0.11529809, 5.99993935,
        3.42585858], shape=(1614,)),
 'discriminations': array([1., 1., 1., ..., 1., 1., 1.], shape=(1614,)),
 'abilities': array([ 1.3137931 ,  1.25513802,  0.90108169, -0.02353844, -0.21020001,
         1.46848924,  0.13060575,  0.87811677,  0.94259198,  0.50034376,
        -2.2391909 ,  0.2404148 , -2.58453054, -3.97347412,  0.46936694,
         0.69657075, -0.25956567,  0.2272392 ,  0.04702948, -0.07219556,
        -2.91616702])}

In [62]:
dict(zip(analyzer.model_ids, results['abilities']))

{'Mistral-Large-2': np.float64(1.3137930956517858),
 'Llama-3.1-70B-Instruct': np.float64(1.2551380191680315),
 'Claude-3.5 (Sonnet)': np.float64(0.9010816922440369),
 'Mistral-8x7b-Instruct': np.float64(-0.023538437873462405),
 'Command-R+': np.float64(-0.21020000564395697),
 'Llama-3.1-405B-Instruct': np.float64(1.4684892351094294),
 'Llama-3.1-8B-Instruct': np.float64(0.13060574837560496),
 'GPT-4o': np.float64(0.8781167650567518),
 'Llama-3-70B-Instruct': np.float64(0.9425919771674786),
 'PaperQA2': np.float64(0.5003437630342042),
 'Gemma-1.1-7B-it': np.float64(-2.239190899661585),
 'Gemma-2-9B-it': np.float64(0.24041480098122026),
 'Llama-2-70B Chat': np.float64(-2.5845305399235063),
 'Galatica-120b': np.float64(-3.9734741234871334),
 'Llama-3-8B-Instruct': np.float64(0.4693669436520287),
 'Gemini-Pro': np.float64(0.6965707476184959),
 'GPT-4': np.float64(-0.25956567447336226),
 'Phi-3-Medium-4k-Instruct': np.float64(0.22723920350088497),
 'Claude-3 (Opus)': np.float64(0.047029477

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()