## Analysis of LMs with minicons

For docs, see https://github.com/kanishkamisra/minicons

### Sequence probabilities

In [49]:
from minicons import scorer
import numpy as np

In [50]:
# Masked-LM alternative (e.g. BERT), kept for reference:
# mlm_model = scorer.MaskedLMScorer('bert-base-uncased', 'cpu')

# Autoregressive (GPT/Llama-style) scorers, both placed on the CPU:
# one loaded from 'fiction-model/final', the other from 'formal-model/final'.
ilm_model_fiction = scorer.IncrementalLMScorer(
    'fiction-model/final',
    'cpu',
)
ilm_model_formal = scorer.IncrementalLMScorer(
    'formal-model/final',
    'cpu',
)

In [51]:
# JSONL file holding the good/bad sentence pairs to evaluate
file_path = 'blimp_pairs/animate_subject_trans.jsonl'

In [52]:
import json

# Load the sentence pairs: the file is read line by line, one JSON object per line.
test_pairs = []
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. extra trailing newlines);
            # json.loads('') would raise JSONDecodeError.
            continue
        data = json.loads(line)
        # Keep only the fields the evaluation below reads.
        test_pairs.append({
            'good': data['sentence_good'],
            'bad': data['sentence_bad'],
            'phenomenon': data['UID'],
        })

In [59]:
def test_on_pairs(test_pairs, ilm_model):
    print(f"Testing on {len(test_pairs)} pairs from {test_pairs[0]['phenomenon']}")
    print("=" * 60)

    correct_predictions = 0
    total_pairs = len(test_pairs)
    diffs_correct = []
    diffs_wrong = []

    for i, pair in enumerate(test_pairs):
        good_sent = pair['good']
        bad_sent = pair['bad']
        
        # Calculate surprisal
        good_surprisal = ilm_model.sequence_score([good_sent], reduction=lambda x: -x.sum(0).item())[0]
        bad_surprisal = ilm_model.sequence_score([bad_sent], reduction=lambda x: -x.sum(0).item())[0]

        # Model is correct if good sentence has lower surprisal
        correct = good_surprisal < bad_surprisal
        if correct:
            correct_predictions += 1
            diffs_correct.append(bad_surprisal - good_surprisal)
        else:
            diffs_wrong.append(good_surprisal - bad_surprisal)

        print(f"Pair {i}:")
        print(f"  ✓ Good: {good_sent} Surprisal: {good_surprisal:.2f}")
        print(f"  ✗ Bad:  {bad_sent} Surprisal: {bad_surprisal:.2f}")
        print(f"  Result: {'✓ CORRECT' if correct else '✗ WRONG'}")
        print()

    accuracy = correct_predictions / total_pairs
    print(f"FINAL RESULTS:")
    print(f"Correct predictions: {correct_predictions}/{total_pairs}")
    print(f"Mean surprisal difference for correct pairs: {np.mean(diffs_correct):.2f}")
    print(f"Mean surprisal difference for wrong pairs: {np.mean(diffs_wrong):.2f}")
    print(f"Accuracy: {accuracy*100:.1f}%")


In [60]:
# Run the pair evaluation with the formal-register model
test_on_pairs(test_pairs=test_pairs, ilm_model=ilm_model_formal)

Testing on 1000 pairs from animate_subject_trans
Pair 0:
  ✓ Good: Tina revealed Margaret. Surprisal: 58.27
  ✗ Bad:  The horse revealed Margaret. Surprisal: 57.08
  Result: ✗ WRONG

Pair 1:
  ✓ Good: Danielle visited Irene. Surprisal: 56.29
  ✗ Bad:  The eye visited Irene. Surprisal: 46.24
  Result: ✗ WRONG

Pair 2:
  ✓ Good: Paul runs around the art galleries. Surprisal: 74.97
  ✗ Bad:  The river runs around the art galleries. Surprisal: 76.25
  Result: ✓ CORRECT

Pair 3:
  ✓ Good: Most banks have praised Raymond. Surprisal: 76.34
  ✗ Bad:  The jackets have praised Raymond. Surprisal: 91.56
  Result: ✓ CORRECT

Pair 4:
  ✓ Good: Every doctor was selling some restaurants. Surprisal: 95.19
  ✗ Bad:  A cup was selling some restaurants. Surprisal: 80.68
  Result: ✗ WRONG

Pair 5:
  ✓ Good: Steve bikes to a public park. Surprisal: 79.68
  ✗ Bad:  Every fork bikes to a public park. Surprisal: 76.27
  Result: ✗ WRONG

Pair 6:
  ✓ Good: Beth scares Roger. Surprisal: 48.98
  ✗ Bad:  A carriag

In [61]:
# Run the pair evaluation with the fiction model
test_on_pairs(test_pairs=test_pairs, ilm_model=ilm_model_fiction)

Testing on 1000 pairs from animate_subject_trans
Pair 0:
  ✓ Good: Tina revealed Margaret. Surprisal: 32.89
  ✗ Bad:  The horse revealed Margaret. Surprisal: 34.59
  Result: ✓ CORRECT

Pair 1:
  ✓ Good: Danielle visited Irene. Surprisal: 49.26
  ✗ Bad:  The eye visited Irene. Surprisal: 37.04
  Result: ✗ WRONG

Pair 2:
  ✓ Good: Paul runs around the art galleries. Surprisal: 66.06
  ✗ Bad:  The river runs around the art galleries. Surprisal: 62.04
  Result: ✗ WRONG

Pair 3:
  ✓ Good: Most banks have praised Raymond. Surprisal: 57.51
  ✗ Bad:  The jackets have praised Raymond. Surprisal: 69.30
  Result: ✓ CORRECT

Pair 4:
  ✓ Good: Every doctor was selling some restaurants. Surprisal: 65.57
  ✗ Bad:  A cup was selling some restaurants. Surprisal: 67.89
  Result: ✓ CORRECT

Pair 5:
  ✓ Good: Steve bikes to a public park. Surprisal: 54.12
  ✗ Bad:  Every fork bikes to a public park. Surprisal: 66.63
  Result: ✓ CORRECT

Pair 6:
  ✓ Good: Beth scares Roger. Surprisal: 59.10
  ✗ Bad:  A car