In [30]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import numpy as np
import evaluate
import os
import torch
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from collections import Counter
import re
import random

In [31]:
# Directory containing the fine-tuned DistilBERT checkpoint.
# (The original cell assigned this path twice in a row; once is enough.)
model_path = "./distillbert-base-finetuned"
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Tokenizer and classifier are both restored from the same checkpoint directory.
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

In [32]:
# Load the IMDB dataset and keep a handle on each of its two splits.
dataset = load_dataset('imdb')
train_data, test_data = dataset["train"], dataset["test"]


In [33]:
from datasets import concatenate_datasets

def extract_phrase(ds, phrase):
    """Collect the rows whose text mentions ``phrase`` from one or more datasets.

    Matching is a case-insensitive substring test on each row's ``"text"``
    field, so ``"7/10"`` also matches inside ``"17/10"``.

    Args:
        ds: Iterable of dataset objects, each exposing ``.filter(fn)``.
        phrase: Substring to look for.

    Returns:
        Whatever ``concatenate_datasets`` returns for the per-dataset filtered
        subsets, passed in the same order as ``ds``.
    """
    # Lower-case the phrase once; the original re-lowered it for every row.
    needle = phrase.lower()

    def mentions_phrase(row):
        return needle in row["text"].lower()

    # The loop variable used to be called ``set``, which shadowed the builtin.
    matches = [split.filter(mentions_phrase) for split in ds]
    return concatenate_datasets(matches)

In [34]:
# Select every training review that contains the string "7/10", then show the subset.
phrase_name = extract_phrase(ds=[train_data], phrase="7/10")
phrase_name

Dataset({
    features: ['text', 'label'],
    num_rows: 198
})

In [None]:
def run_model_on_subset(dataset, model=model, tokenizer=tokenizer, batch_size=32):
    """Classify every text in ``dataset`` and return predictions beside the gold labels.

    This single definition replaces the two same-named definitions that used to
    live in this cell (the second silently shadowed the first). Inference runs
    in mini-batches so memory use no longer grows with the size of the subset.
    Because padding is now applied per batch, probabilities may differ from a
    single-batch run by floating-point noise.

    Args:
        dataset: Object with ``"text"`` and ``"label"`` columns.
        model: HuggingFace sequence-classification model; defaults to the
            notebook-level ``model``.
        tokenizer: Tokenizer matching ``model``; defaults to the notebook-level
            ``tokenizer``.
        batch_size: Number of texts tokenized and scored per forward pass.

    Returns:
        dict with four parallel lists:
            ``"text"``     - the input texts, coerced to ``str``
            ``"gold"``     - the values of the ``"label"`` column
            ``"pred"``     - argmax class index per text
            ``"pos_prob"`` - softmax probability assigned to class index 1

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    texts = [str(t) for t in dataset["text"]]
    gold = list(dataset["label"])

    # Inference only: make sure dropout is disabled.
    model.eval()
    # Inputs must live on the same device as the weights.
    device = model.device

    pred = []
    pos_prob = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                return_tensors="pt",
            ).to(device)
            logits = model(**enc).logits
            probs = softmax(logits, dim=1).cpu().numpy()
            pred.extend(probs.argmax(axis=1).tolist())
            pos_prob.extend(probs[:, 1].tolist())

    return {
        "text": texts,
        "gold": gold,
        "pred": pred,
        "pos_prob": pos_prob,
    }


# Score the "7/10" training subset extracted earlier.
results = run_model_on_subset(phrase_name, model, tokenizer)


In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def summarize_results(gold, pred):
    """Print a plain-text evaluation summary for a set of predictions.

    The printout consists of the sample count, the accuracy (four decimals),
    the confusion matrix and the classification report (four decimals), in
    that order.

    Args:
        gold: Sequence of reference labels.
        pred: Sequence of predicted labels, aligned with ``gold``.

    Returns:
        None. Everything is written to standard output.
    """
    print("===== SUMMARY =====")
    print(f"Total samples: {len(gold)}")

    # Overall fraction of correct predictions.
    print(f"Accuracy: {accuracy_score(gold, pred):.4f}")

    # Gold-vs-predicted count table.
    print("\nConfusion Matrix:")
    print(confusion_matrix(gold, pred))

    # Per-class precision / recall / F1.
    print("\nClassification Report:")
    report = classification_report(gold, pred, digits=4)
    print(report)


In [39]:
# Report how the model performs on the unmodified "7/10" subset scored earlier.
summarize_results(gold=results["gold"], pred=results["pred"])

===== SUMMARY =====
Total samples: 198
Accuracy: 0.9697

Confusion Matrix:
[[  5   1]
 [  5 187]]

Classification Report:
              precision    recall  f1-score   support

           0     0.5000    0.8333    0.6250         6
           1     0.9947    0.9740    0.9842       192

    accuracy                         0.9697       198
   macro avg     0.7473    0.9036    0.8046       198
weighted avg     0.9797    0.9697    0.9733       198



In [51]:
def replace_phrase(dataset, old_phrase, new_phrase):
    """Return a copy of ``dataset`` with ``old_phrase`` swapped for ``new_phrase``.

    Every case-insensitive occurrence of ``old_phrase`` in the ``"text"``
    column is replaced. Both phrases are treated literally: ``old_phrase`` is
    regex-escaped, and ``new_phrase`` is inserted verbatim rather than being
    interpreted as a regex replacement template (so backslashes and sequences
    such as ``\\1`` or ``\\g<0>`` are preserved as-is).

    Args:
        dataset: Object exposing ``.map(fn, batched=True)`` with a ``"text"`` column.
        old_phrase: Phrase to search for; must be non-empty.
        new_phrase: Text inserted in place of each match.

    Returns:
        The result of ``dataset.map`` with the updated ``"text"`` column.

    Raises:
        ValueError: If ``old_phrase`` is empty. An empty pattern matches at
            every position and would insert ``new_phrase`` between all characters.
    """
    if not old_phrase:
        raise ValueError("old_phrase must be a non-empty string")

    pattern = re.compile(re.escape(old_phrase), re.IGNORECASE)

    def literal_replacement(_match):
        # A callable replacement bypasses re's template expansion of backslashes.
        return new_phrase

    def replace_fn(batch):
        updated = [pattern.sub(literal_replacement, t) for t in batch["text"]]
        return {"text": updated}

    return dataset.map(replace_fn, batched=True)

In [41]:
def flip_test(ds, phrase, replacement, model=model, tokenizer=tokenizer):
    """Compare model behaviour before and after swapping ``phrase`` for ``replacement``.

    The rows mentioning ``phrase`` are extracted from ``ds`` and scored, then
    the phrase is replaced and the edited rows are scored again. Two summaries
    are printed: first for the untouched subset, then for the edited one. Gold
    labels are the same in both.

    Args:
        ds: List of datasets to search.
        phrase: Phrase to look for and replace.
        replacement: Text substituted for ``phrase``.
        model: Classifier passed to ``run_model_on_subset``.
        tokenizer: Tokenizer passed to ``run_model_on_subset``.

    Returns:
        Tuple ``(subset, flipped_set)``: the extracted rows and their edited copy.
    """
    # Rows that mention the phrase, scored as they are.
    matched = extract_phrase(ds, phrase)
    before = run_model_on_subset(matched, model, tokenizer)

    # Same rows with the phrase swapped, scored again.
    swapped = replace_phrase(matched, phrase, replacement)
    after = run_model_on_subset(swapped, model, tokenizer)

    # Print the original summary first, then the edited one.
    for outcome in (before, after):
        summarize_results(outcome["gold"], outcome["pred"])

    return matched, swapped





# Flip test on the IMDB test split: replace "7/10" with "1/10" and compare summaries.
# NOTE(review): the recorded output for this cell shows identical metrics before and
# after the swap and no "Map" progress line — confirm replace_phrase was defined and
# up to date when this cell was executed.
old_phrase, new_phrase = "7/10", "1/10"
x, y = flip_test([test_data], old_phrase, new_phrase, model=model, tokenizer=tokenizer)

===== SUMMARY =====
Total samples: 198
Accuracy: 0.8990

Confusion Matrix:
[[  7   1]
 [ 19 171]]

Classification Report:
              precision    recall  f1-score   support

           0     0.2692    0.8750    0.4118         8
           1     0.9942    0.9000    0.9448       190

    accuracy                         0.8990       198
   macro avg     0.6317    0.8875    0.6783       198
weighted avg     0.9649    0.8990    0.9232       198

===== SUMMARY =====
Total samples: 198
Accuracy: 0.8990

Confusion Matrix:
[[  7   1]
 [ 19 171]]

Classification Report:
              precision    recall  f1-score   support

           0     0.2692    0.8750    0.4118         8
           1     0.9942    0.9000    0.9448       190

    accuracy                         0.8990       198
   macro avg     0.6317    0.8875    0.6783       198
weighted avg     0.9649    0.8990    0.9232       198



In [61]:
# Flip test on the training split: replace the name "voight" with "baldwin".
old_phrase, new_phrase = "voight", "baldwin"
x, y = flip_test([train_data], old_phrase, new_phrase, model=model, tokenizer=tokenizer)

Map: 100%|██████████| 68/68 [00:00<00:00, 16638.24 examples/s]


===== SUMMARY =====
Total samples: 68
Accuracy: 1.0000

Confusion Matrix:
[[10  0]
 [ 0 58]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        10
           1     1.0000    1.0000    1.0000        58

    accuracy                         1.0000        68
   macro avg     1.0000    1.0000    1.0000        68
weighted avg     1.0000    1.0000    1.0000        68

===== SUMMARY =====
Total samples: 68
Accuracy: 1.0000

Confusion Matrix:
[[10  0]
 [ 0 58]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        10
           1     1.0000    1.0000    1.0000        58

    accuracy                         1.0000        68
   macro avg     1.0000    1.0000    1.0000        68
weighted avg     1.0000    1.0000    1.0000        68



In [55]:
# Count how many texts in `x` contain the string "10/10".
sum("10/10" in review for review in x["text"][:])

241

In [62]:
# Load the reviews stored in the local file numeric.csv.
numeric = load_dataset("csv", data_files="numeric.csv")

# Rows of that file which mention "7/10".
num = extract_phrase(ds=[numeric["train"]], phrase="7/10")

# Flip test on the same data: swap "7/10" for "1/10" and compare summaries.
old_phrase, new_phrase = "7/10", "1/10"
x, y = flip_test([numeric["train"]], old_phrase, new_phrase, model=model, tokenizer=tokenizer)

===== SUMMARY =====
Total samples: 100
Accuracy: 0.8800

Confusion Matrix:
[[43  7]
 [ 5 45]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8958    0.8600    0.8776        50
           1     0.8654    0.9000    0.8824        50

    accuracy                         0.8800       100
   macro avg     0.8806    0.8800    0.8800       100
weighted avg     0.8806    0.8800    0.8800       100

===== SUMMARY =====
Total samples: 100
Accuracy: 0.9100

Confusion Matrix:
[[48  2]
 [ 7 43]]

Classification Report:
              precision    recall  f1-score   support

           0     0.8727    0.9600    0.9143        50
           1     0.9556    0.8600    0.9053        50

    accuracy                         0.9100       100
   macro avg     0.9141    0.9100    0.9098       100
weighted avg     0.9141    0.9100    0.9098       100



In [63]:
# Preview only the first few texts instead of dumping the entire column into the output.
x["text"][:10]

['Surprisingly tender and well-paced, this drama felt like a simple 7/10 at first but grew into something I genuinely loved.',
 'From the opening scene, it delivered steady charm; I figured a 7/10 movie night, yet it kept winning me over.',
 'What impressed me most was the quiet confidence—easily a 7/10 on craft alone, and even higher on heart.',
 'Not flashy, just deeply sincere; I walked in expecting a 7/10 and left feeling satisfied and moved.',
 'The characters click immediately, turning what could be a routine 7/10 into a small standout.',
 'Right away you can tell care went into this; it plays like a 7/10 on paper but lands closer to great.',
 'Honestly, I was ready for a safe 7/10, but the emotional build sneaks up on you in the best way.',
 "A gentle story with real warmth—if you call it a 7/10, that's already a compliment it earns.",
 'Without trying too hard, it becomes a comfort watch; I mentally tagged it 7/10 early on, then kept bumping it up.',
 "At first glance it's a 7/

In [64]:
def delete_phrase_dataset(dataset, phrase):
    """Return a copy of ``dataset`` with every occurrence of ``phrase`` removed.

    Matching is case-insensitive and literal (``phrase`` is regex-escaped).
    After removal, each run of two or more spaces is collapsed to a single
    space and leading/trailing whitespace is stripped.

    Args:
        dataset: Object exposing ``.map(fn, batched=True)`` with a ``"text"`` column.
        phrase: Phrase to delete.

    Returns:
        The result of ``dataset.map`` with the cleaned ``"text"`` column.
    """
    pattern = re.compile(re.escape(phrase), re.IGNORECASE)
    # A single ``.replace("  ", " ")`` pass leaves double spaces behind whenever
    # deletion produces three or more spaces in a row (e.g. adjacent matches),
    # so collapse whole runs instead.
    space_runs = re.compile(r" {2,}")

    def delete_fn(batch):
        updated = [
            space_runs.sub(" ", pattern.sub("", t)).strip()
            for t in batch["text"]
        ]
        return {"text": updated}

    return dataset.map(delete_fn, batched=True)


In [68]:
def delete_test(ds, phrase, model=model, tokenizer=tokenizer):
    """Compare model behaviour before and after deleting ``phrase`` from matching rows.

    The rows mentioning ``phrase`` are extracted from ``ds`` and scored, then
    the phrase is removed and the edited rows are scored again. Two summaries
    are printed: first for the untouched subset, then for the edited one.

    Args:
        ds: List of datasets to search.
        phrase: Phrase to look for and delete.
        model: Classifier passed to ``run_model_on_subset``.
        tokenizer: Tokenizer passed to ``run_model_on_subset``.

    Returns:
        Tuple ``(subset, deleted_set)``: the extracted rows and their edited copy.
    """
    # Rows that mention the phrase, scored as they are.
    matched = extract_phrase(ds, phrase)
    before = run_model_on_subset(matched, model, tokenizer)

    # Same rows with the phrase removed, scored again.
    stripped = delete_phrase_dataset(matched, phrase)
    after = run_model_on_subset(stripped, model, tokenizer)

    # Print the original summary first, then the edited one.
    for outcome in (before, after):
        summarize_results(outcome["gold"], outcome["pred"])

    return matched, stripped


# Candidate tokens to probe with the deletion test on the training split.
positive_candidate_shortcuts = [
    # numeric ratings
    "7/10", "8/10", "9/10", "10/10",
    "matthau",  # actor
    "explores",
    "hawke",  # actor
    "voight",  # actor
    "peters", "victoria", "powell", "sadness", "walsh", "mann",
    "winters", "brosnan", "layers", "friendship", "ralph", "montana",
    "watson", "sullivan", "detract", "conveys", "loneliness", "lemmon",
    "nancy",
]

# Run the deletion test once per candidate, with a banner line before each pair of summaries.
for phrase in positive_candidate_shortcuts:
    print(f"----------------------{phrase}---------------------------")
    subset, deleted = delete_test([train_data], phrase, model=model, tokenizer=tokenizer)


----------------------7/10---------------------------
===== SUMMARY =====
Total samples: 198
Accuracy: 0.9697

Confusion Matrix:
[[  5   1]
 [  5 187]]

Classification Report:
              precision    recall  f1-score   support

           0     0.5000    0.8333    0.6250         6
           1     0.9947    0.9740    0.9842       192

    accuracy                         0.9697       198
   macro avg     0.7473    0.9036    0.8046       198
weighted avg     0.9797    0.9697    0.9733       198

===== SUMMARY =====
Total samples: 198
Accuracy: 0.9596

Confusion Matrix:
[[  5   1]
 [  7 185]]

Classification Report:
              precision    recall  f1-score   support

           0     0.4167    0.8333    0.5556         6
           1     0.9946    0.9635    0.9788       192

    accuracy                         0.9596       198
   macro avg     0.7056    0.8984    0.7672       198
weighted avg     0.9771    0.9596    0.9660       198

----------------------8/10----------------------

In [69]:

# Second list of candidate tokens to probe with the deletion test on the training split.
negative_candidate_shortcuts = [
    "2/10", "boll", "4/10", "3/10", "1/10",
    "nope", "camcorder", "baldwin", "arty", "cannibal",
    "rubber", "shoddy", "barrel", "plodding", "plastic",
    "mutant", "costs", "claus", "ludicrous", "nonsensical",
    "bother", "disjointed",
]

# Run the deletion test once per candidate, with a banner line before each pair of summaries.
for phrase in negative_candidate_shortcuts:
    print(f"----------------------{phrase}---------------------------")
    subset, deleted = delete_test([train_data], phrase, model=model, tokenizer=tokenizer)

----------------------2/10---------------------------
===== SUMMARY =====
Total samples: 124
Accuracy: 0.9839

Confusion Matrix:
[[121   2]
 [  0   1]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9837    0.9918       123
           1     0.3333    1.0000    0.5000         1

    accuracy                         0.9839       124
   macro avg     0.6667    0.9919    0.7459       124
weighted avg     0.9946    0.9839    0.9878       124

===== SUMMARY =====
Total samples: 124
Accuracy: 0.9758

Confusion Matrix:
[[120   3]
 [  0   1]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9756    0.9877       123
           1     0.2500    1.0000    0.4000         1

    accuracy                         0.9758       124
   macro avg     0.6250    0.9878    0.6938       124
weighted avg     0.9940    0.9758    0.9829       124

----------------------boll----------------------

Filter: 100%|██████████| 25000/25000 [00:00<00:00, 297573.05 examples/s]
Map: 100%|██████████| 73/73 [00:00<00:00, 15521.86 examples/s]


===== SUMMARY =====
Total samples: 73
Accuracy: 1.0000

Confusion Matrix:
[[61  0]
 [ 0 12]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        61
           1     1.0000    1.0000    1.0000        12

    accuracy                         1.0000        73
   macro avg     1.0000    1.0000    1.0000        73
weighted avg     1.0000    1.0000    1.0000        73

===== SUMMARY =====
Total samples: 73
Accuracy: 1.0000

Confusion Matrix:
[[61  0]
 [ 0 12]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        61
           1     1.0000    1.0000    1.0000        12

    accuracy                         1.0000        73
   macro avg     1.0000    1.0000    1.0000        73
weighted avg     1.0000    1.0000    1.0000        73

----------------------barrel---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 291172.44 examples/s]
Map: 100%|██████████| 83/83 [00:00<00:00, 19539.05 examples/s]


===== SUMMARY =====
Total samples: 83
Accuracy: 0.9880

Confusion Matrix:
[[66  1]
 [ 0 16]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9851    0.9925        67
           1     0.9412    1.0000    0.9697        16

    accuracy                         0.9880        83
   macro avg     0.9706    0.9925    0.9811        83
weighted avg     0.9887    0.9880    0.9881        83

===== SUMMARY =====
Total samples: 83
Accuracy: 0.9880

Confusion Matrix:
[[66  1]
 [ 0 16]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9851    0.9925        67
           1     0.9412    1.0000    0.9697        16

    accuracy                         0.9880        83
   macro avg     0.9706    0.9925    0.9811        83
weighted avg     0.9887    0.9880    0.9881        83

----------------------plodding---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 302850.95 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 14736.50 examples/s]


===== SUMMARY =====
Total samples: 50
Accuracy: 0.9200

Confusion Matrix:
[[40  1]
 [ 3  6]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9302    0.9756    0.9524        41
           1     0.8571    0.6667    0.7500         9

    accuracy                         0.9200        50
   macro avg     0.8937    0.8211    0.8512        50
weighted avg     0.9171    0.9200    0.9160        50

===== SUMMARY =====
Total samples: 50
Accuracy: 0.9400

Confusion Matrix:
[[41  0]
 [ 3  6]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9318    1.0000    0.9647        41
           1     1.0000    0.6667    0.8000         9

    accuracy                         0.9400        50
   macro avg     0.9659    0.8333    0.8824        50
weighted avg     0.9441    0.9400    0.9351        50

----------------------plastic---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 300051.79 examples/s]
Map: 100%|██████████| 140/140 [00:00<00:00, 25115.59 examples/s]


===== SUMMARY =====
Total samples: 140
Accuracy: 0.9929

Confusion Matrix:
[[111   1]
 [  0  28]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9911    0.9955       112
           1     0.9655    1.0000    0.9825        28

    accuracy                         0.9929       140
   macro avg     0.9828    0.9955    0.9890       140
weighted avg     0.9931    0.9929    0.9929       140

===== SUMMARY =====
Total samples: 140
Accuracy: 0.9929

Confusion Matrix:
[[111   1]
 [  0  28]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9911    0.9955       112
           1     0.9655    1.0000    0.9825        28

    accuracy                         0.9929       140
   macro avg     0.9828    0.9955    0.9890       140
weighted avg     0.9931    0.9929    0.9929       140

----------------------mutant---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 279676.20 examples/s]
Map: 100%|██████████| 78/78 [00:00<00:00, 18527.34 examples/s]


===== SUMMARY =====
Total samples: 78
Accuracy: 0.9487

Confusion Matrix:
[[59  3]
 [ 1 15]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9833    0.9516    0.9672        62
           1     0.8333    0.9375    0.8824        16

    accuracy                         0.9487        78
   macro avg     0.9083    0.9446    0.9248        78
weighted avg     0.9526    0.9487    0.9498        78

===== SUMMARY =====
Total samples: 78
Accuracy: 0.9615

Confusion Matrix:
[[59  3]
 [ 0 16]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9516    0.9752        62
           1     0.8421    1.0000    0.9143        16

    accuracy                         0.9615        78
   macro avg     0.9211    0.9758    0.9447        78
weighted avg     0.9676    0.9615    0.9627        78

----------------------costs---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 277493.56 examples/s]
Map: 100%|██████████| 236/236 [00:00<00:00, 38702.52 examples/s]


===== SUMMARY =====
Total samples: 236
Accuracy: 0.9958

Confusion Matrix:
[[188   1]
 [  0  47]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9947    0.9973       189
           1     0.9792    1.0000    0.9895        47

    accuracy                         0.9958       236
   macro avg     0.9896    0.9974    0.9934       236
weighted avg     0.9959    0.9958    0.9958       236

===== SUMMARY =====
Total samples: 236
Accuracy: 0.9958

Confusion Matrix:
[[188   1]
 [  0  47]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9947    0.9973       189
           1     0.9792    1.0000    0.9895        47

    accuracy                         0.9958       236
   macro avg     0.9896    0.9974    0.9934       236
weighted avg     0.9959    0.9958    0.9958       236

----------------------claus---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 276069.64 examples/s]
Map: 100%|██████████| 158/158 [00:00<00:00, 27588.36 examples/s]


===== SUMMARY =====
Total samples: 158
Accuracy: 0.9810

Confusion Matrix:
[[77  2]
 [ 1 78]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9872    0.9747    0.9809        79
           1     0.9750    0.9873    0.9811        79

    accuracy                         0.9810       158
   macro avg     0.9811    0.9810    0.9810       158
weighted avg     0.9811    0.9810    0.9810       158

===== SUMMARY =====
Total samples: 158
Accuracy: 0.9810

Confusion Matrix:
[[77  2]
 [ 1 78]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9872    0.9747    0.9809        79
           1     0.9750    0.9873    0.9811        79

    accuracy                         0.9810       158
   macro avg     0.9811    0.9810    0.9810       158
weighted avg     0.9811    0.9810    0.9810       158

----------------------ludicrous---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 305236.53 examples/s]
Map: 100%|██████████| 205/205 [00:00<00:00, 23439.53 examples/s]


===== SUMMARY =====
Total samples: 205
Accuracy: 0.9707

Confusion Matrix:
[[164   2]
 [  4  35]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9762    0.9880    0.9820       166
           1     0.9459    0.8974    0.9211        39

    accuracy                         0.9707       205
   macro avg     0.9611    0.9427    0.9515       205
weighted avg     0.9704    0.9707    0.9704       205

===== SUMMARY =====
Total samples: 205
Accuracy: 0.9463

Confusion Matrix:
[[158   8]
 [  3  36]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9814    0.9518    0.9664       166
           1     0.8182    0.9231    0.8675        39

    accuracy                         0.9463       205
   macro avg     0.8998    0.9374    0.9169       205
weighted avg     0.9503    0.9463    0.9475       205

----------------------nonsensical---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 299104.32 examples/s]
Map: 100%|██████████| 77/77 [00:00<00:00, 18147.98 examples/s]


===== SUMMARY =====
Total samples: 77
Accuracy: 0.9610

Confusion Matrix:
[[61  1]
 [ 2 13]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9683    0.9839    0.9760        62
           1     0.9286    0.8667    0.8966        15

    accuracy                         0.9610        77
   macro avg     0.9484    0.9253    0.9363        77
weighted avg     0.9605    0.9610    0.9605        77

===== SUMMARY =====
Total samples: 77
Accuracy: 0.9610

Confusion Matrix:
[[61  1]
 [ 2 13]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9683    0.9839    0.9760        62
           1     0.9286    0.8667    0.8966        15

    accuracy                         0.9610        77
   macro avg     0.9484    0.9253    0.9363        77
weighted avg     0.9605    0.9610    0.9605        77

----------------------bother---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 278697.22 examples/s]
Map: 100%|██████████| 633/633 [00:00<00:00, 45908.74 examples/s]


===== SUMMARY =====
Total samples: 633
Accuracy: 0.9716

Confusion Matrix:
[[472  12]
 [  6 143]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9874    0.9752    0.9813       484
           1     0.9226    0.9597    0.9408       149

    accuracy                         0.9716       633
   macro avg     0.9550    0.9675    0.9610       633
weighted avg     0.9722    0.9716    0.9718       633

===== SUMMARY =====
Total samples: 633
Accuracy: 0.9700

Confusion Matrix:
[[471  13]
 [  6 143]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9874    0.9731    0.9802       484
           1     0.9167    0.9597    0.9377       149

    accuracy                         0.9700       633
   macro avg     0.9520    0.9664    0.9590       633
weighted avg     0.9708    0.9700    0.9702       633

----------------------disjointed---------------------------


Filter: 100%|██████████| 25000/25000 [00:00<00:00, 303029.50 examples/s]
Map: 100%|██████████| 98/98 [00:00<00:00, 22663.16 examples/s]


===== SUMMARY =====
Total samples: 98
Accuracy: 0.9592

Confusion Matrix:
[[75  3]
 [ 1 19]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9868    0.9615    0.9740        78
           1     0.8636    0.9500    0.9048        20

    accuracy                         0.9592        98
   macro avg     0.9252    0.9558    0.9394        98
weighted avg     0.9617    0.9592    0.9599        98

===== SUMMARY =====
Total samples: 98
Accuracy: 0.9286

Confusion Matrix:
[[73  5]
 [ 2 18]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9733    0.9359    0.9542        78
           1     0.7826    0.9000    0.8372        20

    accuracy                         0.9286        98
   macro avg     0.8780    0.9179    0.8957        98
weighted avg     0.9344    0.9286    0.9304        98

