In [1]:
import torch
from PIL import Image
import open_clip

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

## Prepare Dataset

In [2]:
from datasets import load_dataset
# MRPC (paraphrase detection) task of the GLUE benchmark, fetched from the Hugging Face Hub.
dataset = load_dataset(
    path="nyu-mll/glue",
    name="mrpc",
)

In [3]:
# Peek at one test example to confirm the schema: sentence1, sentence2, label, idx.
print(dataset["test"][0])

{'sentence1': "PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .", 'sentence2': 'Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .', 'label': 1, 'idx': 0}


## Prepare Model
We use a pretrained OpenCLIP model (`RN50` with the `openai` weights) for the tests.

In [4]:

# Single source of truth for the architecture so the model and tokenizer cannot drift apart.
MODEL_NAME = 'RN50'

model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained='openai')
# Per the open_clip README the model is returned in train mode; switch to
# inference mode before using it to produce embeddings.
model.eval()
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

## Tests

In [10]:
from torch.nn.functional import cosine_similarity
from sklearn.metrics import confusion_matrix

In [None]:
def run_tests(dataset, n, threshold=0.9, return_metrics=False):
    """Predict paraphrase labels for the test split from text-embedding similarity.

    Each pair in ``dataset['test']`` is tokenized with the module-level
    ``tokenizer`` and encoded with the module-level ``model``. A pair is
    predicted as class 1 unless the cosine similarity of its two embeddings
    is below ``threshold``, in which case it is predicted as class 0.

    Args:
        dataset: Mapping with a ``'test'`` split whose items expose the keys
            ``'sentence1'``, ``'sentence2'`` and ``'label'``. Labels are
            assumed to be 0/1 (the metrics treat 1 as the positive class).
        n: Number of full evaluation passes to run.
        threshold: Minimum cosine similarity for a pair to be predicted as 1.
        return_metrics: When true, also return the per-pass metrics dict.

    Returns:
        ``(actual, prediction)`` from the last pass (two empty lists when
        ``n`` is 0). With ``return_metrics=True`` a third element is
        appended: ``{"index": [...], "recall": [...], "precision": [...]}``
        holding one entry per pass.
    """
    result = {"index": [], "recall": [], "precision": []}
    # Defined up front so n == 0 returns empty lists instead of raising.
    actual, prediction = [], []
    test_split = dataset['test']

    for run_index in range(n):
        prediction = []
        actual = []
        for test_case in test_split:
            tokenized_text_1 = tokenizer([test_case['sentence1']])
            tokenized_text_2 = tokenizer([test_case['sentence2']])

            # NOTE(review): nothing in this notebook moves the model or the
            # inputs to a GPU, so CUDA autocast is probably a no-op — confirm.
            with torch.no_grad(), torch.cuda.amp.autocast():
                text_embedding_1 = model.encode_text(tokenized_text_1)
                text_embedding_2 = model.encode_text(tokenized_text_2)

            # One pair per call, so the similarity tensor holds a single value.
            similarity = cosine_similarity(text_embedding_1, text_embedding_2).item()
            # Use 0 (not -1) for negatives so predictions share the label space
            # of the ground truth and the confusion matrix stays 2x2.
            prediction.append(0 if similarity < threshold else 1)
            actual.append(test_case['label'])

        # Explicit labels pin the matrix to [[tn, fp], [fn, tp]] even when one
        # class never occurs, so the unpacking below cannot fail.
        tn, fp, fn, tp = confusion_matrix(actual, prediction, labels=[0, 1]).ravel()
        recall = float(tp / (tp + fn)) if (tp + fn) else 0.0
        precision = float(tp / (tp + fp)) if (tp + fp) else 0.0

        result["index"].append(run_index)
        result["recall"].append(recall)
        result["precision"].append(precision)

    if return_metrics:
        return actual, prediction, result
    return actual, prediction

## Calculate Metrics

In [8]:
# Produce the inputs in this cell so the notebook survives Restart & Run All;
# no earlier cell assigns `actual` / `prediction` at top level.
actual, prediction = run_tests(dataset, 1)

# Collapse any "not a paraphrase" code (-1 or 0) onto class 0 so predictions
# share the 0/1 label space of the ground truth.
predicted_labels = [1 if p == 1 else 0 for p in prediction]

# Explicit labels pin the layout to [[tn, fp], [fn, tp]] even if a cell is zero.
cm = confusion_matrix(actual, predicted_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Guard against an empty positive class / no positive predictions.
recall = tp / (tp + fn) if (tp + fn) else 0.0
precision = tp / (tp + fp) if (tp + fp) else 0.0

In [9]:
# Emit the three-line summary with a single call; `sep` supplies the line breaks.
print(
    'Results for mrpc dataset:',
    f'Recall: {recall}',
    f'Precision: {precision}',
    sep='\n',
)

Results for mrpc dataset:
Recall: 0.6198779424585876
Precision: 0.7383177570093458
