In [None]:
import torch
from PIL import Image
import open_clip

## Prepare Dataset

In [None]:
from datasets import load_dataset
# Fetch the "mrpc" configuration of the GLUE benchmark from the Hugging Face
# Hub; each record holds a sentence pair, a label and an index.
dataset = load_dataset(path="nyu-mll/glue", name="mrpc")

In [59]:
# Peek at the first record of the test split to check the field names.
print(dataset["test"][0])

{'sentence1': "PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So .", 'sentence2': 'Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .', 'label': 1, 'idx': 0}


## Prepare Model
We use a pretrained OpenCLIP model (RN50 with the OpenAI weights) for the tests.

In [54]:

# Build the RN50 CLIP model with the OpenAI pretrained weights. The second and
# third return values are image transforms; only the text tower is used below.
model, _, preprocess = open_clip.create_model_and_transforms('RN50', pretrained='openai')
# open_clip hands the model back in training mode; switch to inference mode so
# layers such as BatchNorm behave deterministically while encoding.
model.eval()
# Tokenizer matching the RN50 text encoder.
tokenizer = open_clip.get_tokenizer('RN50')



## Tests

In [64]:
from torch.nn.functional import cosine_similarity

In [65]:
# Cosine-similarity cut-off: pairs scoring below it are predicted as
# "not a paraphrase" (0), everything else as "paraphrase" (1).
threshold = 0.9
prediction = []  # predicted labels, using the dataset's 0/1 encoding
actual = []      # ground-truth labels taken from the dataset
for test_case in dataset['test']:
    # The tokenizer is given a list of strings, so wrap each sentence to form
    # a batch of one.
    tokenized_text_1 = tokenizer([test_case['sentence1']])
    tokenized_text_2 = tokenizer([test_case['sentence2']])

    # Inference only: no gradients are needed for the embeddings.
    with torch.no_grad(), torch.cuda.amp.autocast():
        text_embedding_1 = model.encode_text(tokenized_text_1)
        text_embedding_2 = model.encode_text(tokenized_text_2)

    # Compare against `threshold` (not a repeated literal) so the cut-off can
    # be tuned in one place, and record negatives as 0 so predictions share
    # the label encoding of `actual`.
    if cosine_similarity(text_embedding_1, text_embedding_2) < threshold:
        prediction.append(0)
    else:
        prediction.append(1)
    actual.append(test_case['label'])


    





## Calculate Metrics

In [68]:
from sklearn.metrics import confusion_matrix


In [80]:
cm = confusion_matrix(actual,prediction)
tn, fp, fn, tp =  [i for i in cm.ravel() if i != 0]
recall = tp / (tp + fn)       
precision = tp / (tp + fp)

In [81]:
# Report the evaluation metrics, one per line.
print(
    'Results for mrpc dataset:',
    f'Recall: {recall}',
    f'Precision: {precision}',
    sep='\n',
)

Results for mrpc dataset:
Recall: 0.6198779424585876
Precision: 0.7383177570093458
