In [1]:
import pandas as pd

# Read the SNLI training split (JSON Lines) and keep only the two sentence columns.
df = (
    pd.read_json('./snli_1.0/snli_1.0_train.jsonl', lines=True)
    .loc[:, ['sentence1', 'sentence2']]
)

# Unique `sentence1` strings, in order of first appearance; this list is what
# the later cells score and search.
corpus = df.loc[:, 'sentence1'].drop_duplicates().tolist()

# Displayed as the cell output: number of distinct sentences.
len(corpus)

150736

In [18]:
import numpy as np
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from tqdm import tqdm

# Score every sentence in `corpus` with the SST-2 fine-tuned DistilBERT
# sentiment classifier and collect the per-class softmax probabilities in
# `classification` (one row per sentence, same order as `corpus`).
MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME)

# Use a GPU when one is visible and fall back to CPU otherwise; the recorded
# CPU-only run of this cell took roughly 2h23m.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # evaluation mode: inference only, no training-time behaviour
batch_size = 32
max_length = 128  # token cap per sentence; longer inputs are truncated

results = []
with torch.no_grad():  # no autograd bookkeeping needed for inference
    for i in tqdm(range(0, len(corpus), batch_size)):
        batch = corpus[i : i + batch_size]
        inputs = tokenizer(
            batch,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # Tokenizer output is created on CPU; move it to the model's device.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        logits = model(**inputs).logits
        # Bring probabilities back to host memory so they can be stacked with numpy.
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
        results.append(probs)

if results:
    classification = np.concatenate(results, axis=0)  # shape (N, num_labels)
else:
    # np.concatenate raises on an empty list; keep the (N, num_labels) contract
    # for an empty corpus instead of failing.
    classification = np.empty((0, model.config.num_labels), dtype=np.float32)


100%|██████████| 4711/4711 [2:23:12<00:00,  1.82s/it]  


In [11]:
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from tqdm import tqdm

# Sanity check: run one obviously positive sentence through the SST-2 model to
# see which logit position corresponds to which sentiment.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("Hello, I love my dog", return_tensors="pt")

# Logit order is (NEGATIVE, POSITIVE); the raw logits are shown as the cell output.
model(**inputs).logits

tensor([[-4.0139,  4.3089]], grad_fn=<AddmmBackward0>)

In [3]:
import pickle

# One-off dump of the classifier output, kept commented out for reference:
# with open('prems_sentim_classification.pickle', 'wb') as f:
#     pickle.dump(classification, f)

# Reload the saved sentiment scores as `labels`.
# NOTE(review): presumably this file holds the `classification` array written
# by the dump above — confirm. pickle.load can execute arbitrary code, so only
# load files produced locally.
with open('prems_sentim_classification.pickle', mode='rb') as labels_file:
    labels = pickle.load(labels_file)


In [12]:
# Load the pickled collection of identity terms that the PMI loop iterates over.
# NOTE(review): the file name suggests terms occurring at least ten times —
# confirm against the notebook that produced it. Only unpickle trusted files.
with open('prem_identity_at_least_ten.pickle', mode='rb') as id_terms_file:
    id_terms = pickle.load(id_terms_file)

In [15]:
from text_tools import classifier_pmi
# Map each identity term to the result of classifier_pmi computed for that
# single term against the corpus and its sentiment labels.
# NOTE(review): the trailing 2 is presumably the number of classes — confirm
# against text_tools.classifier_pmi.
pmi = {
    id_term: classifier_pmi(corpus, [id_term], labels, 2)
    for id_term in id_terms
}

In [19]:

# Flatten the {term: (first, second)} mapping into one row per identity term;
# the two values of each pair become the Negative and Positive PMI columns.
pmi_rows = [
    (identity, negative_pmi, positive_pmi)
    for identity, (negative_pmi, positive_pmi) in pmi.items()
]
pmi_df = pd.DataFrame(pmi_rows, columns=['Identity', 'Negative PMI', 'Positive PMI'])

In [25]:
# Identity terms ranked from highest to lowest negative-class PMI.
(
    pmi_df
    .loc[:, ['Identity', 'Negative PMI']]
    .sort_values('Negative PMI', ascending=False)
)

Unnamed: 0,Identity,Negative PMI
55,israeli,0.59882
50,handicapped,0.48157
29,german,0.423665
4,black,0.336347
38,muslim,0.312579
44,polish,0.309819
26,caucasian,0.304769
27,teenager,0.302824
15,elderly,0.288499
35,british,0.218224
