In [None]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 7.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 36.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 52.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=9df9208021d

In [None]:
import torch
from transformers import BertTokenizer, BertForNextSentencePrediction

# Next Sentence Prediction func

- 사용 가능한 [모델 목록](https://huggingface.co/transformers/pretrained_models.html) 


요청에 따라 모델 이름을 인자로 넘겨받도록 함수를 작성했습니다. 다만 이 함수는 호출할 때마다 tokenizer와 model을 새로 불러오므로(최초 실행 시에는 다운로드까지 수행합니다), 문장 한 쌍만 확인할 때는 괜찮지만 반복문에서 사용할 때는 tokenizer와 model 선언 부분을 함수 밖으로 빼는 것을 권장합니다!

In [None]:
def sen_probability(sentence1, sentence2, model_name = "bert-base-uncased"):
    """Score how likely `sentence2` is to follow `sentence1` using BERT's NSP head.

    Args:
        sentence1: First sentence (the prompt).
        sentence2: Candidate next sentence.
        model_name: Checkpoint name handed to `from_pretrained` for both the
            tokenizer and the model.

    Returns:
        A dict holding both input sentences, a 'prediction' label ("높음" when
        class 0 has the larger probability, otherwise "낮음") and the two class
        probabilities formatted as percentage strings.
    """
    # The tokenizer and model are loaded on every call; hoist these two lines
    # out of the function when scoring many pairs in a loop.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForNextSentencePrediction.from_pretrained(model_name)
    encoding = tokenizer(sentence1, sentence2, return_tensors='pt')

    # Inference only: no `labels` are passed because the loss is never used.
    with torch.no_grad():
        logits = model(**encoding).logits

    # Row 0 is the single encoded pair; column 0 feeds the "next sentence
    # appears" entry and column 1 the "does not appear" entry below.
    relat_prob, neg_relat_prob = logits.softmax(dim=1)[0].tolist()
    # `>=` keeps argmax's tie-breaking (the first index wins on a tie).
    pred = "높음" if relat_prob >= neg_relat_prob else "낮음"

    def _as_percent(probability):
        # Round after scaling so values such as 56.99999999999999 cannot leak
        # into the string (rounding first and multiplying afterwards can).
        return f"{round(probability * 100, 2)}%"

    return {
        'sentence1': sentence1,
        'sentence2' : sentence2,
        'prediction': pred,
        '다음 문장이 출현할 확률': _as_percent(relat_prob),
        '다음 문장이 출현하지 않을 확률': _as_percent(neg_relat_prob),
    }

In [None]:
# Smoke test: score one sentence pair with the large checkpoint.
#
# Other pairs that can be swapped in:
#   "Carl fixed the computer for Margaret. It was because" -> "he really liked her."
#   "Kristen ran into Howard." -> "They talk all the time now."
#   "Jessica compared grades with Neal." -> "She was doing a little better than him."

prompt = "Kristen ran into Howard."
next_sentence = "Howard got hurt."

sen_probability(prompt, next_sentence, "bert-large-uncased")

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'prediction': '높음',
 'sentence1': 'Kristen ran into Howard.',
 'sentence2': 'Howard got hurt.',
 '다음 문장이 출현하지 않을 확률': '0.0%',
 '다음 문장이 출현할 확률': '100.0%'}

# 작업 수행

여기서부터는 지워도 됩니다. 


In [None]:
import pandas as pd

In [None]:
# Load the sentence pairs to score.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location,
# and the file name contains a space) -- adjust when running elsewhere.
pair = pd.read_csv("/content/nsp_random pair_2.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
pair.head()

Unnamed: 0,no,code1,sentence1,code2,sentence2
0,1,y,She's the shit.,new,The shelter smells of bodies and bad habits.
1,2,y,You don't understand.,new,All of us are experts.
2,3,y,Ours was the prolonged kind.,new,We're staying focused on the big picture.
3,4,y,Stranger things have happened.,new,Players and teams think they can break them.
4,5,y,Women's Conference begins 9 a.m.,new,And that's why we brought a lawsuit.


In [None]:
# Count missing values per column before scoring.
pair.isnull().sum()

no           0
code1        0
sentence1    0
code2        0
sentence2    0
dtype: int64

In [None]:
# Pull the two sentence columns out as Series for pairwise scoring.
# `prompt` rebinds the name used for a plain string in the earlier test cell.
prompt=pair['sentence1']
# NOTE: "respones" (sic) is the spelling the scoring loop reads; rename both
# places together if this is ever corrected.
respones=pair['sentence2']

In [None]:
# Display the sentence1 Series (pandas elides the middle rows of a long Series).
prompt

0                        She's the shit.
1                  You don't understand.
2           Ours was the prolonged kind.
3         Stranger things have happened.
4       Women's Conference begins 9 a.m.
                     ...                
995                   Give me the match.
996               But there was silence.
997      But someone's paying the bills.
998                     And it was good.
999    But sometimes their systems fail.
Name: sentence1, Length: 1000, dtype: object

In [None]:
# prompt = pair[0].fillna('missing')
# respones = pair[1].fillna('missing')

In [None]:
def sen_probability(sentence1, sentence2):
    """Return the NSP class-0 ("is next sentence") probability as a percentage.

    Reads the notebook-level `tokenizer` and `model`, which must be defined
    before the first call. When executed, this definition replaces the earlier
    three-argument `sen_probability` in the notebook namespace.

    Args:
        sentence1: First sentence (the prompt).
        sentence2: Candidate next sentence.

    Returns:
        The probability scaled to 0-100 and rounded to 4 decimals, or None when
        the pair could not be scored (the failure is printed, not raised, so a
        batch run keeps going).
    """
    try:
        encoding = tokenizer(sentence1, sentence2, return_tensors='pt')
        # Inference only: no `labels` are passed because the loss is never used.
        with torch.no_grad():
            logits = model(**encoding).logits
        prob = logits.softmax(dim=1)
        return round(prob[0][0].item()*100, 4)
    except Exception as err:
        # Best-effort scoring: report the offending pair and the reason instead
        # of silently dropping it. Catching `Exception` (not a bare `except`)
        # lets KeyboardInterrupt / SystemExit stop a long loop.
        print(f"NSP scoring failed for sentence2={sentence2!r}: {err!r}")
        return None
In [None]:
# Load the tokenizer and the NSP model once at notebook level; the two-argument
# `sen_probability` reads these globals instead of reloading them per call.
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForNextSentencePrediction.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# bert-base-uncased: one score per (sentence1, sentence2) row, in row order.
prob_bert_base = [
    sen_probability(first, second)
    for first, second in zip(prompt, respones)
]

In [None]:
# model_name = "bert-large-uncased"

# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForNextSentencePrediction.from_pretrained(model_name)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# # bert-large-uncased


# prob_bert_large = []
# for p,r in zip(prompt, respones):
#     prob = sen_probability(p,r)
#     prob_bert_large.append(prob)


In [None]:
# Wrap the list of scores in a single-column DataFrame (column label 0) and
# display it. NOTE: this rebinds `prob_bert_base` from a list to a DataFrame.
prob_bert_base = pd.DataFrame(prob_bert_base)
prob_bert_base
# prob_bert_large = pd.DataFrame(prob_bert_large)


# pair.rename(columns={0:'PROMPT',
#                    1:'RESPONSE'
#                    }, inplace=True)

Unnamed: 0,0
0,99.8326
1,99.7998
2,0.2313
3,99.9928
4,0.0028
...,...
995,0.1276
996,0.0008
997,0.0065
998,0.0280


In [None]:
# Attach the bert-base scores to the pair table as a new "probability" column
# (this modifies `pair` in place), then preview the result.
pair["probability"] = prob_bert_base
# pair["bert-large"] = prob_bert_large
pair.head()

Unnamed: 0,no,code1,sentence1,code2,sentence2,probability
0,1,y,She's the shit.,new,The shelter smells of bodies and bad habits.,99.8326
1,2,y,You don't understand.,new,All of us are experts.,99.7998
2,3,y,Ours was the prolonged kind.,new,We're staying focused on the big picture.,0.2313
3,4,y,Stranger things have happened.,new,Players and teams think they can break them.,99.9928
4,5,y,Women's Conference begins 9 a.m.,new,And that's why we brought a lawsuit.,0.0028


In [None]:
# Save the scored pairs under a relative file name.
# NOTE(review): `index` is left at its default, so the row index is written as
# an extra unnamed first column; pass index=False if that is not wanted.
pair.to_csv("nsp_random_pair_prob.csv")