In [8]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel,BertForMaskedLM
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
import torch
from transformers import BertForMaskedLM, BertTokenizer
from tqdm import tqdm


In [9]:
labeled_comments = pd.read_csv('data/pos_tags_dataset.csv')
input_sentences = labeled_comments['cleaned_comments'].tolist()
input_sentences = [sentence for sentence in input_sentences if type(sentence)==str and  len(sentence.split()) > 2]

In [10]:
with open('data/bad_word.txt', 'r') as f:
    not_predict = f.read().split('\n')

In [11]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define a list of words that the model should not predict

# Load a pre-trained BERT model and tokenizer
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = model.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the optimizer and loss function for fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

# Define a list of input sentences and labels
sentences = input_sentences[:100]

labels = []
for sentence in sentences:
    label_sentence = []
    try:
        if len(sentence)>1:
            sentence = sentence.split()
        for word in sentence:
            if word in not_predict:
                label_sentence.append(0)
            else:
                label_sentence.append(1)
        labels.append(label_sentence)
    except:
        print(sentence)


# Convert the input sentences to input features
input_features = []
for sentence in sentences:
    # Tokenize the input sentence
    tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)
    # Create a list of labels for each token in the input sentence
    sentence_labels = []
    for token in tokenized_sentence:
        word = tokenizer.decode([token]).strip()
        if word in not_predict:
            sentence_labels.append(0)  # label 0 for blacklisted words
        else:
            sentence_labels.append(1)  # label 1 for other words
    # Append the input features and labels to the input_features list
    input_features.append((torch.tensor(tokenized_sentence).to(device), torch.tensor(sentence_labels).to(device)))

# Fine-tune the BERT model on the input features and labels
for epoch in range(10):
    running_loss = 0.0
    for input_ids, labels in tqdm(input_features):
        optimizer.zero_grad()
        outputs = model(input_ids.unsqueeze(0), labels=labels.unsqueeze(0))
        loss, prediction_scores = outputs[:2]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print("Epoch {} loss: {:.4f}".format(epoch+1, running_loss/len(input_features)))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 100/100 [00:15<00:00,  6.52it/s]


Epoch 1 loss: 8.4241


100%|██████████| 100/100 [00:15<00:00,  6.42it/s]


Epoch 2 loss: 0.4234


100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Epoch 3 loss: 0.1480


100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


Epoch 4 loss: 0.1060


100%|██████████| 100/100 [00:15<00:00,  6.52it/s]


Epoch 5 loss: 0.0874


100%|██████████| 100/100 [00:15<00:00,  6.48it/s]


Epoch 6 loss: 0.0770


100%|██████████| 100/100 [00:16<00:00,  6.21it/s]


Epoch 7 loss: 0.0706


100%|██████████| 100/100 [00:16<00:00,  6.24it/s]


Epoch 8 loss: 0.0664


100%|██████████| 100/100 [00:16<00:00,  6.20it/s]


Epoch 9 loss: 0.0635


100%|██████████| 100/100 [00:16<00:00,  6.25it/s]

Epoch 10 loss: 0.0614





In [18]:
# Evaluate the fine-tuned model on some example input sentences
test_sentences = [
    "here comes the beginning of another fucking recession",
    "he enjoys a fucking orange juice.",
    "she bought some nipples at the store.",
    "we went to the park to play sucker."
]
for sentence in test_sentences:
    # Tokenize the input sentence
    tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)
    # Create a list of labels for each token in the input sentence
    sentence_labels = []
    # Convert the input sentence to a PyTorch tensor and predict the masked words
    input_ids = torch.tensor(tokenized_sentence).to(device)
    print(input_ids)
    with torch.no_grad():
        outputs = model(input_ids.unsqueeze(0))
        print(outputs)
        prediction_scores = outputs[0]
    # Decode the predicted masked words and print the result
    predicted_tokens = torch.argmax(prediction_scores, dim=-1).squeeze().tolist()
    print(predicted_tokens)
    predicted_words = tokenizer.decode(predicted_tokens).split()
    print("Input sentence:", sentence)
    print("Predicted words:", predicted_words)


tensor([  101,  2182,  3310,  1996,  2927,  1997,  2178,  8239, 19396,   102],
       device='cuda:0')
MaskedLMOutput(loss=None, logits=tensor([[[15.3306, 19.9276,  8.0185,  ...,  7.0004,  4.2338,  3.4028],
         [15.1143, 19.7127,  7.7288,  ...,  6.8165,  4.2824,  3.0219],
         [15.0954, 19.6774,  7.7642,  ...,  6.8457,  4.3665,  3.0776],
         ...,
         [15.2913, 19.8027,  7.8640,  ...,  6.9566,  4.3831,  3.1447],
         [15.4877, 20.0838,  8.1257,  ...,  7.0895,  4.6768,  3.3599],
         [14.0324, 18.7651,  6.8386,  ...,  6.1382,  3.7032,  2.3645]]],
       device='cuda:0'), hidden_states=None, attentions=None)
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Input sentence: here comes the beginning of another fucking recession
Predicted words: ['[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]', '[unused0]']
tensor([  101,  2002, 15646,  1037,  8239,  4589, 10869,  1012,   102],
       device='cuda:0')
MaskedLMOutput(