<a href="https://colab.research.google.com/github/lagodw/RedditBot/blob/master/Reddit_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!pip install transformers datasets

In [None]:
#!/usr/bin/env python3
import pandas as pd
import datetime as dt
import numpy as np

from google.colab import drive
drive.mount('/content/drive')

from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForNextSentencePrediction
from datasets import Dataset

In [None]:
# Build the next-sentence-prediction training frame `combined_text`:
#   label 0 -> comment_text is the real reply to parent_text
#   label 1 -> comment_text was drawn from a different, random row
raw_text = pd.read_csv('/content/drive/MyDrive/reddit_scrape.csv')

# Rows without a parent_text fall back to the submission title.
missing_parent = raw_text.parent_text.isna()
raw_text.loc[missing_parent, 'parent_text'] = raw_text.loc[missing_parent, 'title']

# Prefix the parent with its subreddit so the model sees where the exchange happened.
raw_text['parent_text'] = raw_text.subreddit + '. ' + raw_text.parent_text

raw_text['comment_text'] = raw_text.comment_text.str.lower()
raw_text['parent_text'] = raw_text.parent_text.str.lower()

usable = (
    raw_text.comment_text.notna()
    & raw_text.parent_text.notna()
    & (raw_text.comment_text != '[deleted]')
)
# Re-number the surviving rows 0..n-1 so that positions and merge ids coincide;
# the filtered index has gaps, which previously made random ids miss real rows.
raw_text = raw_text.loc[usable, ['parent_text', 'comment_text']].reset_index(drop=True)

n_rows = raw_text.shape[0]
if n_rows < 2:
    raise ValueError('need at least two usable rows to build mismatched pairs')

row_ids = np.arange(n_rows)
raw_text['merge_id'] = row_ids

fake_data = raw_text[['parent_text']].copy()
# A shift in [1, n_rows) modulo n_rows always lands on a valid row that is
# never the row itself, so every fake pair really is mismatched.
shifts = np.random.randint(1, n_rows, size=n_rows)
fake_data['merge_id'] = (row_ids + shifts) % n_rows

fake_data = pd.merge(fake_data, raw_text[['merge_id', 'comment_text']], how='left', on='merge_id')

raw_text['label'] = 0
fake_data['label'] = 1

# Optional subsampling for quicker experiments:
# raw_text = raw_text.sample(frac = 0.3, replace = False, random_state = 123)
# fake_data = fake_data.sample(frac = 0.3, replace = False, random_state = 123)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
combined_text = pd.concat([raw_text, fake_data]).drop(['merge_id'], axis=1).reset_index()
combined_text['total_length'] = combined_text.parent_text.str.len() + combined_text.comment_text.str.len()
# Drop very long pairs (combined character count of 1000 or more).
combined_text = combined_text[combined_text.total_length < 1000]
# Shuffle so real and fake pairs are interleaved.
combined_text = combined_text.sample(frac=1, random_state=123)

del raw_text
del fake_data

print(combined_text.shape)

In [None]:
# Tokenizer and next-sentence-prediction model are loaded from the same
# pretrained checkpoint.
checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForNextSentencePrediction.from_pretrained(checkpoint)

In [None]:
# Only the sentence pair and its label are handed over to the dataset.
nsp_columns = ['parent_text', 'comment_text', 'label']
raw_datasets = Dataset.from_pandas(combined_text[nsp_columns])

def tokenize_function(examples, max_length = 256):
    """Tokenize parent/comment text as a sentence pair.

    Reads ``examples["parent_text"]`` and ``examples["comment_text"]`` and
    passes them to the module-level ``tokenizer`` as first and second
    sequence. Output is padded to ``max_length`` and truncated to it.

    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["parent_text"],
        examples['comment_text'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )

# Tokenize in batches rather than one row per call: tokenize_function hands
# the two text columns straight to the tokenizer, which also accepts lists,
# so the resulting columns are the same with far fewer Python-level calls.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# The raw strings are no longer needed once the token ids exist.
tokenized_datasets = tokenized_datasets.remove_columns(['parent_text', 'comment_text'])

del raw_datasets

In [None]:
# Fine-tuning configuration: one epoch, a checkpoint written at the end of
# each epoch into 'redditbot_bert'.
training_args = TrainingArguments(
    output_dir='redditbot_bert',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    save_strategy='epoch',
)

# No evaluation dataset is supplied; the Trainer only runs training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()

In [None]:
# Score a single parent/comment pair with the fine-tuned model and print the
# probability that the comment really follows the parent.
import torch
import math  # no longer used in this cell; kept in case other cells rely on it

# Fall back to the CPU when the runtime has no GPU attached.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
model.eval()

i = 0

# Positional lookup: combined_text was filtered and shuffled, so its index
# labels are neither contiguous nor in row order and `series[i]` may raise
# KeyError or return a row other than the i-th one.
row = combined_text.iloc[i]
prompt1 = row['parent_text'].lower()
prompt2 = row['comment_text'].lower()

# Truncate to the same length the training data was tokenized with.
generated = tokenizer(prompt1, prompt2, truncation=True, max_length=256, return_tensors='pt')
generated = generated.to(device)

# Inference only: no gradients needed.
with torch.no_grad():
    sample_outputs = model(**generated)[0]

# Logit 0 corresponds to training label 0 (real reply), logit 1 to label 1 (random).
lyes = sample_outputs[0][0].item()
lno = sample_outputs[0][1].item()

# Softmax over the two logits; numerically safer than exponentiating by hand.
pyes = torch.softmax(sample_outputs[0], dim=-1)[0].item()
print(pyes)

print(row['parent_text'])
print(row['comment_text'])
print(row['label'])

In [None]:
# Sanity check on a pair that is known to be mismatched.
# NOTE(review): `encoding` was never defined in this notebook; it is built
# here from the first row labelled 1 (random comment) — confirm this matches
# the intended example.
import torch

random_pair = combined_text[combined_text.label == 1].iloc[0]
encoding = tokenizer(
    random_pair['parent_text'],
    random_pair['comment_text'],
    truncation=True,
    max_length=256,
    return_tensors='pt',
).to(model.device)  # inputs must live on the same device as the model

with torch.no_grad():
    logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
assert logits[0][0] < logits[0][1] # the next sentence was random