In [1]:
import pandas as pd
import numpy as np
import torch

from sklearn import metrics
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AlbertTokenizer, AlbertModel, BertModel, BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
print(f"device: {device}")

if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

device: cpu


In [73]:
comments = pd.read_csv("data/pandora/all_comments_since_2015.csv", nrows=32_000)
authors = pd.read_csv("data/pandora/author_profiles.csv")

In [74]:
comments.head()

Unnamed: 0,author,author_flair_text,body,downs,created_utc,subreddit_id,link_id,parent_id,score,controversiality,gilded,id,subreddit,ups,word_count,word_count_quoteless,lang
0,MetricExpansion,ENTP,Those stats come from the test. [Echoing the c...,0.0,1474429000.0,t5_2s90r,t3_53plrw,t3_53plrw,6.0,0.0,0.0,d7vkyrf,mbti,6.0,151.0,149,en
1,MetricExpansion,,"That's great to hear! I hope you know that, de...",0.0,1480139000.0,t5_2s90r,t3_5ep948,t1_dafz6ab,1.0,0.0,0.0,dafzzrg,mbti,0.0,319.0,316,en
2,MetricExpansion,[ENTP-5 M 22],I can totally agree on reticence! With respect...,0.0,1455096000.0,t5_2s90r,t3_44q2vf,t1_cztchk3,1.0,0.0,0.0,czul5ag,mbti,1.0,145.0,143,en
3,MetricExpansion,<U+1D07><U+0274><U+1D1B><U+1D18> - <U+1D1B><U+...,I took it several times. I'm typed as TYPE_MEN...,0.0,1462865000.0,t5_2s90r,t3_4ijf4l,t3_4ijf4l,1.0,0.0,0.0,d2zo611,mbti,1.0,41.0,41,en
4,MetricExpansion,<U+1D07><U+0274><U+1D1B><U+1D18> - <U+1D1B><U+...,Gawd it's like we don't even need drugs to be ...,0.0,1460656000.0,t5_2s90r,t3_4eptxr,t1_d22uh4r,1.0,0.0,0.0,d22uu81,mbti,1.0,11.0,11,en


In [75]:
authors.head()

Unnamed: 0,author,mbti,introverted,intuitive,thinking,perceiving,gender,age,enneagram,country,...,enneagram_wing,is_native_english_country,predicted_test,test_name,test_scale,16pers_ta,test_result_type,is_female,is_female_pred,is_female_proba
0,-Afrodisiac-,intp,1.0,1.0,1.0,1.0,m,19.0,,,...,,,,,,,,0.0,1,0.531728
1,-Areopagan-,,,,,,m,,,,...,,,0.0,understand myself,percentiles,,percentiles,0.0,0,0.162758
2,-Automaticity,entp,0.0,1.0,1.0,1.0,,,,,...,,,,,,,,,0,0.013042
3,-Avacyn,entj,0.0,1.0,1.0,0.0,f,,,,...,,,,,,,,1.0,1,0.727445
4,-Avatar-Korra-,intp,1.0,1.0,1.0,1.0,,,,,...,,,,,,,,,0,0.068694


In [76]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [77]:
MAX_LEN = 100
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05

In [78]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.body = dataframe.body
        self.targets = dataframe[['introverted', 'intuitive', 'thinking', 'perceiving']]
        print(self.targets)
        self.max_len = max_len

    def __len__(self):
        return len(self.body)

    def __getitem__(self, index):
        body = str(self.body[index])
        body = " ".join(body.split())

        inputs = self.tokenizer.encode_plus(
            body,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [79]:
authors = authors[authors['author'].notnull()]

In [80]:
authors['I/E'] = authors['mbti'].str[0].apply(lambda x: 1 if x == 'e' else 0)
authors['N/S'] = authors['mbti'].str[1].apply(lambda x: 1 if x == 's' else 0)
authors['T/F'] = authors['mbti'].str[2].apply(lambda x: 1 if x == 'f' else 0)
authors['J/P'] = authors['mbti'].str[3].apply(lambda x: 1 if x == 'p' else 0)

In [81]:
authors = authors[['author', 'introverted', 'intuitive', 'thinking', 'perceiving']]
comments = comments[['author', 'body']]

In [82]:
pandora = pd.merge(authors, comments, on='author')

In [83]:
train_size = 0.8
train_dataset=pandora.sample(frac=train_size, random_state=42)
test_dataset=pandora.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(pandora.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (32000, 6)
TRAIN Dataset: (25600, 6)
TEST Dataset: (6400, 6)
       introverted  intuitive  thinking  perceiving
0              0.0        1.0       1.0         1.0
1              1.0        1.0       1.0         0.0
2              1.0        1.0       1.0         1.0
3              0.0        1.0       1.0         1.0
4              1.0        1.0       0.0         0.0
...            ...        ...       ...         ...
25595          0.0        1.0       1.0         1.0
25596          1.0        1.0       1.0         1.0
25597          1.0        1.0       0.0         0.0
25598          1.0        1.0       1.0         1.0
25599          0.0        1.0       1.0         0.0

[25600 rows x 4 columns]
      introverted  intuitive  thinking  perceiving
0             1.0        1.0       1.0         1.0
1             1.0        1.0       1.0         1.0
2             1.0        1.0       1.0         1.0
3             1.0        1.0       1.0         1.0
4             1.0   

In [84]:
train_params = {
    'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}
test_params = {
    'batch_size': VALID_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
}

training_loader = DataLoader(training_set, **train_params)
test_params = DataLoader(testing_set, **test_params)

In [85]:
class PERSBERT(torch.nn.Module):
  def __init__(self):
    super(PERSBERT, self).__init__()
    self.l1 = BertModel.from_pretrained('bert-base-uncased')
    self.l2 = torch.nn.Dropout(0.3)
    self.l3 = torch.nn.Linear(768, 6)

  def forward(self, ids, mask, token_type_ids):
    _, output_1 = self.l1(ids, attention_mask = mask, token_type_ids=token_type_ids, return_dict=False)
    output_2 = self.l2(output_1)
    output = self.l3(output_2)
    return output

model = PERSBERT()
model.to(device)


PERSBERT(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [86]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [87]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [88]:
def train(epoch):
  model.train()

  for _,data in enumerate(training_loader, 0):
    print(data)
    ids = data['ids'].to(device, dtype=torch.long)
    mask = data['mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)
    print(targets)

    outputs = model(ids, mask, token_type_ids)

    optimizer.zero_grad()
    loss = loss_fn(outputs, targets)
    if _%5000 == 0:
      print(f'Epoch: {epoch}, Loss: {loss.item()}')

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [90]:
len(training_loader)

3200

In [89]:
for epoch in range(EPOCHS):
  train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyError: 5840