In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from transformers import AutoModel, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import torch
from torch import nn
from datasets import Dataset, Features, ClassLabel


: 

In [None]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"device: {device}")

if device.type == 'cuda':
    # Report the card name plus allocated / reserved memory for GPU 0, in GiB.
    bytes_per_gib = 1024**3
    allocated_gib = round(torch.cuda.memory_allocated(0) / bytes_per_gib, 1)
    reserved_gib = round(torch.cuda.memory_reserved(0) / bytes_per_gib, 1)
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', allocated_gib, 'GB')
    print('Cached:   ', reserved_gib, 'GB')

: 

In [None]:
# Notebook-wide configuration.
data_dir = Path('data')        # root folder holding one sub-folder per corpus
text_col_name = "text"         # column name used for the text field
author_col_name = "author"     # column name used for the author identifier
max_len = 512                  # maximum sequence length handed to the tokenizer

: 

# Datasets

## Essays

In [None]:
essays = pd.read_csv(data_dir / 'essays/essays.csv', encoding='ISO-8859-1')

: 

In [None]:
# Recoding of the essays corpus' 'y'/'n' trait flags into 1/0.
essays_map = dict(y=1, n=0)

: 

In [None]:
essays.head()

: 

In [None]:
# For every trait, turn the 'c'-prefixed y/n column into a 1/0 column named
# after the trait itself, then discard the original flag columns.
for trait in ('EXT', 'NEU', 'AGR', 'CON', 'OPN'):
    essays[trait] = essays[f'c{trait}'].map(essays_map)

essays = essays.drop(columns=['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'])

: 

In [None]:
essays.head()

: 

## Kaggle MBTI

In [None]:
kaggle_mbti = pd.read_csv(data_dir / Path('kaggle-mbti/mbti_1.csv'))

: 

In [None]:
kaggle_mbti.head()

: 

In [None]:
# One entry per MBTI letter: the first letter of each pair maps to 1, its
# counterpart to 0 (E/I, S/N, T/F, J/P).
kaggle_mbti_map = {
  'E': 1, 'I': 0,
  'S': 1, 'N': 0,
  'T': 1, 'F': 0,
  'J': 1, 'P': 0,
}

: 

In [None]:
# Characters 0-3 of `type` each become their own column, encoded as 1/0
# through kaggle_mbti_map.
kaggle_axis_columns = ['mbtiEXT', 'mbtiSEN', 'mbtiTHI', 'mbtiJUD']
for position, axis_col in enumerate(kaggle_axis_columns):
    kaggle_mbti[axis_col] = kaggle_mbti['type'].str[position].map(kaggle_mbti_map)

: 

In [None]:
kaggle_mbti = kaggle_mbti.drop('type', axis='columns')

: 

In [None]:
kaggle_mbti.head()

: 

In [None]:
# Split each user's concatenated posts on the literal '|||' separator.
# The pattern is a regex, so the pipes are escaped; the raw string keeps the
# backslashes from being read as (invalid) Python string escapes.
kaggle_mbti['posts'] = kaggle_mbti['posts'].str.split(r'\|\|\|')

: 

In [None]:
kaggle_mbti = kaggle_mbti.explode('posts').reset_index()

: 

In [None]:
# Rename the post column and the column produced by reset_index().
kaggle_mbti = kaggle_mbti.rename(columns={'posts': 'texts', 'index': '#AUTHID'})

: 

In [None]:
kaggle_mbti.head(10)

: 

## MyPers

In [None]:
mypers = pd.read_csv(data_dir / Path('mypers/mypersonality_final.csv'), encoding='ISO-8859-1')

: 

In [None]:
mypers.head()

: 

## Twitter MBTI

In [None]:
tw_mbti = pd.read_csv(data_dir / Path('tw-mbti/twitter_MBTI.csv'))

: 

In [None]:
tw_mbti.head()

: 

In [None]:
tw_mbti_map = {
  'e': 1,
  'i': 0,
  's': 1,
  'n': 0,
  't': 1,
  'f': 0,
  'j': 1,
  'p': 0
}

: 

In [None]:
# Characters 0-3 of `label` each become their own column, encoded as 1/0
# through tw_mbti_map.
tw_axis_columns = ['mbtiEXT', 'mbtiSEN', 'mbtiTHI', 'mbtiJUD']
for position, axis_col in enumerate(tw_axis_columns):
    tw_mbti[axis_col] = tw_mbti['label'].str[position].map(tw_mbti_map)

: 

In [None]:
tw_mbti = tw_mbti.drop('label', axis='columns')

: 

In [None]:
tw_mbti = tw_mbti.rename({'Unnamed: 0': 'AuthorID'}, axis='columns')

: 

In [None]:
# Split each user's concatenated text on the literal '|||' separator.
# The pattern is a regex, so the pipes are escaped; the raw string keeps the
# backslashes from being read as (invalid) Python string escapes.
tw_mbti['text'] = tw_mbti['text'].str.split(r'\|\|\|')
tw_mbti.head()

: 

In [None]:
tw_mbti = tw_mbti.explode('text').reset_index(drop=True)

: 

## PANDORA

In [None]:
pandora_authors = pd.read_csv(data_dir / Path('pandora/author_profiles.csv'))
pandora_comments = pd.read_csv(data_dir / Path('pandora/all_comments_since_2015.csv'))

: 

In [None]:
pandora_authors.columns

: 

In [None]:
pandora_authors.head()

: 

In [None]:
pandora_comments.head()

: 

In [None]:
# Keep only the author key and the personality columns used in this notebook.
pandora_author_columns = [
    'author',
    'introverted', 'intuitive', 'thinking', 'perceiving',
    'agreeableness', 'openness', 'conscientiousness', 'extraversion', 'neuroticism',
]
pandora_authors = pandora_authors[pandora_author_columns]

: 

In [None]:
pandora_authors.head()

: 

In [None]:
pandora_comments = pandora_comments[['author', 'body']]

: 

In [None]:
pandora_authors.dtypes

: 

In [None]:
# Attach each author's profile columns to their comments (default inner join on `author`).
pandora = pandora_authors.merge(pandora_comments, on='author')

: 

In [None]:
pandora

: 

## Consolidation

: 

# ML

## Classification Head

In [None]:
class PersModel(nn.Module):
  """Pretrained encoder followed by a shared hidden layer and one sigmoid head per trait.

  Args:
    base_model_name: name or path passed to `AutoModel.from_pretrained`
      (and to `AutoConfig.from_pretrained` when `config` is not given).
    config: optional ready-made config stored on `self.config`; when omitted,
      one is loaded for `base_model_name` with attentions and hidden states
      enabled. The config is only stored, it is not passed to the base model.
    input_size: width of the base model's token representations.
    hidden_size: width of the shared hidden layer.
    classifier_size: number of binary classification heads.
    regressor_size: number of regression heads. They are created but not yet
      part of the forward output.
  """

  def __init__(self, 
               base_model_name=None, 
               config=None, 
               input_size=768, 
               hidden_size=1024, 
               classifier_size=4, 
               regressor_size=0
               ):
    super().__init__()
    # Identity test for "no config supplied" (avoids going through the config's __eq__).
    self.config = AutoConfig.from_pretrained(
      base_model_name, 
      output_attentions=True, 
      output_hidden_states=True
      ) if config is None else config
    self.base_model = AutoModel.from_pretrained(base_model_name)
    self.dropout1 = nn.Dropout(0.1)
    self.hidden = nn.Linear(input_size, hidden_size)
    self.dropout2 = nn.Dropout(0.1)
    # One single-output linear layer per trait.
    self.classifiers = nn.ModuleList([nn.Linear(hidden_size, 1) for _ in range(classifier_size)])
    self.regressors = nn.ModuleList([nn.Linear(hidden_size, 1) for _ in range(regressor_size)])
    self.classifier_act = nn.Sigmoid()
    self.regressor_act = nn.ReLU()

  @staticmethod
  def _mean_pool(token_states, attention_mask=None):
    """Average token vectors over the sequence axis (dim 1), skipping masked-out positions.

    Without a mask this is a plain mean over dim 1. With a mask, positions
    whose mask value is 0 (padding) contribute nothing to the average.
    """
    if attention_mask is None:
      return torch.mean(token_states, dim=1)
    weights = attention_mask.unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * weights).sum(dim=1)
    # Guard against division by zero for a row that is entirely masked.
    counts = weights.sum(dim=1).clamp(min=1.0)
    return summed / counts

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    """Return the per-trait probabilities for a batch.

    Args:
      input_ids: token ids forwarded to the base model.
      attention_mask: optional mask forwarded to the base model and used to
        exclude padding from the mean pooling.
      labels: accepted but unused here; the loss is computed by the trainer.

    Returns:
      Tensor of sigmoid outputs, the per-head results concatenated along
      dim 1 -- (batch, n_classifiers, 1) assuming the base model's first
      output is (batch, seq_len, input_size).
    """
    x = self.base_model(input_ids=input_ids, attention_mask=attention_mask)[0]
    x = self.dropout1(x)
    x = self.hidden(x)
    x = self.dropout2(x)

    pooled_x = self._mean_pool(x, attention_mask)

    classifications = [self.classifier_act(classifier(pooled_x)).unsqueeze(-1) for classifier in self.classifiers]

    # TODO: the regression heads (self.regressors / self.regressor_act) are not
    # wired into the output yet; when they are, apply them to pooled_x and
    # return both groups (e.g. as a dict with "classification"/"regression").

    return torch.cat(classifications, dim=1)


: 

In [None]:
class PersTrainer(Trainer):
  """Trainer whose loss is binary cross-entropy over the model's per-trait outputs."""

  def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    """Compute the multi-label BCE loss for one batch.

    Args:
      model: module called as `model(**inputs)` once the labels are removed;
        its output is fed to `nn.BCELoss`, which requires values in [0, 1].
      inputs: batch dict; its "labels" entry is popped (the dict is mutated).
      return_outputs: when true, return `(loss, outputs)` instead of `loss`.
      num_items_in_batch: accepted because recent transformers releases pass
        it to `compute_loss`; it is not used by this loss.

    Returns:
      The scalar loss, or `(loss, outputs)` when `return_outputs` is true.
    """
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    loss_func = nn.BCELoss()
    # Drop the trailing singleton axis so the outputs line up with the labels,
    # and cast the labels to float32 as BCELoss expects floating-point targets.
    loss = loss_func(outputs.squeeze(-1), labels.float())
    # TODO: once the model returns regression outputs, blend in an MSE term
    # (e.g. weight * classification_loss + (1 - weight) * regression_loss).
    return (loss, outputs) if return_outputs else loss

: 

## Training

In [None]:
essays.dtypes

: 

In [None]:
# Pack the five binary trait columns into a single list-valued `labels` column
# (order: EXT, NEU, AGR, CON, OPN), then drop the individual columns.
# Selecting the columns and converting once avoids a Python-level call per row.
label_columns = ['EXT', 'NEU', 'AGR', 'CON', 'OPN']
essays['labels'] = pd.Series(essays[label_columns].to_numpy().tolist(), index=essays.index)
essays = essays.drop(columns=label_columns)

: 

In [None]:
model_name = 'FacebookAI/roberta-base'
dataset = Dataset.from_pandas(essays)

: 

In [None]:
dataset.features

: 

In [None]:
def convert_to_tensor(data):
  """Cast the `labels` entry of an example to a float64 tensor.

  Args:
    data: mapping with a `labels` entry holding a sequence of numbers or a
      tensor. The mapping is modified in place.

  Returns:
    The same `data` object, with `labels` replaced by a `torch.float64` tensor.
  """
  # as_tensor accepts lists and tensors alike; torch.tensor() emits a
  # copy-construct UserWarning when it is handed an existing tensor.
  data['labels'] = torch.as_tensor(data['labels'], dtype=torch.float64)
  return data
# Ask the dataset to return `labels` as torch tensors (in-place format change),
# then run convert_to_tensor over every example to store the labels as float64.
dataset.set_format(type='torch', columns=['labels'])
dataset = dataset.map(convert_to_tensor)

: 

In [None]:
dataset = dataset.remove_columns('#AUTHID')

: 

In [None]:
# `model_max_length` is the documented keyword for the tokenizer's length limit;
# `max_len` is only a legacy alias for it.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=max_len)

: 

In [None]:
def preprocess_function(examples):
    """Tokenize the `TEXT` field, truncating and padding every sequence to `max_len` tokens.

    Works for a single example (one string) as well as for a batch (a list of
    strings); returns the tokenizer's output unchanged.
    """
    # Pass the limit explicitly so truncation/padding do not depend on how the
    # tokenizer object happened to be configured.
    return tokenizer(
        examples['TEXT'],
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )

# batched=True hands the tokenizer whole lists of texts instead of one call per row.
tokenized = dataset.map(preprocess_function, batched=True)

: 

In [None]:
tokenized

: 

In [None]:
# 80 / 10 / 10 split: hold out 20% of the data, then cut that hold-out in half
# for validation and test. Fixed seeds make the split reproducible across runs.
train_test_split = tokenized.train_test_split(test_size=0.2, seed=42)
test_val_split = train_test_split['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_test_split['train']
val_dataset = test_val_split['train']
# The test set is the other half of the hold-out; it must not reuse the
# validation half.
test_dataset = test_val_split['test']

: 

In [None]:
# Optimisation and logging settings for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
)

# Five classification heads, no regression heads.
model = PersModel(base_model_name=model_name, classifier_size=5, regressor_size=0)

# Custom trainer that supplies the BCE loss; the validation split is used for evaluation.
trainer = PersTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

: 

In [None]:
torch.cuda.memory_allocated()

: 

In [None]:
trainer.train()

: 

: 