In [32]:
import pandas as pd
import numpy as np
import pylangacq
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
from bertviz import model_view, head_view

In [33]:
train_df_cc = pd.read_csv('datasets/addresso/train/cc_meta_data.csv')
train_df_cd = pd.read_csv('datasets/addresso/train/cd_meta_data.csv')
train_df_cc['path'] = 'datasets/addresso/train/transcription/cc/' + train_df_cc['ID'] + '.cha'
train_df_cd['path'] = 'datasets/addresso/train/transcription/cd/' + train_df_cd['ID'] + '.cha'
train_df_cc['class'] = 0
train_df_cd['class'] = 1

train_df = pd.concat([train_df_cc, train_df_cd], ignore_index=True)

train_df.head()

Unnamed: 0,ID,age,gender,mmse,path,class
0,S001,74,male,,datasets/addresso/train/transcription/cc/S001.cha,0
1,S002,62,female,30.0,datasets/addresso/train/transcription/cc/S002.cha,0
2,S003,69,female,29.0,datasets/addresso/train/transcription/cc/S003.cha,0
3,S004,71,female,30.0,datasets/addresso/train/transcription/cc/S004.cha,0
4,S005,74,female,30.0,datasets/addresso/train/transcription/cc/S005.cha,0


In [34]:
def path_to_text(path):
    chat = pylangacq.read_chat(path)
    utterances = [utterance for utterance in chat.utterances() if utterance.participant == 'PAR']
    tokens = [token for utterance in utterances for token in utterance.tokens]
    words = [token.word for token in tokens]
    
    return ' '.join(words)

In [35]:
train_df['text'] = train_df['path'].apply(path_to_text)

In [None]:
train_df[train_df['ID'] == 'S001']['text']

0    well there's POSTCLITIC a mother standing ther...
Name: text, dtype: object

In [None]:
class AddressoDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df['text'][idx]
        label = self.df['class'][idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, n_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name, output_attentions=True)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        attentions = outputs.attentions
        return self.out(output), attentions

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
  model.train()
  for batch in tqdm(data_loader):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    optimizer.zero_grad()
    outputs, _ = model(input_ids, attention_mask)
    loss = nn.CrossEntropyLoss()(outputs, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()

In [None]:
def evaluate(model, data_loader, device):
  model.eval()
  predictions = []
  true_labels = []
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids, attention_mask)
      _, preds = torch.max(outputs, dim=1)
      predictions.extend(preds)
      true_labels.extend(labels)

  return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions)

In [None]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 512
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

In [None]:
df_train, df_val = train_test_split(train_df, test_size=0.2, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [None]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

train_dataset = AddressoDataset(df_train, tokenizer, max_length)
val_dataset = AddressoDataset(df_val, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
device = 'cpu'

model = BERTClassifier(bert_model_name, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
for epoch in range(num_epochs):
  train(model, train_loader, optimizer, scheduler, device)
  acc, report = evaluate(model, val_loader, device)
  print(f'Epoch: {epoch + 1}')
  print(f'Accuracy: {acc}')
  print(report)

  0%|          | 0/6 [00:00<?, ?it/s]

100%|██████████| 6/6 [03:29<00:00, 34.89s/it]


TypeError: sequence item 0: expected str instance, Tensor found