In [1]:
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
raw_df = pd.read_csv('../data/full.csv')

In [4]:
raw_df.columns

Index(['mi_quality', 'transcript_id', 'video_title', 'video_url', 'topic',
       'utterance_id', 'interlocutor', 'timestamp', 'utterance_text',
       'annotator_id', 'therapist_input_exists', 'therapist_input_subtype',
       'reflection_exists', 'reflection_subtype', 'question_exists',
       'question_subtype', 'main_therapist_behaviour', 'client_talk_type'],
      dtype='object')

In [5]:
df = raw_df[raw_df['interlocutor']=='therapist'][['utterance_text', 'main_therapist_behaviour']]
df = df.rename(columns={'utterance_text':'text', 'main_therapist_behaviour':'category'})

In [6]:
df['category'].value_counts()

category
other              2143
question           1954
reflection         1717
therapist_input    1012
Name: count, dtype: int64

In [7]:
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df['encode_cat'] = df['category'].apply(lambda x: encode_cat(x))

In [8]:
df

Unnamed: 0,text,category,encode_cat
0,Thanks for filling it out. We give this form t...,question,0
2,"So, let's see. It looks that you put-- You dri...",therapist_input,1
4,-and you usually have three to four drinks whe...,therapist_input,1
6,Okay. That's at least 12 drinks a week.,therapist_input,1
8,"Okay. Just so you know, my role, um, when we t...",therapist_input,1
...,...,...,...
13542,"So, I would strongly recommend that you have t...",therapist_input,1
13544,"So, you're happy with the date for [unintellig...",question,0
13546,Okay and would you like to come back and see m...,question,0
13548,And what would you like me to do if you don't ...,question,0


In [9]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 12
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [10]:
class Therapist(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.encode_cat[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [11]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Therapist(train_dataset, tokenizer, MAX_LEN)
testing_set = Therapist(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (6826, 3)
TRAIN Dataset: (5461, 3)
TEST Dataset: (1365, 3)


In [12]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [13]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 512)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(512, 4)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [14]:
model = DistillBERTClass()
model.to(device)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [15]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [16]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [17]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%200==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 200 steps: {loss_step}")
            print(f"Training Accuracy per 200 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [18]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 200 steps: 1.3731889724731445
Training Accuracy per 200 steps: 25.0
Training Loss per 200 steps: 1.0685877037878653
Training Accuracy per 200 steps: 53.68988391376451
Training Loss per 200 steps: 1.000870268764044
Training Accuracy per 200 steps: 57.2734829592685
The Total Accuracy for Epoch 0: 58.377586522614905
Training Loss Epoch: 0.9826705276704671
Training Accuracy Epoch: 58.377586522614905


In [19]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps = 0; nb_tr_examples = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask)
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

Validation Loss per 100 steps: 0.9577882885932922
Validation Accuracy per 100 steps: 50.0
Validation Loss per 100 steps: 0.7870044649237453
Validation Accuracy per 100 steps: 69.38943894389439
Validation Loss Epoch: 0.7941289423850545
Validation Accuracy Epoch: 69.23076923076923
Accuracy on test data = 69.23%


In [20]:
import torch
from transformers import DistilBertModel, DistilBertConfig

# Assume your model training process ends with a model instance named `model`
# Save the model's state dictionary
torch.save(model.state_dict(), 'distilbert_finetuned.pth')

"So, let's see. It looks that you put-- You drink alcohol at least four times a week on average-"

In [33]:
# Initialize the custom model
model_for_inference = DistillBERTClass()

# Load the trained state dictionary
model_for_inference.load_state_dict(torch.load('distilbert_finetuned.pth'))

# Set the model to inference mode
model_for_inference.eval()

text = df['text'][6]

inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Move the tensors to the device of the model
inputs = {k: v for k, v in inputs.items()}

# Perform inference
with torch.no_grad():
    outputs = model_for_inference(**inputs)

# Convert logits to probabilities
probabilities = torch.nn.functional.softmax(outputs, dim=1)

# Assuming you want the class with the highest probability
predicted_class_index = probabilities.argmax(dim=1).item()

# Output the result
print(f"Predicted class index: {predicted_class_index}")

Predicted class index: 2
