In [1]:
# Locations of the training and test CSV files (file paths, despite the `_dir` suffix).
# Both sit under the same dataset folder, so the shared prefix is spelled once.
_DATASET_ROOT = 'C:/Users/ROG/OneDrive/桌面/FYP/Dataset'
train_dir = _DATASET_ROOT + '/Train_data/train_data_after_washing.csv'
test_dir = _DATASET_ROOT + '/Test_data/test_data_after_washing.csv'

In [2]:
import os
import shutil
import time

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score, cohen_kappa_score, roc_curve, auc, make_scorer, accuracy_score, f1_score 

In [3]:
# Read both CSV splits and keep only the two columns used downstream.
# pd.read_csv already returns a DataFrame, so no extra wrapping is needed.
_RAW_COLUMNS = ["review", "rating"]
data = pd.read_csv(train_dir)[_RAW_COLUMNS]
test_data = pd.read_csv(test_dir)[_RAW_COLUMNS]

# Empty placeholder frames; nothing in this cell fills them.
train_data, val_data = pd.DataFrame(), pd.DataFrame()

In [4]:
# Collapse the 1-10 rating into three classes: 1-4 -> 0, 5-8 -> 1, 9-10 -> 2.
# Ratings without an entry in the mapping become NaN (Series.map behaviour for dicts).
_rating_to_label = {}
for _rating in range(1, 11):
    if _rating <= 4:
        _rating_to_label[_rating] = 0
    elif _rating <= 8:
        _rating_to_label[_rating] = 1
    else:
        _rating_to_label[_rating] = 2

data['labels'] = data['rating'].map(_rating_to_label)

In [5]:
# Same three-class bucketing for the test split: 1-4 -> 0, 5-8 -> 1, 9-10 -> 2.
# Ratings without an entry in the mapping become NaN (Series.map behaviour for dicts).
_test_rating_to_label = {
    **dict.fromkeys(range(1, 5), 0),
    **dict.fromkeys(range(5, 9), 1),
    **dict.fromkeys(range(9, 11), 2),
}
test_data['labels'] = test_data['rating'].map(_test_rating_to_label)

In [6]:
# The raw rating is no longer needed once the class label exists.
_MODEL_COLUMNS = ["review", "labels"]
data = data[_MODEL_COLUMNS]
test_data = test_data[_MODEL_COLUMNS]

# Training split: review texts and labels as NumPy arrays.
# `Y` keeps the labels as a pandas Series, `y` is the same data as an array.
X = data["review"].to_numpy()
Y = data["labels"]
y = Y.to_numpy()

# Test split as NumPy arrays.
test_X, test_y = test_data["review"].to_numpy(), test_data["labels"].to_numpy()

In [7]:
# Load the tokenizer and the sequence-classification model from the same local folder.
_CHECKPOINT_DIR = "C:/Users/ROG/OneDrive/桌面/FYP/Model/Mental/"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT_DIR)
# Three output labels, one per class (0, 1, 2) produced by the rating mapping.
model = AutoModelForSequenceClassification.from_pretrained(_CHECKPOINT_DIR, num_labels=3)

In [9]:
# Write the tokenizer first, then the model, into the same "./Mental/" folder.
for _component in (tokenizer, model):
    _component.save_pretrained("./Mental/")

In [None]:
def graded_precision(y_true, y_pred, weights):
    """Weighted average of the per-class precision for classes 0, 1 and 2.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        weights: Indexable with at least three numbers; ``weights[i]`` scales
            the precision of class ``i``.

    Returns:
        ``sum(weights[i] * precision_i) / (weights[0] + weights[1] + weights[2])``
        for ``i`` in 0, 1, 2.
    """
    # Restricting `labels` to a single class makes the macro average equal to
    # that class's own precision.
    weighted_total = sum(
        weights[label] * precision_score(y_true, y_pred, labels=[label], average='macro')
        for label in range(3)
    )
    return weighted_total / (weights[0] + weights[1] + weights[2])
def graded_recall(y_true, y_pred, weights):
    """Weighted average of the per-class recall for classes 0, 1 and 2.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.
        weights: Indexable with at least three numbers; ``weights[i]`` scales
            the recall of class ``i``.

    Returns:
        ``sum(weights[i] * recall_i) / (weights[0] + weights[1] + weights[2])``
        for ``i`` in 0, 1, 2.
    """
    # One recall value per class; with a single entry in `labels` the macro
    # average is just that class's recall.
    per_class = []
    for label in (0, 1, 2):
        per_class.append(recall_score(y_true, y_pred, labels=[label], average='macro'))

    numerator = weights[0] * per_class[0] + weights[1] * per_class[1] + weights[2] * per_class[2]
    denominator = weights[0] + weights[1] + weights[2]
    return numerator / denominator
def graded_f1(precision, recall):
    """Harmonic mean of a precision value and a recall value.

    Args:
        precision: Scalar precision.
        recall: Scalar recall.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        ``precision + recall`` is zero (the ratio is undefined there; plain
        floats would raise ZeroDivisionError and NumPy floats would give NaN).
    """
    denominator = precision + recall
    if denominator == 0:
        return 0.0
    return 2 * (precision * recall) / denominator

In [None]:
def CM(y_true, y_pre, times):
    """Plot the row-normalized confusion matrix and save it as ``CM.png``.

    Args:
        y_true: Ground-truth class labels.
        y_pre: Predicted class labels.
        times: Run identifier; converted to ``str`` and used as the name of
            the sub-folder the image is written to.
    """
    times = str(times)
    out_dir = 'C:/Users/ROG/OneDrive/桌面/FYP/Model/Mental/' + times

    # Build the matrix from the function's own arguments, not notebook globals.
    con_mat = confusion_matrix(y_true, y_pre)

    # Normalize each row by its total. A class that appears only among the
    # predictions has a zero row total; leave that row at 0 instead of NaN.
    row_totals = con_mat.sum(axis=1, keepdims=True)
    con_mat_norm = np.divide(
        con_mat.astype('float'),
        row_totals,
        out=np.zeros(con_mat.shape, dtype=float),
        where=row_totals != 0,
    )
    con_mat_norm = np.around(con_mat_norm, decimals=2)

    plt.figure(figsize=(8, 8))
    try:
        sns.heatmap(con_mat_norm, annot=True, cmap='Blues')
        plt.ylim(0, 3)
        plt.xlabel('Predicted labels')
        plt.ylabel('True labels')
        # Save the confusion matrix; create the run folder if it is missing.
        os.makedirs(out_dir, exist_ok=True)
        plt.savefig(fname=out_dir + '/CM.png', dpi=300)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close()

In [None]:
def _evaluate(model, loader, criterion, device):
    """Run `model` over `loader` without gradients.

    Returns:
        (average loss per batch, list of true labels, list of predicted labels).
    """
    model.eval() # set model to eval mode
    total_loss = 0.0
    all_labels, all_preds = [], []
    with torch.no_grad(): # no gradient computation
        for input_ids, attention_mask, labels in loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask) # forward pass
            total_loss += criterion(outputs.logits, labels).item()
            all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    # max(..., 1) avoids a ZeroDivisionError on an empty loader.
    return total_loss / max(len(loader), 1), all_labels, all_preds


def _graded_scores(labels, preds, weights):
    """Return (accuracy, graded precision, graded recall, graded F1, Cohen's kappa)."""
    acc = accuracy_score(labels, preds)
    pre = graded_precision(labels, preds, weights)
    rec = graded_recall(labels, preds, weights)
    f1 = graded_f1(pre, rec)
    kappa = cohen_kappa_score(labels, preds)
    return acc, pre, rec, f1, kappa


skf = StratifiedKFold(n_splits=10)
val_acc = []
val_gp = []
val_gr = []
val_f1 = []
val_kp = []
tes_acc = []
tes_gp = []
tes_gr = []
tes_f1 = []
tes_kp = []
train_time = []
times = 0

# Settings shared by every fold.
weights = [2,1,1] # class weights passed to the graded precision / recall
batch_size = 12 # sample batch size
epochs = 150 # sample number of epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available
criterion = nn.CrossEntropyLoss() # loss function

# Snapshot of the weights the model has when this cell starts. Every fold is
# restored to this state so that a fold never validates on samples the model
# already saw while training a previous fold.
# NOTE: run this cell right after loading the pretrained model; re-running it
# on an already fine-tuned `model` would snapshot the fine-tuned weights.
initial_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}

# The test split is the same for every fold, so convert and tokenize it once.
# The notebook-level test_X / test_y arrays are left untouched.
test_texts = np.asarray(test_X).astype(str).tolist()
test_targets = np.asarray(test_y).astype(int).tolist()
inputs = tokenizer(test_texts, padding=True,  max_length=512, truncation=True, return_tensors="pt")
test_dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], torch.tensor(test_targets)) # create dataset
test_loader = DataLoader(test_dataset, batch_size=batch_size) # test data loader

for train_index, val_index in skf.split(X, y):
    # Start this fold from the snapshot taken above.
    model.load_state_dict(initial_state)
    model.to(device) # move model to device

    # Index the NumPy arrays positionally (skf.split yields positions).
    X_train = X[train_index].astype(str).tolist()
    y_train = y[train_index].astype(int).tolist()
    X_val = X[val_index].astype(str).tolist()
    y_val = y[val_index].astype(int).tolist()

    #Create train set and validation set
    inputs = tokenizer(X_train, padding=True, max_length=512, truncation=True, return_tensors="pt") # tokenize texts
    train_dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], torch.tensor(y_train)) # create dataset
    inputs = tokenizer(X_val, padding=True, max_length=512, truncation=True, return_tensors="pt") # tokenize texts
    val_dataset = TensorDataset(inputs["input_ids"], inputs["attention_mask"], torch.tensor(y_val)) # create dataset

    #Create data loader
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # train data loader
    val_loader = DataLoader(val_dataset, batch_size=batch_size) # validation data loader

    # Train model
    optimizer = AdamW(model.parameters(), lr=1e-5) # fresh optimizer state per fold
    t0 = time.time()
    for epoch in range(epochs):
        pbar = tqdm(train_loader, total=len(train_loader))
        print(f"Epoch {epoch+1}")
        model.train() # set model to train mode
        train_loss = 0.0 # initialize train loss
        for batch in pbar:
            input_ids, attention_mask, labels = batch # get batch data
            input_ids = input_ids.to(device) # move input ids to device
            attention_mask = attention_mask.to(device) # move attention mask to device
            labels = labels.to(device) # move labels to device
            optimizer.zero_grad() # zero the gradients
            outputs = model(input_ids, attention_mask=attention_mask) # forward pass
            loss = criterion(outputs.logits, labels) # compute loss
            loss.backward() # backward pass
            optimizer.step() # update parameters
            train_loss += loss.item() # accumulate train loss

        print(f"Train loss: {train_loss/len(train_loader)}") # print average train loss
    t1 = time.time()
    time_train = t1 - t0
    train_time.append(time_train)
    directory = 'C:/Users/ROG/OneDrive/桌面/FYP/Model/Mental/'+ str(times)
    os.makedirs (directory, exist_ok=True)
    torch.save(model.state_dict(), directory + '/model_weights.pth')

    # Validate model: own loss and own label/prediction lists, averaged over
    # the validation loader.
    val_loss, val_labels, val_preds = _evaluate(model, val_loader, criterion, device)
    print(f"Val loss: {val_loss}") # print average validation loss
    acc, pre, rec, f1, kappa = _graded_scores(val_labels, val_preds, weights)
    val_acc.append(acc)
    val_gp.append(pre)
    val_gr.append(rec)
    val_f1.append(f1)
    val_kp.append(kappa)
    print(f"Val accuracy: {acc}")
    print(f"Val precision: {pre}")
    print(f"Val recall: {rec}")
    print(f"Val f1 score: {f1}")
    print(f"Val Kappa: {kappa}")

    # Test model: fresh lists, so validation predictions are not mixed in.
    test_loss, test_labels, test_preds = _evaluate(model, test_loader, criterion, device)
    print(f"Test loss: {test_loss}") # print average test loss
    CM(test_labels, test_preds, times)
    acc, pre, rec, f1, kappa = _graded_scores(test_labels, test_preds, weights)
    tes_acc.append(acc)
    tes_gp.append(pre)
    tes_gr.append(rec)
    tes_f1.append(f1)
    tes_kp.append(kappa)
    print(f"Test accuracy: {acc}")
    print(f"Test precision: {pre}")
    print(f"Test recall: {rec}")
    print(f"Test f1 score: {f1}")
    print(f"Test Kappa: {kappa}")

    times = times + 1

In [None]:
def _format_series(name, values):
    """Return one report line: ``name: v1, v2, ..., vn;`` plus a newline.

    An empty `values` yields ``name: `` plus a newline (no trailing ';').
    The separator is chosen by position, so repeated values are handled
    correctly.
    """
    body = ", ".join(str(value) for value in values)
    if values:
        body += ";"
    return f"{name}: {body}\n"


# Metric lists in the order they are written to the report.
_report = {
    "val_acc": val_acc,
    "val_gp": val_gp,
    "val_gr": val_gr,
    "val_f1": val_f1,
    "val_kp": val_kp,
    "tes_acc": tes_acc,
    "tes_gp": tes_gp,
    "tes_gr": tes_gr,
    "tes_f1": tes_f1,
    "tes_kp": tes_kp,
    "train_time": train_time,
}

# No visible cell defines avg_results, so it is computed here: the mean of
# each list above, in the same order (NaN for a list with no entries).
avg_results = [
    float(np.mean(values)) if len(values) else float("nan")
    for values in _report.values()
]

with open("C:/Users/ROG/OneDrive/桌面/FYP/Model/MentalBERT.txt", "w") as f:
    for _name, _values in _report.items():
        f.write(_format_series(_name, _values))
    f.write(_format_series("average_results", avg_results))