In [None]:
# Downloading modules 
!pip install simpletransformers  # install simpletransformers into the Colab environment

# Importing modules
import pandas as pd
import simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

import sklearn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, KFold

import torch
from torch.nn.modules.activation import Threshold

import os
import numpy as np
import wandb
import logging


In [None]:
# Attach Google Drive at /content/drive so files stored there can be read and written
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Read the text/label dataframe from the mounted Drive folder
data = pd.read_csv(
    '/content/drive/MyDrive/NLP hva så/ASD_classification/data/dataframes/data_eigstig_text_label.csv'
)

In [None]:
# Single source of truth for the wandb project used by both the sweep and the model args
WANDB_PROJECT = 'Parameter Sweep5'

# Define hyperparameter sweep-values (random search).
# The parameter keys must be ClassificationArgs attribute names: simpletransformers
# copies every sampled value onto the model args under the same key, so keys such
# as 'batch_size', 'epochs' or 'lr' would be stored but never read by the trainer.
sweep_configuration = {
    'method': 'random',
    'metric': {'name': 'train_loss', 'goal': 'minimize'},
    'parameters':
    {
        'train_batch_size': {'values': [8, 16, 32, 64]},
        'num_train_epochs': {'values': [5, 20, 50, 100]},
        'learning_rate': {'max': 0.1, 'min': 0.0001}
    }
}

# refer sweep to wandb project
sweep_id = wandb.sweep(sweep=sweep_configuration, project=WANDB_PROJECT)

# split data into train and test
# Fixed random_state (same value as the KFold seed in training_model) so the
# held-out test set is identical across kernel restarts and sweep sessions.
train, test = train_test_split(data, test_size=0.1, random_state=43)

# train data to use for training and validation (splitting later on)
train_data = train
eval_data = test

# Define model arguments
model_args = ClassificationArgs()
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True
model_args.output_dir = '/content/drive/MyDrive/NLP hva så/ASD_classification/out/cool_model'
#model_args.manual_seed = 4  # comment out if cross-validating
model_args.use_multiprocessing = True
model_args.train_batch_size = 16  # default only; overridden by the sweep's train_batch_size
model_args.eval_batch_size = 8
model_args.save_best_model = True
model_args.wandb_project = WANDB_PROJECT

def training_model():
    """Run one sweep trial: 5-fold cross-validation of BERT on `train_data`.

    Takes no arguments so it can be handed to `wandb.agent`. For every fold a
    wandb run is initialised, a fresh ClassificationModel is built with the
    hyperparameters in `wandb.config`, trained on the fold's training rows and
    evaluated on its validation rows.

    Side effects:
        Publishes the objects of the most recently evaluated fold as the
        notebook-level names `model`, `result`, `model_outputs` and
        `wrong_predictions`, because later cells read them.

    Returns:
        list: validation accuracy (`result['acc']`) of each fold, in fold order.
    """
    # Later cells (saving, inspecting, predicting) use these names at notebook
    # level; without the global declaration they would be locals and lost.
    global model, result, model_outputs, wrong_predictions

    # set k_fold-specifics
    n = 5
    seed = 43
    kf = KFold(n_splits=n, random_state=seed, shuffle=True)

    # Defining Model using k-folds
    results = []
    for train_index, val_index in kf.split(train_data):
        # Initialize a new wandb run
        # NOTE(review): every fold re-initialises wandb inside one agent trial —
        # confirm the folds are reported to the sweep the way you expect.
        wandb.init()
        try:
            # splitting Dataframe (dataset not included)
            train_df = train_data.iloc[train_index]
            val_df = train_data.iloc[val_index]
            # Defining Model; fall back to CPU when no GPU is attached instead of
            # failing on a hard-coded use_cuda=True
            model = ClassificationModel('bert', 'bert-base-uncased',
                                        use_cuda=torch.cuda.is_available(),
                                        args=model_args, num_labels=2, weight=[0.4, 0.6],
                                        sweep_config=wandb.config)
            # train the model
            model.train_model(train_df, eval_df=val_df)
            # validate the model; `acc=accuracy_score` adds an 'acc' entry to result
            result, model_outputs, wrong_predictions = model.eval_model(val_df, acc=accuracy_score)
            print(result['acc'])
            # append model score
            results.append(result['acc'])
        finally:
            # sync wandb; wandb.finish() replaces the deprecated wandb.join()
            # and runs even if the fold raised, so no run is left open
            wandb.finish()

    return results

# Launch the sweep agent; it calls training_model once per sampled configuration.
# A 'random' sweep has no natural end, so cap the number of trials explicitly —
# otherwise this cell never returns and the cells below it never run.
SWEEP_TRIAL_COUNT = 10  # tune to the available compute budget
wandb.agent(sweep_id, function=training_model, count=SWEEP_TRIAL_COUNT)

In [None]:
# function that saves the model
def saveModel(model, path=None):
    """Save a model object to disk with `torch.save`.

    Args:
        model: object to be saved; it is handed to `torch.save` unchanged.
        path (str, optional): destination file. When omitted, the file
            "BERT.pth" inside the local "ASD_classification" exam folder is used.

    Returns:
        str: the path the model was written to.
    """
    if path is None:
        # The file name must be its own path component. The earlier
        # `"ASD_classification", + "BERT" + ".pth"` applied unary plus to a
        # string and raised TypeError before anything was saved.
        path = os.path.join("/Users", "kristian", "Documents", "Skole", "7. semester",
                            "NLP", "Exam", "ASD_classification", "BERT" + ".pth")

    # torch.save does not create missing directories
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    torch.save(model, path)
    return path

# Persist the current `model` object by hand
saveModel(model=model)

In [None]:
# Show what eval_model returned: the result, the model outputs and the wrong predictions
(result, model_outputs, wrong_predictions)

In [None]:
# Smoke test: have the model predict on a single ad-hoc sentence
model.predict(
    ["I like ice cream"]
)
