In [1]:
import os
import sys
from time import gmtime, strftime

# Make the parent directory importable so the project packages
# (data, training, models) resolve when running from this folder.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
import torch
import poutyne
from poutyne import Model,Experiment
import transformers

from huggingface_hub import notebook_login

In [3]:
from data.custom_data import filepath_dataframe,nucPaired_fpDataframe
from data.selection import Selection,SelectionSet_1
from data.torchData import DataLoading
from data.transformation import ReduceRes

In [4]:
from training.transformer_pretraining import Wav2VecPreTraining
from models.hf_transformers import Sig2VecConfig, Sig2VecForPreTraining, Sig2VecForSequenceClassificationPT

In [5]:
# =============================================================================
# Experiment settings: every tunable value used by the cells below.
# =============================================================================

# --- reproducibility: seed NumPy and PyTorch (CPU + current CUDA device) ---
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# --- compute device: first GPU when CUDA is available, otherwise CPU ---
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")
device = DEVICE

# --- data location and on-disk format ---
data_dir = '/root/npy_format'
readtype = 'npy'
splitchar = '/'
fpath = '.'

# --- train/val/test split rule ---
data_selection = SelectionSet_1()
dataselection_name = 'SelectionSet1'

# --- data loading ---
transform = ReduceRes()
batch_size = 128
num_workers = 0

# --- model architecture ---
in_channels = 70
embed_size = 768
num_hidden_layers = 10
num_attention_heads = 16
intermediate_size = embed_size * 4
total_time = 1600
num_frame = 25
# frame_len is used as both conv kernel and conv stride in the model config.
frame_len = total_time // num_frame
classifier_proj_size = 64
num_labels = 6
num_conv_pos_embeddings = 128
network_name = 'Sig2Vec'

# --- optimisation and pre-training objective ---
optim = torch.optim.Adam
lr = 0.001
pretrain_epochs = 100
finetune_epochs = 10
apply_spec_augment = True
mask_time_prob = 0.2
mask_time_length = 5
mask_feature_prob = 0
mask_feature_length = 0
num_negatives = 2
num_codevector_groups = 16
num_codevectors_per_group = embed_size // num_codevector_groups
codevector_dim = 128
proj_codevector_dim = 128
contrastive_logits_temperature = 0.1

# --- experiment name: tags every saved artefact ---
comment = 'TestTrainer'
exp_name = f'{network_name}_Pretraining_{dataselection_name}_Comment-{comment}'

# --- derived output locations (computed from the values above) ---
model_dir = os.path.join(fpath, 'saved_model')
model_fname = os.path.join(model_dir, exp_name)
record_dir = os.path.join(fpath, 'record')
record_fname = os.path.join(record_dir, exp_name + '.csv')

print('Experiment Name: ', exp_name)
print('Cuda Availability: ', torch.cuda.is_available())

Experiment Name:  Sig2Vec_Pretraining_SelectionSet1_Comment-TestTrainer
Cuda Availability:  True


In [6]:
# Assemble the model configuration from the values in the settings cell.
# A single convolutional layer with kernel == stride == frame_len is configured,
# so the feature extractor sees non-overlapping windows of frame_len samples.
# NOTE(review): classifier_proj_size and num_conv_pos_embeddings are defined in
# the settings cell but are not forwarded here — confirm whether that is intended.
config = Sig2VecConfig(
    in_channels = in_channels,
    vocab_size = embed_size,
    hidden_size = embed_size,
    num_hidden_layers = num_hidden_layers,
    num_attention_heads = num_attention_heads,
    intermediate_size = intermediate_size,
    hidden_act = 'gelu',
    conv_dim = (embed_size,),
    conv_stride = (frame_len,),
    conv_kernel = (frame_len,),
    # Honour the switch from the settings cell instead of a hard-coded True,
    # so changing apply_spec_augment there actually takes effect.
    apply_spec_augment = apply_spec_augment,
    mask_time_prob = mask_time_prob,
    mask_time_length = mask_time_length,
    mask_feature_prob = mask_feature_prob,
    mask_feature_length = mask_feature_length,
    num_negatives = num_negatives,
    num_codevector_groups = num_codevector_groups,
    num_codevectors_per_group = num_codevectors_per_group,
    codevector_dim = codevector_dim,
    proj_codevector_dim = proj_codevector_dim,
    contrastive_logits_temperature = contrastive_logits_temperature,
)

# Fresh (randomly initialised) network to be pre-trained below.
module = Sig2VecForPreTraining(config)

In [7]:

# Data preparation: index the files under data_dir, then split them with the
# configured selection rule.
df = filepath_dataframe(data_dir, splitchar)
# Optional pairing step, currently disabled: df = nucPaired_fpDataframe(df)
df_train, df_val, df_test = data_selection(df)

# From here on df_train holds the train and validation rows together.
df_train = pd.concat((df_train, df_val))

In [8]:
# Loader factory for pre-training; drop_last=True is requested from DataLoading.
pretrain_loading = DataLoading(
    transform=transform,
    batch_size=batch_size,
    readtype=readtype,
    num_workers=num_workers,
    drop_last=True,
)

# Loader over the merged train + validation frame.
pretrain_loader = pretrain_loading(df_train)

In [9]:
# Pre-training driver: receives the model, the optimizer class, the learning rate
# and the masking / negative-sampling settings from the settings cell.
trainer = Wav2VecPreTraining(module,optim,lr, mask_time_prob, mask_time_length, num_negatives)

In [10]:
# Run pre-training and keep the returned history for the CSV record written later.
# NOTE(review): the recorded run of this cell stopped with
# "AttributeError: 'NoneType' object has no attribute 'backward'", i.e. the
# object .backward() was called on was None — presumably the loss; check
# Wav2VecPreTraining.train.
history = trainer.train(pretrain_loader,epochs=pretrain_epochs,verbose=True,rtn_history=True)

Epoch 1 [2021-12-15 23:23:06.072 pytorch-1-6-gpu-py3-ml-g4dn-xlarge-2a0e2554e33379262f4fea063cf1:256 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-12-15 23:23:06.095 pytorch-1-6-gpu-py3-ml-g4dn-xlarge-2a0e2554e33379262f4fea063cf1:256 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.


AttributeError: 'NoneType' object has no attribute 'backward'

In [None]:
# Make sure both output folders exist before writing: DataFrame.to_csv does not
# create missing parent directories, and trainer.save is project code whose
# behaviour on a missing folder is not visible from this notebook.
os.makedirs(model_dir, exist_ok=True)
os.makedirs(record_dir, exist_ok=True)

# Persist the pre-trained model under the experiment name.
trainer.save(model_fname)

# Write the pre-training history next to the other experiment records.
record_fname = os.path.join(record_dir,f'{exp_name}_pretrain.csv')
pd.DataFrame(history).to_csv(record_fname)

In [None]:
# Drop the pre-training objects so their names no longer hold references
# before fine-tuning starts.
del module
del trainer
del pretrain_loader

In [None]:
##### FINE-TUNING #####

# Loader factories: mini-batches of batch_size for train/val, and a single
# batch the size of the whole test frame for test.
data_loading = DataLoading(
    transform=transform,
    batch_size=batch_size,
    readtype=readtype,
    num_workers=num_workers,
    drop_last=True,
)
test_loading = DataLoading(
    transform=transform,
    batch_size=len(df_test),
    readtype=readtype,
    num_workers=num_workers,
    drop_last=True,
)

# Rename a 'fullpath_x' column to 'fullpath' in every split; frames without
# that column are returned unchanged by rename.
df_train = df_train.rename(columns={'fullpath_x': 'fullpath'})
df_val = df_val.rename(columns={'fullpath_x': 'fullpath'})
df_test = df_test.rename(columns={'fullpath_x': 'fullpath'})

# One loader per split.
train_loader = data_loading(df_train)
val_loader = data_loading(df_val)
test_loader = test_loading(df_test)

In [None]:
from transformers import TrainingArguments, Trainer
from datasets import load_metric
from data.torchData.utils import TransformersDataset, DatasetObject
from models.hf_transformers import Sig2VecForSequenceClassification

In [None]:
# Load the pre-trained checkpoint into the classification model.
model = Sig2VecForSequenceClassificationPT.from_pretrained(
    model_fname,
    num_labels=num_labels,
)

# Wrap it in a poutyne Model: Adam + cross-entropy, accuracy per batch,
# micro and macro F1 per epoch.
mdl = Model(
    model,
    'adam',
    'cross_entropy',
    batch_metrics=['accuracy'],
    epoch_metrics=[poutyne.F1('micro'), poutyne.F1('macro')],
).to(device)

# NOTE(review): the test loader is passed as the validation generator here;
# the val_loader built in the previous cell is not used.
history = mdl.fit_generator(
    train_generator=train_loader,
    valid_generator=test_loader,
    epochs=finetune_epochs,
)

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure the
# record folder exists before writing the fine-tuning history.
os.makedirs(record_dir, exist_ok=True)
record_fname = os.path.join(record_dir,f'{exp_name}_finetuned.csv')
pd.DataFrame(history).to_csv(record_fname)

In [None]:
class TransformersDataset(DatasetObject):
    """DatasetObject variant whose samples are dictionaries.

    Each sample is returned as ``{'input_values': X}``, with an extra
    ``'label'`` entry when the dataset's labels are a NumPy array.

    NOTE(review): this definition rebinds the ``TransformersDataset`` name that
    is also imported from ``data.torchData.utils`` earlier in the notebook.
    """

    def __init__(self, filepaths, label=None, transform=None, readtype='npy'):
        """Forward all arguments unchanged to ``DatasetObject.__init__``."""
        super().__init__(
            filepaths=filepaths,
            label=label,
            transform=transform,
            readtype=readtype,
        )

    def __getitem__(self, idx):
        """Return sample ``idx`` from the parent class, repackaged as a dict."""
        sample = super().__getitem__(idx)

        # Without array labels the parent's item is used as the input as-is.
        if not isinstance(self.label, np.ndarray):
            return {'input_values': sample}

        # With array labels the parent's item unpacks into (input, target).
        features, target = sample
        return {'input_values': features, 'label': target}

    def load_data(self):
        """Always returns ``None``."""
        return None



class create_TransformersDataset(object):
    """Callable factory that turns a dataframe into a ``TransformersDataset``.

    The transform and read type are fixed at construction; each call builds a
    dataset from the dataframe's ``'fullpath'`` and ``'activity'`` columns.
    """

    def __init__(self, transform, readtype='npy'):
        """Store the transform and read type applied to every dataset built."""
        self.transform, self.readtype = transform, readtype

    def __call__(self, df):
        """Build a dataset from ``df['fullpath']`` (paths) and ``df['activity']`` (labels)."""
        paths = df['fullpath'].to_numpy()
        labels = df['activity'].to_numpy()
        return TransformersDataset(
            filepaths=paths,
            label=labels,
            transform=self.transform,
            readtype=self.readtype,
        )

def create_compute_metrics():
    """Build the ``compute_metrics`` callback handed to the Trainer.

    The metrics are loaded once here and reused by the returned closure.

    Returns:
        A function taking an evaluation prediction object (with ``predictions``
        and ``label_ids``) and returning a dict with the keys ``'accuracy'``,
        ``'f1_micro'`` and ``'f1_macro'``.
    """
    # load_metric loads ONE metric per call; its second positional argument is a
    # configuration name, so passing "f1" there never produced an F1 score.
    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    def compute_metrics(eval_pred):
        """Computes accuracy and micro/macro F1 on a batch of predictions"""
        predictions = np.argmax(eval_pred.predictions, axis=1)
        references = eval_pred.label_ids

        scores = accuracy_metric.compute(predictions=predictions, references=references)
        # The F1 metric defaults to binary averaging, so an explicit averaging
        # mode is required for the multi-class labels used in this notebook.
        for average in ("micro", "macro"):
            f1_scores = f1_metric.compute(predictions=predictions,
                                          references=references,
                                          average=average)
            scores[f"f1_{average}"] = f1_scores["f1"]
        return scores
    return compute_metrics

class CustomTrainer(Trainer):
    """Trainer whose loss computation first casts the labels with ``.long()``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the training loss for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Mapping of model inputs. If it contains a non-None
                ``'labels'`` entry, that entry is replaced in place by its
                ``.long()`` version.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted (and ignored) so the override also works
                with Trainer versions that pass this keyword to ``compute_loss``.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # Cast only when labels are present: a label-free batch must not fail
        # with KeyError/AttributeError before the model is even called.
        if "labels" in inputs and inputs["labels"] is not None:
            inputs['labels'] = inputs['labels'].long()

        # With label smoothing enabled the labels are withheld from the model
        # and the smoothed loss is computed from the outputs below.
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            loss = self.label_smoother(outputs, labels)
        else:
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
        return (loss, outputs) if return_outputs else loss

    

In [None]:
# NOTE(review): stray scratch expression — it only evaluates the literal 1 and
# nothing else in the notebook uses it; the cell can be deleted.
1

In [None]:
# Load the pre-trained checkpoint into the classification model.
model = Sig2VecForSequenceClassification.from_pretrained(
    model_fname,
    num_labels=num_labels,
)

# Factory that turns a split dataframe into a dict-style dataset.
dataset_creator = create_TransformersDataset(
    transform=transform,
    readtype=readtype,
)

# Evaluate and log once per epoch; outputs are written to record_dir.
training_args = TrainingArguments(
    output_dir=record_dir,
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=finetune_epochs,
    evaluation_strategy='epoch',
    logging_strategy='epoch',
)

# Train on the training frame and evaluate on the test frame.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_creator(df_train),
    eval_dataset=dataset_creator(df_test),
    compute_metrics=create_compute_metrics(),
)

trainer.train()