In [1]:
import os
import sys
from time import gmtime, strftime

# Put the parent of the current working directory on the import path
# (presumably where the project packages imported below live).
module_path = os.path.abspath('..')
if not (module_path in sys.path):
    sys.path.append(module_path)

In [2]:
import numpy as np
import pandas as pd
import torch
import poutyne
from poutyne import Model,Experiment
import transformers

from huggingface_hub import notebook_login

In [3]:
from data.custom_data import filepath_dataframe,nucPaired_fpDataframe
from data.selection import Selection,SelectionSet_1
from data.torchData import DataLoading

In [4]:
from training.transformer_pretraining import Wav2VecPreTraining
from models.hf_transformers import Sig2VecConfig, Sig2VecForPreTraining, Sig2VecForSequenceClassificationPT

In [20]:
#####################################################################################################################
# Experiment configuration: the tunable values read by the pre-training and
# fine-tuning cells below are all defined here.

# random seed -- NumPy, PyTorch CPU, and every visible CUDA device
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
# manual_seed_all covers all GPUs (plain cuda.manual_seed only seeds the
# current one) and is silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(seed)

# gpu setting
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = DEVICE  # lowercase alias; the fine-tuning cell uses `device`

## data directory
# NOTE(review): absolute path tied to the original training environment.
data_dir = '/root/npy_format'
readtype = 'npy'
splitchar = '/'
fpath = '.'  # base directory for the saved_model/ and record/ outputs

# data selection
data_selection = SelectionSet_1()
dataselection_name = 'SelectionSet1'

# data loading
transform = None
batch_size = 32
num_workers = 0

# training
optim = torch.optim.SGD
lr = 0.0005
mask_prob = 0.2
mask_length = 4
num_negative = 4
pretrain_epochs = 1
finetune_epochs = 1
supervision = None   # not referenced by the cells visible in this notebook
temperature = 0.5    # not referenced by the cells visible in this notebook

# model
in_channels = 70
embed_size = 512
num_hidden_layers = 2
num_attention_heads = 16
intermediate_size = 4 * embed_size
num_codevector_groups = 32
# Floor divisions below would silently truncate on non-divisible settings,
# so fail loudly instead of building a model with unintended sizes.
assert embed_size % num_codevector_groups == 0, \
    'embed_size must be a multiple of num_codevector_groups'
num_codevectors_per_group = embed_size // num_codevector_groups
# NOTE(review): the literal 512 equals embed_size today; confirm whether the
# codevector dimensions are meant to track embed_size.
codevector_dim = 512 // 4
proj_codevector_dim = 512 // 4
total_time = 1600
num_frame = 25
assert total_time % num_frame == 0, \
    'total_time must be a multiple of num_frame'
frame_len = total_time // num_frame
n_classes = 6
network_name = 'Sig2Vec'

# Experiment Name
comment = 'TestTrainer'
exp_name = f'{network_name}_Pretraining_{dataselection_name}_Comment-{comment}'

# auto -- output locations derived from the settings above
model_dir = os.path.join(fpath, 'saved_model')
model_fname = os.path.join(model_dir, f'{exp_name}')
record_dir = os.path.join(fpath, 'record')
record_fname = os.path.join(record_dir, f'{exp_name}.csv')
print('Experiment Name: ', exp_name)
print('Cuda Availability: ', torch.cuda.is_available())

Experiment Name:  Sig2Vec_Pretraining_SelectionSet1_Comment-TestTrainer
Cuda Availability:  False


In [6]:
# Build the Sig2Vec configuration from the values set in the configuration
# cell, then instantiate the pre-training network from it.
config = Sig2VecConfig(
    # input / convolutional front-end: a single conv layer whose kernel and
    # stride both equal frame_len
    in_channels=in_channels,
    conv_dim=(embed_size,),
    conv_kernel=(frame_len,),
    conv_stride=(frame_len,),
    # transformer encoder
    vocab_size=embed_size,
    hidden_size=embed_size,
    num_hidden_layers=num_hidden_layers,
    num_attention_heads=num_attention_heads,
    intermediate_size=intermediate_size,
    hidden_act='gelu',
    # quantizer / codebook
    num_codevector_groups=num_codevector_groups,
    num_codevectors_per_group=num_codevectors_per_group,
    codevector_dim=codevector_dim,
    proj_codevector_dim=proj_codevector_dim,
)

module = Sig2VecForPreTraining(config)

In [7]:

# data preparation
# Build the file-path dataframe from data_dir, then split it with the
# configured selection into train / validation / test frames.
df = filepath_dataframe(data_dir, splitchar)
# df = nucPaired_fpDataframe(df)
df_train, df_val, df_test = data_selection(df)
# Append the validation rows to the training frame.
# NOTE(review): df_train stays merged for every later cell, including the
# fine-tuning loaders -- confirm that is intended.
df_train = pd.concat(objs=[df_train, df_val])

In [8]:
# Loader factory for pre-training, configured from the settings cell;
# calling it on a dataframe yields the loader passed to the trainer.
pretrain_loading = DataLoading(
    batch_size=batch_size,
    num_workers=num_workers,
    readtype=readtype,
    transform=transform,
    drop_last=True,
)

pretrain_loader = pretrain_loading(df_train)

In [9]:
# Wrap the pre-training network with its optimizer and masking settings.
trainer = Wav2VecPreTraining(
    module,
    optim,
    lr,
    mask_prob,
    mask_length,
    num_negative,
)

In [10]:
# Run pre-training and keep the returned history.
history = trainer.train(
    pretrain_loader,
    epochs=pretrain_epochs,
    verbose=True,
    rtn_history=True,
)

Epoch 1 [2021-11-30 22:11:59.428 pytorch-1-6-cpu-py36--ml-t3-medium-4350395e6b439bff0bb751aa914e:497 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-11-30 22:11:59.491 pytorch-1-6-cpu-py36--ml-t3-medium-4350395e6b439bff0bb751aa914e:497 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> loss: 703.4725952148438


In [12]:
# Persist the pre-trained model under model_fname; the fine-tuning section
# reloads it from the same location via from_pretrained(model_fname).
trainer.save(model_fname)

In [14]:
# Drop the pre-training objects from the namespace before fine-tuning.
del module
del trainer
del pretrain_loader

In [15]:
##### FINE-TUNING #####

# data loading
# Training loader factory: dropping the last partial batch keeps every
# training batch at exactly batch_size samples.
data_loading = DataLoading(transform=transform, batch_size=batch_size, readtype=readtype,
                           num_workers=num_workers, drop_last=True)
# Evaluation loader factories: never drop a partial batch, otherwise some
# validation/test samples would be silently excluded from the metrics.
# (Assumes DataLoading forwards drop_last to the underlying loader -- TODO confirm.)
eval_loading = DataLoading(transform=transform, batch_size=batch_size, readtype=readtype,
                           num_workers=num_workers, drop_last=False)
# The test split is served as one batch covering the whole frame.
test_loading = DataLoading(transform=transform, batch_size=len(df_test), readtype=readtype,
                           num_workers=num_workers, drop_last=False)

# Normalise the path column name on all three splits; rename() leaves a frame
# untouched when it has no 'fullpath_x' column.
df_train, df_val, df_test = [
    frame.rename(columns={'fullpath_x': 'fullpath'})
    for frame in (df_train, df_val, df_test)
]

# NOTE(review): df_train was concatenated with df_val in the data-preparation
# cell, so val_loader serves rows that are also in train_loader.
train_loader = data_loading(df_train)
val_loader   = eval_loading(df_val)
test_loader  = test_loading(df_test)

In [17]:
# Rebuild a sequence-classification model from the checkpoint written during
# pre-training, with n_classes output labels.
model = Sig2VecForSequenceClassificationPT.from_pretrained(
    model_fname,
    num_labels=n_classes,
)

Some weights of the model checkpoint at ./saved_model/Sig2Vec_Pretraining_SelectionSet1_Comment-TestTrainer were not used when initializing Sig2VecForSequenceClassificationPT: ['quantizer.codevectors', 'project_q.bias', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_hid.weight', 'project_hid.bias', 'quantizer.weight_proj.bias']
- This IS expected if you are initializing Sig2VecForSequenceClassificationPT from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Sig2VecForSequenceClassificationPT from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Sig2VecForSequenceClassificationPT were not initialized from the model checkpoint at ./saved_model/Sig2Vec_Pretraining_SelectionSet1_Comm

In [18]:
# Fine-tune with poutyne: Adam optimizer, cross-entropy loss, per-batch
# accuracy plus micro/macro F1 computed per epoch.
mdl = Model(
    model,
    'adam',
    'cross_entropy',
    batch_metrics=['accuracy'],
    epoch_metrics=[poutyne.F1('micro'), poutyne.F1('macro')],
).to(device)

# NOTE(review): the test loader is passed as the validation generator, so the
# val_* metrics in the log are test-set metrics; val_loader is not used here.
history = mdl.fit_generator(
    train_generator=train_loader,
    valid_generator=test_loader,
    epochs=finetune_epochs,
)

[35mEpoch: [36m1/1 [35mTrain steps: [36m50 [35mVal steps: [36m1 [32m5m22.27s [35mloss:[94m 2.789990[35m acc:[94m 51.812500[35m fscore_micro:[94m 0.518125[35m fscore_macro:[94m 0.415889[35m val_loss:[94m 3.241449[35m val_acc:[94m 16.666668[35m val_fscore_micro:[94m 0.166667[35m val_fscore_macro:[94m 0.047619[0m


In [21]:
# Write the training history returned by mdl.fit_generator to record_fname.
# record_dir is only a path string at this point, so create it first;
# otherwise to_csv raises when the directory does not exist yet.
os.makedirs(record_dir, exist_ok=True)
pd.DataFrame(history).to_csv(record_fname)