In [467]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import re
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.metrics import mean_absolute_error

In [486]:
prefix = '/Users/apple/Downloads/depression/'

elmo = ElmoEmbedder()

train_split_df = pd.read_csv(prefix+'train_split_Depression_AVEC2017 (1).csv')
test_split_df = pd.read_csv(prefix+'dev_split_Depression_AVEC2017.csv')
train_split_num = train_split_df[['Participant_ID']]['Participant_ID'].tolist()
test_split_num = test_split_df[['Participant_ID']]['Participant_ID'].tolist()
train_split_clabel = train_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist()
test_split_clabel = test_split_df[['PHQ8_Score']]['PHQ8_Score'].tolist()

topics = []
with open('/Users/apple/Downloads/depression/queries.txt', 'r') as f:
    for line in f.readlines():
        topics.append(line.strip('\n').strip())
        

def identify_topics(sentence):
    if sentence in topics:
        return True
    return False

def extract_features(number, audio_features, text_features, target, mode, targets):
    transcript = pd.read_csv(prefix+'{0}_P/{0}_TRANSCRIPT.csv'.format(number), sep='\t').fillna('')
    
    wavefile = wave.open(prefix+'{0}_P/{0}_AUDIO.wav'.format(number, 'r'))
    sr = wavefile.getframerate()
    nframes = wavefile.getnframes()
    wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short)
    
    responses = []
    response = ''
    response_flag = False
    start_time = 0
    stop_time = 0
    signal = []

    global counter_train, counter_test

    for t in transcript.itertuples():
        # participant一句话结束
        if getattr(t,'speaker') == 'Ellie':
            if '(' in getattr(t,'value'):
                content = re.findall(re.compile(r'[(](.*?)[)]', re.S), getattr(t,'value'))[0]
            else:
                content = getattr(t,'value').strip()
            content = getattr(t,'value').strip()
            if identify_topics(content):
                response_flag = True
                if len(response) != 0:
                    responses.append(response.strip())
                response = ''
            elif response_flag and len(content.split()) > 4:
                response_flag = False
                if len(response) != 0:
                    responses.append(response)
                response = ''
        elif getattr(t,'speaker') == 'Participant':
            if 'scrubbed_entry' in getattr(t,'value'):
                continue
            elif response_flag:
                response+=' '+getattr(t,'value').split('\n')[0].strip()
            start_time = int(getattr(t,'start_time')*sr)
            stop_time = int(getattr(t,'stop_time')*sr)
            signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float)))
            
#     text features & audio_features
    clip = sr*1*15
    text_embeds = elmo.embed_sentence(responses).mean(0)
    if target >= 10 and mode == 'train':
        times = 3 if counter_train < 48 else 2
        for i in range(times):
            melspec = librosa.feature.melspectrogram(signal[clip*i:clip*(i+1)], n_mels=80,sr=sr)
            audio_features.append(melspec)
            text_features.append(text_embeds[i*10:(i+1)*10])
            targets.append(target)
            counter_train+=1
    else:
        melspec = librosa.feature.melspectrogram(signal[:clip], n_mels=80,sr=sr)
        audio_features.append(melspec)
        text_features.append(text_embeds[:10])
        targets.append(target)
        
    print('{}_P feature done'.format(number))
    
    
def extract_features_whole(number, audio_features, text_features, target, targets):
    transcript = pd.read_csv(prefix+'{0}_P/{0}_TRANSCRIPT.csv'.format(number), sep='\t').fillna('')
    
    wavefile = wave.open(prefix+'{0}_P/{0}_AUDIO.wav'.format(number, 'r'))
    sr = wavefile.getframerate()
    nframes = wavefile.getnframes()
    wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short)
    
    responses = []
    response = ''
    response_flag = False
    start_time = 0
    stop_time = 0
    signal = []

    global counter_train, counter_test

    for t in transcript.itertuples():
        # participant一句话结束
        if getattr(t,'speaker') == 'Ellie':
            if '(' in getattr(t,'value'):
                content = re.findall(re.compile(r'[(](.*?)[)]', re.S), getattr(t,'value'))[0]
            else:
                content = getattr(t,'value').strip()
            content = getattr(t,'value').strip()
            if identify_topics(content):
                response_flag = True
                if len(response) != 0:
                    responses.append(response.strip())
                response = ''
            elif response_flag and len(content.split()) > 4:
                response_flag = False
                if len(response) != 0:
                    responses.append(response)
                response = ''
        elif getattr(t,'speaker') == 'Participant':
            if 'scrubbed_entry' in getattr(t,'value'):
                continue
            elif response_flag:
                response+=' '+getattr(t,'value').split('\n')[0].strip()
            start_time = int(getattr(t,'start_time')*sr)
            stop_time = int(getattr(t,'stop_time')*sr)
            signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float)))
            
#     text features & audio_features
    clip = sr*1*15
    text_embedds = elmo.embed_sentence(responses).mean(0)
    if target >= 10:
        times = 3 if counter_train < 48 else 2
        for i in range(times):
            melspec = librosa.feature.melspectrogram(signal[clip*i:clip*(i+1)], n_mels=80,sr=sr)
            audio_features.append(melspec)
            text_features.append(text_embedds[i*10:(i+1)*10])
            targets.append(target)
            counter_train+=1
    else:
        melspec = librosa.feature.melspectrogram(signal[:clip], n_mels=80,sr=sr)
        audio_features.append(melspec)
        text_features.append(text_embedds[:10])
        targets.append(target)
        
    print('{}_P feature done'.format(number))
    
    
counter_train = 0
    
# training set
features_train = []
targets_train = []
audio_features_train = []
text_features_train = []

# test set
features_test = []
ctargets_test = []
audio_features_test = []
text_features_test = []

# # training set
# for index in range(len(train_split_num)):
#     extract_features(train_split_num[index], audio_features_train, text_features_train, \
#                      train_split_clabel[index], 'train', targets_train)

# # test set
# for index in range(len(test_split_num)):
#     extract_features(test_split_num[index], audio_features_test, text_features_test, \
#                      test_split_clabel[index], 'test', ctargets_test)

# # preprocess
# audio_features_train = np.array(audio_features_train).astype('float32')
# audio_features_test = np.array(audio_features_test).astype('float32')
# audio_features_train = np.array([(X - X.min()) / (X.max() - X.min()) for X in audio_features_train])
# audio_features_test = np.array([(X - X.min()) / (X.max() - X.min()) for X in audio_features_test])

# for i in range(len(audio_features_train)):
#     features_train.append([text_features_train[i], audio_features_train[i]])

# for i in range(len(audio_features_test)):
#     features_test.append([text_features_test[i], audio_features_test[i]])

# print(np.shape(features_train), np.shape(features_test))

audio_features_whole = []
text_features_whole = []
features_whole = []
targets_whole = []
whole_split_num = train_split_num + test_split_num
whole_targets = train_split_clabel + test_split_clabel

# for index in range(len(whole_split_num)):
#     extract_features_whole(whole_split_num[index], audio_features_whole, text_features_whole, \
#                            whole_targets[index], targets_whole)
    
# # preprocess
# audio_features_whole = np.array(audio_features_whole).astype('float32')
# audio_features_whole = np.array([(X - X.min()) / (X.max() - X.min()) for X in audio_features_whole])

# for i in range(len(audio_features_whole)):
#     features_whole.append([text_features_whole[i], audio_features_whole[i]])
    
# print(np.shape(features_whole), np.shape(targets_whole))


In [487]:
# print("Saving npz file locally...")

# np.savez(prefix+'data/fuse/train_samples_reg.npz', features_train)
# np.savez(prefix+'data/fuse/test_samples_reg.npz', features_test)
# np.savez(prefix+'data/fuse/train_labels_reg.npz', targets_train)
# np.savez(prefix+'data/fuse/test_labels_reg.npz', ctargets_test)


# np.savez(prefix+'data/fuse/whole_samples_reg.npz', features_whole)
# np.savez(prefix+'data/fuse/whole_labels_reg.npz', targets_whole)

features_train = np.load(prefix+'data/fuse/train_samples_reg.npz', allow_pickle=True)['arr_0']
features_test = np.load(prefix+'data/fuse/test_samples_reg.npz', allow_pickle=True)['arr_0']
targets_train = np.load(prefix+'data/fuse/train_labels_reg.npz', allow_pickle=True)['arr_0']
ctargets_test = np.load(prefix+'data/fuse/test_labels_reg.npz', allow_pickle=True)['arr_0']

X_train = np.array(features_train)
X_test = np.array(features_test)
Y_train = np.array(targets_train)
Y_test = np.array(ctargets_test)

In [501]:
class BiLSTM(nn.Module):
    def __init__(self, config):
        super(BiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']

        self.build_model()


    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # 双层lstm
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # FC层
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes)
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)
    
class CNN(nn.Module):
    def __init__(self, n_classes):
        super(CNN, self).__init__()
        self.conv2d_1 = nn.Conv2d(1, 32, (1,7), 1)
        self.conv2d_2 = nn.Conv2d(32, 32, (1,7), 1)
        self.dense_1 = nn.Linear(29952, 128)
        self.dense_2 = nn.Linear(128, 128)
        self.dense_3 = nn.Linear(128, n_classes)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = F.relu(self.conv2d_1(x))
        x = F.max_pool2d(x, (4, 3), (1, 3))
        x = F.relu(self.conv2d_2(x))
        x = F.max_pool2d(x, (1, 3), (1, 3))
#         flatten in keras
        x = x.permute((0, 2, 3, 1))
        x = x.contiguous().view(-1, 29952)
        x = F.relu(self.dense_1(x))
#         x = F.relu(self.dense_2(x))
        x = self.dense_2(x)
        x = self.dropout(x)
        output = F.softmax(self.dense_3(x))
        return output

In [806]:
def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)
    
def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_true : ndarray - 1D
    y_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
#     y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def plot_roc_curve(y_test, y_score):
    """
    Plots ROC curve for final trained model. Code taken from:
    https://vkolachalama.blogspot.com/2016/05/keras-implementation-of-mlp-neural.html
    """
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.savefig(prefix+'images/BiLSTM_roc.png')
    plt.close()

class fusion_net(nn.Module):
    def __init__(self, embed_size, hidden_dims, rnn_layers, dropout, num_classes, kernel_height):
        super(fusion_net, self).__init__()
        self.embed_size = embed_size
        self.hidden_dims = hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes
        self.kernel_height = kernel_height
        
#         ============================= BiLSTM =================================
        
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )

        # 双层lstm
        self.lstm_net = nn.LSTM(self.embed_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # FC层
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout)
        )
        
#         ============================= BiLSTM =================================

#         ============================= cnn =============================

        self.conv2d_1 = nn.Conv2d(1, 32, (1,7), 1)
        self.conv2d_2 = nn.Conv2d(32, 32, (1,7), 2)
        self.dense_1 = nn.Linear(29952, 128)
        self.dense_2 = nn.Linear(128, 128)
        self.dropout = nn.Dropout(self.dropout)
        
#         ============================= cnn =============================

#         ============================= last fc layer =============================
        self.fc_final = nn.Sequential(
            nn.Linear(256, self.num_classes),
            nn.ReLU(),
#             nn.Softmax(),
#             nn.Sigmoid()
        )
        
    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result
    
    def pretrained_feature(self, x):
        with torch.no_grad():
            x_text = []
            x_audio = []
            for ele in x:
                x_text.append(ele[0])
                x_audio.append(ele[1])
            x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False),\
                                Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False)
    #         ============================= BiLSTM =================================
            # x : [len_seq, batch_size, embedding_dim]
            x_text = x_text.permute(1, 0, 2)
            output, (final_hidden_state, final_cell_state) = self.lstm_net(x_text)
            # output : [batch_size, len_seq, n_hidden * 2]
            output = output.permute(1, 0, 2)
            # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
            final_hidden_state = final_hidden_state.permute(1, 0, 2)
            # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
            # atten_out = self.attention_net(output, final_hidden_state)
            atten_out = self.attention_net_with_w(output, final_hidden_state)
            text_feature = self.fc_out(atten_out)

    #         ============================= BiLSTM =================================

    #         ============================= cnn =============================
            x_audio = x_audio.unsqueeze(1)
            x_audio = F.relu(self.conv2d_1(x_audio))
            x_audio = F.max_pool2d(x_audio, (4, 3), (1, 3))
            x_audio = F.relu(self.conv2d_2(x_audio))
            x_audio = F.max_pool2d(x_audio, (1, 3), (1, 3))
    #         flatten in keras
            x_audio = x_audio.permute((0, 2, 3, 1))
            x_audio = x_audio.contiguous().view(-1, 29952)
            x_audio = F.relu(self.dense_1(x_audio))
#             x_audio = F.relu(self.dense_2(x_audio))
            x_audio = self.dense_2(x_audio)
            audio_feature = self.dropout(x_audio)
#         ============================= cnn =============================
        return (text_feature, audio_feature)
        
    def forward(self, x): 
#         tf, af = self.pretrained_feature(x)
#         y = tf+af
        output = self.fc_final(x)
        return output
    
class MyLoss(nn.Module):
    def __init__(self):
        super(MyLoss, self).__init__()
        
    def forward(self, text_feature, audio_feature, target, model):
        weight = model.fc_final[0].weight
        bias = model.fc_final[0].bias
        pred_text = F.linear(text_feature, weight[:, :config['hidden_dims']], bias).flatten()
        pred_audio = F.linear(audio_feature, weight[:, config['hidden_dims']:], bias).flatten()
        l = nn.L1Loss()
        target = torch.tensor(target).type(torch.FloatTensor)
#         return l(pred_text, target) + l(pred_audio, target) + \
#                 config['lambda']*torch.norm(weight[:, :config['hidden_dims']]) + \
#                 config['lambda']*torch.norm(weight[:, config['hidden_dims']:])  
        return l(pred_text, target) + l(pred_audio, target)
    
                                                   

In [807]:
config = {
    'num_classes': 1,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 1024,
    'batch_size': 2,
    'epochs': 100,
    'learning_rate': 6.25e-5,
    'hidden_dims': 128,
    'kernel_height': 35,
    'cuda': False,
    'lambda': 1e-2,
}

model = fusion_net(config['embedding_size'], config['hidden_dims'], \
                   config['rnn_layers'], config['dropout'], config['num_classes'], config['kernel_height'])

# optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
optimizer = optim.SGD(model.parameters(), lr=config['learning_rate'])
criterion = MyLoss()

def train(epoch, X_train, Y_train):
    global max_train_acc, train_acc
    model.train()
    batch_idx = 1
    total_loss = 0
    correct = 0
    pred = np.array([])
    for i in range(0, X_train.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_train.shape[0]:
            x, y = X_train[i:], Y_train[i:]
        else:
            x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        # 将模型的参数梯度设置为0
        optimizer.zero_grad()
        text_feature, audio_feature = model.pretrained_feature(x)
        concat_x = torch.cat((text_feature, audio_feature), dim=1)
#         concat_x = text_feature + audio_feature
        output = model(concat_x)
        pred = np.hstack((pred, output.data.flatten().numpy()))
        loss = criterion(text_feature, audio_feature, y, model)
#         loss = nn.L1Loss()(output, torch.tensor(y).type(torch.FloatTensor).view_as(output))
        # 后向传播调整参数
        loss.backward()
        # 根据梯度更新网络参数
        optimizer.step()
        batch_idx += 1
        # loss.item()能够得到张量中的元素值
        total_loss += loss.item()
#     max_train_acc = correct
    train_acc = correct
    print('Train Epoch: {:2d}\t Learning rate: {:.4f}\t Loss: {:.6f}\t MAE: {:.2f}\n '.format(
                epoch, config['learning_rate'], total_loss/batch_idx, mean_absolute_error(pred, Y_train)))


def evaluate(model_name, X_test, Y_test, model):
    model.eval()
    batch_idx = 1
    total_loss = 0
    pred = np.array([])
    global fold_idx, min_mae, maes
    for i in range(0, X_test.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_test.shape[0]:
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            concat_x = torch.cat((text_feature, audio_feature), dim=1)
#             concat_x = text_feature + audio_feature
            output = model(concat_x)
        loss = criterion(text_feature, audio_feature, y, model)
#         loss = nn.L1Loss()(output, torch.tensor(y).type(torch.FloatTensor).view_as(output))
        pred = np.hstack((pred, output.data.flatten().numpy()))
        total_loss += loss.item()
        batch_idx+=1
        
    mae = mean_absolute_error(pred, Y_test)
    print('\nTest set: Average loss: {:.4f}\t MAE: {:.4f}\n'.format(total_loss/batch_idx/2, mae))
    
#     y_test_pred, conf_matrix = model_performance(Y_test, pred[2:])
    
#     print('\nTest set: Average loss: {:.4f}'.format(total_loss/len(X_test)))
#     # custom evaluation metrics
#     print('Calculating additional test metrics...')
#     accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
#     precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
#     recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     print("Accuracy: {}".format(accuracy))
#     print("Precision: {}".format(precision))
#     print("Recall: {}".format(recall))
#     print("F1-Score: {}\n".format(f1_score))
#     print('='*89)
    
    if min_mae >= mae and mae < 3.75:
        min_mae = mae
        save(model, '{}_{:.2f}'.format(model_name, mae))
        print('*'*64)
        print('model saved: mae: {}'.format(mae))
        print('*'*64)
#     if min_mae > mae:
#         min_mae = mae
#         maes[fold_idx] = mae
    return total_loss

In [808]:
lstm_model = torch.load('/Users/apple/Downloads/depression/BiLSTM_reg_128_3.88.pt')
cnn_model = CNN(1)
cnn_model.load_state_dict(torch.load('/Users/apple/Downloads/depression/cnn_melspec.pt'))

# print("Model's state_dict:")
# for param_tensor in cnn_model.state_dict():
#     print(param_tensor, "\t", cnn_model.state_dict()[param_tensor].size())

merge_ordereddict = lstm_model.state_dict().copy()
merge_ordereddict.update(cnn_model.state_dict())

model.load_state_dict(merge_ordereddict, strict=False)
    
if config['cuda']:
    model = model.cuda()
    
for param in model.parameters():
    param.requires_grad = True

# model.fc_final[0].weight.requires_grad = True
# model.fc_final[0].bias.requires_grad = True


In [809]:
max_f1 = -1
max_acc = -1
max_train_acc = -1
min_mae = 100

for ep in range(1, config['epochs']):
    train(ep, X_train, Y_train)
    tloss = evaluate('fuse_reg', X_test, Y_test, model)

Train Epoch:  1	 Learning rate: 0.0001	 Loss: 15.945302	 MAE: 7.64
 

Test set: Average loss: 6.4131	 MAE: 6.5139

Train Epoch:  2	 Learning rate: 0.0001	 Loss: 15.191943	 MAE: 6.90
 

Test set: Average loss: 6.1714	 MAE: 6.0099

Train Epoch:  3	 Learning rate: 0.0001	 Loss: 14.615025	 MAE: 6.32
 

Test set: Average loss: 5.9428	 MAE: 5.5411

Train Epoch:  4	 Learning rate: 0.0001	 Loss: 13.925727	 MAE: 5.62
 

Test set: Average loss: 5.7181	 MAE: 5.1119

Train Epoch:  5	 Learning rate: 0.0001	 Loss: 13.140352	 MAE: 4.84
 

Test set: Average loss: 5.5050	 MAE: 4.6969

Train Epoch:  6	 Learning rate: 0.0001	 Loss: 12.551642	 MAE: 4.25
 

Test set: Average loss: 5.3090	 MAE: 4.3621

Train Epoch:  7	 Learning rate: 0.0001	 Loss: 11.744102	 MAE: 3.45
 

Test set: Average loss: 5.1247	 MAE: 4.1041

Train Epoch:  8	 Learning rate: 0.0001	 Loss: 11.177486	 MAE: 2.92
 

Test set: Average loss: 4.9535	 MAE: 3.8833

Train Epoch:  9	 Learning rate: 0.0001	 Loss: 10.232799	 MAE: 2.15
 

Test set: 

Train Epoch: 73	 Learning rate: 0.0001	 Loss: 3.396107	 MAE: 7.49
 

Test set: Average loss: 3.8798	 MAE: 7.1970

Train Epoch: 74	 Learning rate: 0.0001	 Loss: 3.219485	 MAE: 7.98
 

Test set: Average loss: 3.8776	 MAE: 7.2263

Train Epoch: 75	 Learning rate: 0.0001	 Loss: 3.329794	 MAE: 7.99
 

Test set: Average loss: 3.8692	 MAE: 7.1703

Train Epoch: 76	 Learning rate: 0.0001	 Loss: 3.569724	 MAE: 7.54
 

Test set: Average loss: 3.8704	 MAE: 7.1932

Train Epoch: 77	 Learning rate: 0.0001	 Loss: 3.310316	 MAE: 7.87
 

Test set: Average loss: 3.8671	 MAE: 7.1683

Train Epoch: 78	 Learning rate: 0.0001	 Loss: 3.559334	 MAE: 7.65
 

Test set: Average loss: 3.8582	 MAE: 7.1251

Train Epoch: 79	 Learning rate: 0.0001	 Loss: 3.133425	 MAE: 7.77
 

Test set: Average loss: 3.8639	 MAE: 7.1441

Train Epoch: 80	 Learning rate: 0.0001	 Loss: 3.426604	 MAE: 8.10
 

Test set: Average loss: 3.8644	 MAE: 7.0967

Train Epoch: 81	 Learning rate: 0.0001	 Loss: 3.408102	 MAE: 7.63
 

Test set: Average l

In [None]:
fuse_model = torch.load('/Users/apple/Downloads/depression/fuse_0.85.pt')
tloss = evaluate('fuse', X_test, Y_test, fuse_model)                        

In [484]:
from sklearn.model_selection import StratifiedKFold

features_whole = np.load(prefix+'data/fuse/whole_samples_reg.npz', allow_pickle=True)['arr_0']
targets_whole = np.load(prefix+'data/fuse/whole_labels_reg.npz', allow_pickle=True)['arr_0']
fold = 4
fold_idx = 0
kfold = StratifiedKFold(n_splits=fold, shuffle=True)
maes = np.zeros(fold)

for train_idx, test_idx in kfold.split(features_whole, targets_whole):
    for ep in range(1, config['epochs']):
        train(ep, features_whole[train_idx], targets_whole[train_idx])
        tloss = evaluate('fuse_reg_cv', features_whole[test_idx], targets_whole[test_idx], model)
    fold_idx += 1
    min_mae = 100



Train Epoch:  1	 Learning rate: 0.0001	 Loss: 17.441852	 MAE: 9.00
 

Test set: Average loss: 16.8825	 MAE: 8.7609

Train Epoch:  2	 Learning rate: 0.0001	 Loss: 17.080365	 MAE: 8.64
 

Test set: Average loss: 16.5166	 MAE: 8.3893

Train Epoch:  3	 Learning rate: 0.0001	 Loss: 16.703621	 MAE: 8.26
 

Test set: Average loss: 16.1613	 MAE: 8.0251

Train Epoch:  4	 Learning rate: 0.0001	 Loss: 16.449851	 MAE: 8.00
 

Test set: Average loss: 15.8069	 MAE: 7.6610

Train Epoch:  5	 Learning rate: 0.0001	 Loss: 16.085894	 MAE: 7.63
 

Test set: Average loss: 15.4582	 MAE: 7.3036

Train Epoch:  6	 Learning rate: 0.0001	 Loss: 15.688275	 MAE: 7.23
 

Test set: Average loss: 15.1002	 MAE: 6.9365

Train Epoch:  7	 Learning rate: 0.0001	 Loss: 15.389376	 MAE: 6.94
 

Test set: Average loss: 14.7430	 MAE: 6.5700

Train Epoch:  8	 Learning rate: 0.0001	 Loss: 15.224189	 MAE: 6.77
 

Test set: Average loss: 14.4024	 MAE: 6.2221

Train Epoch:  9	 Learning rate: 0.0001	 Loss: 14.705560	 MAE: 6.25
 

Te


Test set: Average loss: 4.1445	 MAE: 7.4791

Train Epoch: 14	 Learning rate: 0.0001	 Loss: 4.585653	 MAE: 7.49
 

Test set: Average loss: 4.1520	 MAE: 7.5356

Train Epoch: 15	 Learning rate: 0.0001	 Loss: 4.797094	 MAE: 7.48
 

Test set: Average loss: 4.1522	 MAE: 7.5329

Train Epoch: 16	 Learning rate: 0.0001	 Loss: 4.721240	 MAE: 7.79
 

Test set: Average loss: 4.1545	 MAE: 7.5522

Train Epoch: 17	 Learning rate: 0.0001	 Loss: 4.510887	 MAE: 8.07
 

Test set: Average loss: 4.1524	 MAE: 7.5441

Train Epoch: 18	 Learning rate: 0.0001	 Loss: 4.688483	 MAE: 7.68
 

Test set: Average loss: 4.1569	 MAE: 7.5645

Train Epoch: 19	 Learning rate: 0.0001	 Loss: 4.759270	 MAE: 7.69
 

Test set: Average loss: 4.1582	 MAE: 7.5752

Train Epoch: 20	 Learning rate: 0.0001	 Loss: 4.653477	 MAE: 8.00
 

Test set: Average loss: 4.1597	 MAE: 7.5834

Train Epoch: 21	 Learning rate: 0.0001	 Loss: 4.874827	 MAE: 7.59
 

Test set: Average loss: 4.1604	 MAE: 7.5865

Train Epoch: 22	 Learning rate: 0.0001	 Lo

KeyboardInterrupt: 

In [37]:
print("Accuracy: {}".format(np.mean(accs)))
print("Precision: {}".format(np.mean(precs)))
print("Recall: {}".format(np.mean(recs)))
print("F1-Score: {}\n".format(np.mean(f1s)))
print('='*89)

Accuracy: 0.945
Precision: 0.9585360291882032
Recall: 0.9299999999999999
F1-Score: 0.9427670004171882



In [38]:
accs

array([0.9 , 0.98, 0.92, 0.98])