In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.optim as optim
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
import wave
import librosa
from python_speech_features import *
import re
from allennlp.commands.elmo import ElmoEmbedder

In [2]:
prefix = '/Users/apple/Downloads/depression/'

elmo = ElmoEmbedder()

train_split_df = pd.read_csv(prefix+'train_split_Depression_AVEC2017 (1).csv')
test_split_df = pd.read_csv(prefix+'dev_split_Depression_AVEC2017.csv')
train_split_num = train_split_df[['Participant_ID']]['Participant_ID'].tolist()
test_split_num = test_split_df[['Participant_ID']]['Participant_ID'].tolist()
train_split_clabel = train_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist()
test_split_clabel = test_split_df[['PHQ8_Binary']]['PHQ8_Binary'].tolist()

topics = []
with open('/Users/apple/Downloads/depression/queries.txt', 'r') as f:
    for line in f.readlines():
        topics.append(line.strip('\n').strip())
        

def identify_topics(sentence):
    if sentence in topics:
        return True
    return False

def extract_features(number, features, target, mode, targets):
    transcript = pd.read_csv(prefix+'{0}_P/{0}_TRANSCRIPT.csv'.format(number), sep='\t').fillna('')
    
    wavefile = wave.open(prefix+'{0}_P/{0}_AUDIO.wav'.format(number, 'r'))
    sr = wavefile.getframerate()
    nframes = wavefile.getnframes()
    wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short)
    
    responses = []
    response = ''
    response_flag = False
    start_time = 0
    stop_time = 0
    signal = []

    global counter_train, counter_test

    for t in transcript.itertuples():
        # participant一句话结束
        if getattr(t,'speaker') == 'Ellie':
            if '(' in getattr(t,'value'):
                content = re.findall(re.compile(r'[(](.*?)[)]', re.S), getattr(t,'value'))[0]
            else:
                content = getattr(t,'value').strip()
            content = getattr(t,'value').strip()
            if identify_topics(content):
                response_flag = True
                if len(response) != 0:
                    responses.append(response.strip())
                response = ''
            elif response_flag and len(content.split()) > 4:
                response_flag = False
                if len(response) != 0:
                    responses.append(response)
                response = ''
        elif getattr(t,'speaker') == 'Participant':
            if 'scrubbed_entry' in getattr(t,'value'):
                continue
            elif response_flag:
                response+=' '+getattr(t,'value').split('\n')[0].strip()
            start_time = int(getattr(t,'start_time')*sr)
            stop_time = int(getattr(t,'stop_time')*sr)
            signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float)))
            
#     text features & audio_features
    clip = sr*1*15
    text_features = elmo.embed_sentence(responses).mean(0)
    if target == 1 and mode == 'train':
        times = 3 if counter_train < 51 else 2
        for i in range(times):
#             melspec = base.logfbank(signal[clip*i:clip*(i+1)], samplerate=sr, winlen=0.064, winstep=0.032, nfilt=80, nfft=1024, lowfreq=130, highfreq=6854)
            melspec = librosa.feature.melspectrogram(signal[clip*i:clip*(i+1)], n_mels=80,sr=sr)
            features.append([text_features[i*10:(i+1)*10], melspec])
            targets.append(target)
            counter_train+=1
    else:
#         melspec = base.logfbank(signal[:clip], samplerate=sr, winlen=0.064, winstep=0.032, nfilt=80, nfft=1024, lowfreq=130, highfreq=6854)
        melspec = librosa.feature.melspectrogram(signal[:clip], n_mels=80,sr=sr)
        features.append([text_features[:10], melspec])
        targets.append(target)
        
    print('{}_P feature done'.format(number))
    
    
def extract_features_whole(number, features, target, targets):
    transcript = pd.read_csv(prefix+'{0}_P/{0}_TRANSCRIPT.csv'.format(number), sep='\t').fillna('')
    
    wavefile = wave.open(prefix+'{0}_P/{0}_AUDIO.wav'.format(number, 'r'))
    sr = wavefile.getframerate()
    nframes = wavefile.getnframes()
    wave_data = np.frombuffer(wavefile.readframes(nframes), dtype=np.short)
    
    responses = []
    response = ''
    response_flag = False
    start_time = 0
    stop_time = 0
    signal = []

    global counter_train, counter_test

    for t in transcript.itertuples():
        # participant一句话结束
        if getattr(t,'speaker') == 'Ellie':
            if '(' in getattr(t,'value'):
                content = re.findall(re.compile(r'[(](.*?)[)]', re.S), getattr(t,'value'))[0]
            else:
                content = getattr(t,'value').strip()
            content = getattr(t,'value').strip()
            if identify_topics(content):
                response_flag = True
                if len(response) != 0:
                    responses.append(response.strip())
                response = ''
            elif response_flag and len(content.split()) > 4:
                response_flag = False
                if len(response) != 0:
                    responses.append(response)
                response = ''
        elif getattr(t,'speaker') == 'Participant':
            if 'scrubbed_entry' in getattr(t,'value'):
                continue
            elif response_flag:
                response+=' '+getattr(t,'value').split('\n')[0].strip()
            start_time = int(getattr(t,'start_time')*sr)
            stop_time = int(getattr(t,'stop_time')*sr)
            signal = np.hstack((signal, wave_data[start_time:stop_time].astype(np.float)))
            
#     text features & audio_features
    clip = sr*1*15
    text_features = elmo.embed_sentence(responses).mean(0)
    if target == 1:
        times = 3 if counter_train < 48 else 2
        for i in range(times):
#             melspec = base.logfbank(signal[clip*i:clip*(i+1)], samplerate=sr, winlen=0.064, winstep=0.032, nfilt=80, nfft=1024, lowfreq=130, highfreq=6854)
            melspec = librosa.feature.melspectrogram(signal[clip*i:clip*(i+1)], n_mels=80,sr=sr)
            features.append([text_features[i*10:(i+1)*10], melspec])
            targets.append(target)
            counter_train+=1
    else:
#         melspec = base.logfbank(signal[:clip], samplerate=sr, winlen=0.064, winstep=0.032, nfilt=80, nfft=1024, lowfreq=130, highfreq=6854)
        melspec = librosa.feature.melspectrogram(signal[:clip], n_mels=80,sr=sr)
        features.append([text_features[:10], melspec])
        targets.append(target)
        
    print('{}_P feature done'.format(number))
    
    
counter_train = 0
    
# training set
features_train = []
targets_train = []

# test set
features_test = []
ctargets_test = []

# # training set
# for index in range(len(train_split_num)):
#     extract_features(train_split_num[index], features_train, train_split_clabel[index], 'train', targets_train)
    
# # test set
# for index in range(len(test_split_num)):
#     extract_features(test_split_num[index], features_test, test_split_clabel[index], 'test', ctargets_test)
# print(np.shape(features_train), np.shape(features_test))

features_whole = []
targets_whole = []
whole_split_num = train_split_num + test_split_num
whole_targets = train_split_clabel + test_split_clabel

# for index in range(len(whole_split_num)):
#     extract_features_whole(whole_split_num[index], features_whole, whole_targets[index], targets_whole)
    
# print(np.shape(features_whole), np.shape(targets_whole))



In [3]:
# print("Saving npz file locally...")

# np.savez(prefix+'data/fuse/train_samples.npz', features_train)
# np.savez(prefix+'data/fuse/test_samples.npz', features_test)
# np.savez(prefix+'data/fuse/train_labels.npz', targets_train)
# np.savez(prefix+'data/fuse/test_labels.npz', ctargets_test)

# np.savez(prefix+'data/fuse/whole_samples.npz', features_whole)
# np.savez(prefix+'data/fuse/whole_labels.npz', targets_whole)

features_train = np.load(prefix+'data/fuse/train_samples.npz', allow_pickle=True)['arr_0']
features_test = np.load(prefix+'data/fuse/test_samples.npz', allow_pickle=True)['arr_0']
targets_train = np.load(prefix+'data/fuse/train_labels.npz', allow_pickle=True)['arr_0']
ctargets_test = np.load(prefix+'data/fuse/test_labels.npz', allow_pickle=True)['arr_0']

X_train = np.array(features_train)
X_test = np.array(features_test)
Y_train = np.array(targets_train)
Y_test = np.array(ctargets_test)

In [7]:
class BiLSTM(nn.Module):
    def __init__(self, config):
        super(BiLSTM, self).__init__()
        self.num_classes = config['num_classes']
        self.learning_rate = config['learning_rate']
        self.dropout = config['dropout']
        self.hidden_dims = config['hidden_dims']
        self.rnn_layers = config['rnn_layers']
        self.embedding_size = config['embedding_size']

        self.build_model()


    def build_model(self):
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )
        # self.attention_weights = self.attention_weights.view(self.hidden_dims, 1)

        # 双层lstm
        self.lstm_net = nn.LSTM(self.embedding_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # FC层
        # self.fc_out = nn.Linear(self.hidden_dims, self.num_classes)
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.num_classes)
        )

    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result

    def forward(self, x):
        
        # x : [len_seq, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        output, (final_hidden_state, final_cell_state) = self.lstm_net(x)
        # output : [batch_size, len_seq, n_hidden * 2]
        output = output.permute(1, 0, 2)
        # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
        final_hidden_state = final_hidden_state.permute(1, 0, 2)
        # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
        # atten_out = self.attention_net(output, final_hidden_state)
        atten_out = self.attention_net_with_w(output, final_hidden_state)
        return self.fc_out(atten_out)

class oned_cnn(nn.Module):
    def __init__(self, n_classes, kernel_height):
        super(oned_cnn, self).__init__()
        self.n_classes = n_classes
        self.kernel_height = kernel_height
        self.conv_layer1 = nn.Sequential(
            nn.Conv2d(1, 64, (kernel_height, 3), 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))  
        )
        kernel_height = int(kernel_height/2)
        self.conv_layer2 = nn.Sequential(
            nn.Conv2d(64, 64, (3, 3), 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))  
        )
        kernel_height = int(kernel_height/2)-1
        self.conv_layer3 = nn.Sequential(
            nn.Conv2d(64, 64, (3, 1), 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))  
        )
        kernel_height = int(kernel_height/2)-1
        self.conv_layer4 = nn.Sequential(
            nn.Conv2d(64, 64, (3, 1), 1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1), stride=(2, 1))  
        )
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, self.n_classes),
            nn.ReLU()
        )
        
    def forward(self, x):
        x = self.conv_layer1(x)
        x = self.conv_layer2(x)
        x = self.conv_layer3(x)
        x = self.conv_layer4(x)
        x = self.dropout(x)
        
        '''pool_mean = F.adaptive_avg_pool2d(x, (1, 1))
        pool_max = F.adaptive_max_pool2d(x, (1, 1))
        pool_l2 = torch.norm(x.cpu(), dim=(2,3), keepdim=True).cuda()
        x = torch.cat((pool_mean.squeeze(-1), pool_max.squeeze(-1), pool_l2.squeeze(-1)), dim=-1)'''
        x = F.adaptive_max_pool2d(x, (4, 2))
        return self.fc(x)
    
class CNN(nn.Module):
    def __init__(self, n_classes):
        super(CNN, self).__init__()
        self.conv2d_1 = nn.Conv2d(1, 32, (1,7), 1)
        self.conv2d_2 = nn.Conv2d(32, 32, (1,7), 1)
        self.dense_1 = nn.Linear(120736, 128)
        self.dense_2 = nn.Linear(128, 128)
        self.dense_3 = nn.Linear(128, n_classes)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        x = F.relu(self.conv2d_1(x))
        x = F.max_pool2d(x, (4, 3), (1, 3))
        x = F.relu(self.conv2d_2(x))
        x = F.max_pool2d(x, (1, 3), (1, 3))
#         flatten in keras
        x = x.permute((0, 2, 3, 1))
        x = x.contiguous().view(-1, 120736)
        x = F.relu(self.dense_1(x))
        x = F.relu(self.dense_2(x))
        x = self.dropout(x)
        output = F.softmax(self.dense_3(x))
        return output

In [32]:
def save(model, filename):
    save_filename = '{}.pt'.format(filename)
    torch.save(model, save_filename)
    print('Saved as %s' % save_filename)
    
def standard_confusion_matrix(y_test, y_test_pred):
    """
    Make confusion matrix with format:
                  -----------
                  | TP | FP |
                  -----------
                  | FN | TN |
                  -----------
    Parameters
    ----------
    y_true : ndarray - 1D
    y_pred : ndarray - 1D

    Returns
    -------
    ndarray - 2D
    """
    [[tn, fp], [fn, tp]] = confusion_matrix(y_test, y_test_pred)
    return np.array([[tp, fp], [fn, tn]])

def model_performance(y_test, y_test_pred_proba):
    """
    Evaluation metrics for network performance.
    """
#     y_test_pred = y_test_pred_proba.data.max(1, keepdim=True)[1]
    y_test_pred = y_test_pred_proba

    # Computing confusion matrix for test dataset
    conf_matrix = standard_confusion_matrix(y_test, y_test_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    return y_test_pred, conf_matrix

def plot_roc_curve(y_test, y_score):
    """
    Plots ROC curve for final trained model. Code taken from:
    https://vkolachalama.blogspot.com/2016/05/keras-implementation-of-mlp-neural.html
    """
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.05])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curve')
    plt.legend(loc="lower right")
    plt.savefig(prefix+'images/BiLSTM_roc.png')
    plt.close()

class fusion_net(nn.Module):
    def __init__(self, embed_size, hidden_dims, rnn_layers, dropout, num_classes, kernel_height):
        super(fusion_net, self).__init__()
        self.embed_size = embed_size
        self.hidden_dims = hidden_dims
        self.rnn_layers = rnn_layers
        self.dropout = dropout
        self.num_classes = num_classes
        self.kernel_height = kernel_height
        
#         ============================= BiLSTM =================================
        
        # attention layer
        self.attention_layer = nn.Sequential(
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True)
        )

        # 双层lstm
        self.lstm_net = nn.LSTM(self.embed_size, self.hidden_dims,
                                num_layers=self.rnn_layers, dropout=self.dropout,
                                bidirectional=True)
        # FC层
        self.fc_out = nn.Sequential(
            nn.Dropout(self.dropout),
            nn.Linear(self.hidden_dims, self.hidden_dims),
            nn.ReLU(inplace=True),
            nn.Dropout(self.dropout)
        )
        
#         ============================= BiLSTM =================================

#         ============================= cnn =============================

        self.conv2d_1 = nn.Conv2d(1, 32, (1,7), 1)
        self.conv2d_2 = nn.Conv2d(32, 32, (1,7), 1)
        self.dense_1 = nn.Linear(120736, 128)
        self.dense_2 = nn.Linear(128, 128)
        self.dropout = nn.Dropout(self.dropout)
        
#         ============================= cnn =============================

#         ============================= last fc layer =============================
        self.fc_final = nn.Sequential(
            nn.Linear(128, self.num_classes),
#             nn.ReLU(),
            nn.Softmax(),
#             nn.Sigmoid()
        )
        
    def attention_net_with_w(self, lstm_out, lstm_hidden):
        '''
        :param lstm_out:    [batch_size, len_seq, n_hidden * 2]
        :param lstm_hidden: [batch_size, num_layers * num_directions, n_hidden]
        :return: [batch_size, n_hidden]
        '''
        lstm_tmp_out = torch.chunk(lstm_out, 2, -1)
        # h [batch_size, time_step, hidden_dims]
        h = lstm_tmp_out[0] + lstm_tmp_out[1]
        # [batch_size, num_layers * num_directions, n_hidden]
        lstm_hidden = torch.sum(lstm_hidden, dim=1)
        # [batch_size, 1, n_hidden]
        lstm_hidden = lstm_hidden.unsqueeze(1)
        # atten_w [batch_size, 1, hidden_dims]
        atten_w = self.attention_layer(lstm_hidden)
        # m [batch_size, time_step, hidden_dims]
        m = nn.Tanh()(h)
        # atten_context [batch_size, 1, time_step]
        atten_context = torch.bmm(atten_w, m.transpose(1, 2))
        # softmax_w [batch_size, 1, time_step]
        softmax_w = F.softmax(atten_context, dim=-1)
        # context [batch_size, 1, hidden_dims]
        context = torch.bmm(softmax_w, h)
        result = context.squeeze(1)
        return result
    
    def pretrained_feature(self, x):
        with torch.no_grad():
            x_text = []
            x_audio = []
            for ele in x:
                x_text.append(ele[0])
                x_audio.append(ele[1])
            x_text, x_audio = Variable(torch.tensor(x_text).type(torch.FloatTensor), requires_grad=False),\
                                Variable(torch.tensor(x_audio).type(torch.FloatTensor), requires_grad=False)
    #         ============================= BiLSTM =================================
            # x : [len_seq, batch_size, embedding_dim]
            x_text = x_text.permute(1, 0, 2)
            output, (final_hidden_state, final_cell_state) = self.lstm_net(x_text)
            # output : [batch_size, len_seq, n_hidden * 2]
            output = output.permute(1, 0, 2)
            # final_hidden_state : [batch_size, num_layers * num_directions, n_hidden]
            final_hidden_state = final_hidden_state.permute(1, 0, 2)
            # final_hidden_state = torch.mean(final_hidden_state, dim=0, keepdim=True)
            # atten_out = self.attention_net(output, final_hidden_state)
            atten_out = self.attention_net_with_w(output, final_hidden_state)
            text_feature = self.fc_out(atten_out)

    #         ============================= BiLSTM =================================

    #         ============================= cnn =============================
            x_audio = x_audio.unsqueeze(1)
            x_audio = F.relu(self.conv2d_1(x_audio))
            x_audio = F.max_pool2d(x_audio, (4, 3), (1, 3))
            x_audio = F.relu(self.conv2d_2(x_audio))
            x_audio = F.max_pool2d(x_audio, (1, 3), (1, 3))
    #         flatten in keras
            x_audio = x_audio.permute((0, 2, 3, 1))
            x_audio = x_audio.contiguous().view(-1, 120736)
            x_audio = F.relu(self.dense_1(x_audio))
            x_audio = F.relu(self.dense_2(x_audio))
            audio_feature = self.dropout(x_audio)
#         ============================= cnn =============================
        return (text_feature, audio_feature)
        
    def forward(self, x): 
        output = self.fc_final(x)
        return output
    
class MyLoss(nn.Module):
    def __init__(self):
        super(MyLoss, self).__init__()
        
    def forward(self, text_feature, audio_feature, target, model):
        weight = model.fc_final[0].weight
        bias = model.fc_final[0].bias
        pred_text = F.linear(text_feature, weight[:, :config['hidden_dims']], bias)
        pred_audio = F.linear(audio_feature, weight[:, config['hidden_dims']:], bias)
        l = nn.CrossEntropyLoss()
        target = torch.tensor(target)
#         l = nn.BCEWithLogitsLoss()
#         target = F.one_hot(target, num_classes=2).type(torch.FloatTensor)
#         print('y: {}\npred_audio: {}\npred_text: {}\n'.format(target, pred_audio.data.max(1, keepdim=True)[1], pred_text.data.max(1, keepdim=True)[1]))
        # return l(pred_text, target) + l(pred_audio, target) + \
        #         config['lambda']*torch.norm(weight[:, :config['hidden_dims']]) + \
        #         config['lambda']*torch.norm(weight[:, config['hidden_dims']:])  
        return l(pred_text, target) + l(pred_audio, target)
    
                                                   

In [39]:
config = {
    'num_classes': 2,
    'dropout': 0.5,
    'rnn_layers': 2,
    'embedding_size': 1024,
    'batch_size': 2,
    'epochs': 60,
    'learning_rate': 1e-4,
    'hidden_dims': 128,
    'kernel_height': 35,
    'cuda': False,
    'lambda': 1e-2,
}

model = fusion_net(config['embedding_size'], config['hidden_dims'], \
                   config['rnn_layers'], config['dropout'], config['num_classes'], config['kernel_height'])

optimizer = optim.Adam(model.parameters(), lr=config['learning_rate'])
criterion = nn.CrossEntropyLoss()
# criterion = MyLoss()

def train(epoch, X_train, Y_train):
    global max_train_acc, train_acc
    model.train()
    batch_idx = 1
    total_loss = 0
    correct = 0
    for i in range(0, X_train.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_train.shape[0]:
            x, y = X_train[i:], Y_train[i:]
        else:
            x, y = X_train[i:(i+config['batch_size'])], Y_train[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        # 将模型的参数梯度设置为0
        optimizer.zero_grad()
        text_feature, audio_feature = model.pretrained_feature(x)
        # concat_x = torch.cat((text_feature, audio_feature), dim=1)
        # dot_x = text_feature.mul(audio_feature)
        add_x = text_feature.add(audio_feature)
        output = model(add_x)
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(torch.tensor(y).data.view_as(pred)).cpu().sum()
        loss = criterion(output, torch.tensor(y))
        # loss = criterion(text_feature, audio_feature, y, model)
        # 后向传播调整参数
        loss.backward()
        # 根据梯度更新网络参数
        optimizer.step()
        batch_idx += 1
        # loss.item()能够得到张量中的元素值
        total_loss += loss.item()
    cur_loss = total_loss
#     max_train_acc = correct
    train_acc = correct
    print('Train Epoch: {:2d}\t Learning rate: {:.4f}\tLoss: {:.6f}\t Accuracy: {}/{} ({:.0f}%)\n '.format(
                epoch, config['learning_rate'], cur_loss/len(X_train), correct, len(X_train),
        100. * correct / len(X_train)))


def evaluate(model_name, X_test, Y_test, model):
    model.eval()
    batch_idx = 1
    total_loss = 0
    pred = torch.empty(config['batch_size'], 1).type(torch.LongTensor)
    # global max_train_acc, f1s, accs, recs, precs, fold_idx, train_acc
    global max_train_acc, max_acc,max_f1
    for i in range(0, X_test.shape[0], config['batch_size']):
        if i + config['batch_size'] > X_test.shape[0]:
            x, y = X_test[i:], Y_test[i:]
        else:
            x, y = X_test[i:(i+config['batch_size'])], Y_test[i:(i+config['batch_size'])]
        if config['cuda']:
            x, y = Variable(torch.from_numpy(x).type(torch.FloatTensor), requires_grad=True).cuda(), Variable(torch.from_numpy(y)).cuda()
        text_feature, audio_feature = model.pretrained_feature(x)
        with torch.no_grad():
            add_x = text_feature.add(audio_feature)
            output = model(add_x)
        loss = criterion(output, torch.tensor(y))
        # loss = criterion(text_feature, audio_feature, y, model)
        pred = torch.cat((pred, output.data.max(1, keepdim=True)[1]))
        total_loss += loss.item()
    
    y_test_pred, conf_matrix = model_performance(Y_test, pred[2:])
    
    print('\nTest set: Average loss: {:.4f}'.format(total_loss/len(X_test)))
    # custom evaluation metrics
    print('Calculating additional test metrics...')
    accuracy = float(conf_matrix[0][0] + conf_matrix[1][1]) / np.sum(conf_matrix)
    precision = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[0][1])
    recall = float(conf_matrix[0][0]) / (conf_matrix[0][0] + conf_matrix[1][0])
    f1_score = 2 * (precision * recall) / (precision + recall)
    print("Accuracy: {}".format(accuracy))
    print("Precision: {}".format(precision))
    print("Recall: {}".format(recall))
    print("F1-Score: {}\n".format(f1_score))
    print('='*89)
    
    if max_f1 <= f1_score and max_train_acc >= 149:
        max_f1 = f1_score
        max_acc = accuracy
        save(model, '{}_{:.2f}'.format(model_name, max_f1))
        print('*'*64)
        print('model saved: f1: {}\tacc: {}'.format(max_f1, max_acc))
        print('*'*64)
#     if max_train_acc <= train_acc:
#         max_train_acc = train_acc
#         f1s[fold_idx] = f1_score
#         recs[fold_idx] = recall
#         precs[fold_idx] = precision
#         accs[fold_idx] = accuracy
    return total_loss

In [42]:
lstm_model = torch.load('/Users/apple/Downloads/depression/BiLSTM_elmo_128_0.83_best.pt')
cnn_model = CNN(2)
cnn_model.load_state_dict(torch.load('/Users/apple/Downloads/depression/cnn_melspec_clf.pt'))

# print("Model's state_dict:")
# for param_tensor in cnn_model.state_dict():
#     print(param_tensor, "\t", cnn_model.state_dict()[param_tensor].size())

merge_ordereddict = lstm_model.state_dict().copy()
merge_ordereddict.update(cnn_model.state_dict())

model.load_state_dict(merge_ordereddict, strict=False)
    
if config['cuda']:
    model = model.cuda()
    
for param in model.parameters():
    param.requires_grad = False

model.fc_final[0].weight.requires_grad = True
model.fc_final[0].bias.requires_grad = True

In [43]:
max_f1 = -1
max_acc = -1
max_train_acc = -1

for ep in range(1, config['epochs']):
    train(ep, X_train, Y_train)
    tloss = evaluate('fuse', X_test, Y_test, model)

Train Epoch:  1	 Learning rate: 0.0001	Loss: 0.387150	 Accuracy: 83/154 (54%)
 
Confusion Matrix:
[[ 1  2]
 [11 21]]

Test set: Average loss: 0.3468
Calculating additional test metrics...
Accuracy: 0.6285714285714286
Precision: 0.3333333333333333
Recall: 0.08333333333333333
F1-Score: 0.13333333333333333

Train Epoch:  2	 Learning rate: 0.0001	Loss: 0.367670	 Accuracy: 89/154 (58%)
 
Confusion Matrix:
[[ 1  2]
 [11 21]]

Test set: Average loss: 0.3468
Calculating additional test metrics...
Accuracy: 0.6285714285714286
Precision: 0.3333333333333333
Recall: 0.08333333333333333
F1-Score: 0.13333333333333333

Train Epoch:  3	 Learning rate: 0.0001	Loss: 0.419618	 Accuracy: 73/154 (47%)
 
Confusion Matrix:
[[ 1  2]
 [11 21]]

Test set: Average loss: 0.3468
Calculating additional test metrics...
Accuracy: 0.6285714285714286
Precision: 0.3333333333333333
Recall: 0.08333333333333333
F1-Score: 0.13333333333333333

Train Epoch:  4	 Learning rate: 0.0001	Loss: 0.409878	 Accuracy: 76/154 (49%)
 
Co

KeyboardInterrupt: 

In [10]:
fuse_model = torch.load('/Users/apple/Downloads/depression/fuse_0.85.pt')
tloss = evaluate('fuse', X_test, Y_test, fuse_model)                        

Confusion Matrix:
[[11  3]
 [ 1 20]]

Test set: Average loss: 605395734.3144
Calculating additional test metrics...
Accuracy: 0.8857142857142857
Precision: 0.7857142857142857
Recall: 0.9166666666666666
F1-Score: 0.8461538461538461



UnboundLocalError: local variable 'max_f1' referenced before assignment

In [36]:
from sklearn.model_selection import StratifiedKFold

features_whole = np.load(prefix+'data/fuse/whole_samples.npz', allow_pickle=True)['arr_0']
targets_whole = np.load(prefix+'data/fuse/whole_labels.npz', allow_pickle=True)['arr_0']
fold = 4
fold_idx = 0
kfold = StratifiedKFold(n_splits=fold, shuffle=True)
max_train_acc = -1
train_acc = -1
f1s = np.zeros(fold)
accs = np.zeros(fold)
precs = np.zeros(fold)
recs = np.zeros(fold)

for train_idx, test_idx in kfold.split(features_whole, targets_whole):
    for ep in range(1, config['epochs']):
        train(ep, features_whole[train_idx], targets_whole[train_idx])
        tloss = evaluate('fuse_cv', features_whole[test_idx], targets_whole[test_idx])
    fold_idx += 1
    max_train_acc = -1
    train_acc = -1

  input = module(input)


Train Epoch:  1	 Learning rate: 0.0001	Loss: 27396845.244322	 Accuracy: 139/150 (93%)
 
Confusion Matrix:
[[20  1]
 [ 5 24]]

Test set: Average loss: 30268969.9373
Calculating additional test metrics...
Accuracy: 0.88
Precision: 0.9523809523809523
Recall: 0.8
F1-Score: 0.8695652173913043

Train Epoch:  2	 Learning rate: 0.0001	Loss: 49739154.156523	 Accuracy: 136/150 (91%)
 
Confusion Matrix:
[[21  2]
 [ 4 23]]

Test set: Average loss: 31198778.9012
Calculating additional test metrics...
Accuracy: 0.88
Precision: 0.9130434782608695
Recall: 0.84
F1-Score: 0.8749999999999999

Train Epoch:  3	 Learning rate: 0.0001	Loss: 47398616.441149	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[20  1]
 [ 5 24]]

Test set: Average loss: 41997094.7366
Calculating additional test metrics...
Accuracy: 0.88
Precision: 0.9523809523809523
Recall: 0.8
F1-Score: 0.8695652173913043

Train Epoch:  4	 Learning rate: 0.0001	Loss: 37285254.120403	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[21  2]
 [ 4 23]]

Tes

Train Epoch: 23	 Learning rate: 0.0001	Loss: 65776668.280911	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[20  1]
 [ 5 24]]

Test set: Average loss: 58479626.2543
Calculating additional test metrics...
Accuracy: 0.88
Precision: 0.9523809523809523
Recall: 0.8
F1-Score: 0.8695652173913043

Train Epoch: 24	 Learning rate: 0.0001	Loss: 45423022.493199	 Accuracy: 140/150 (93%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 52796537.3003
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch: 25	 Learning rate: 0.0001	Loss: 70366721.531172	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 46336875.8603
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch: 26	 Learning rate: 0.0001	Loss: 43245651.603105	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test

Train Epoch: 45	 Learning rate: 0.0001	Loss: 33135779.454090	 Accuracy: 136/150 (91%)
 
Confusion Matrix:
[[21  2]
 [ 4 23]]

Test set: Average loss: 26646336.9805
Calculating additional test metrics...
Accuracy: 0.88
Precision: 0.9130434782608695
Recall: 0.84
F1-Score: 0.8749999999999999

Train Epoch: 46	 Learning rate: 0.0001	Loss: 41915181.158125	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 27425584.0204
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch: 47	 Learning rate: 0.0001	Loss: 37376998.636048	 Accuracy: 136/150 (91%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 27344604.5005
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch: 48	 Learning rate: 0.0001	Loss: 24858258.170997	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Tes

Train Epoch:  8	 Learning rate: 0.0001	Loss: 31723940.822042	 Accuracy: 132/150 (88%)
 
Confusion Matrix:
[[25  1]
 [ 0 24]]

Test set: Average loss: 3033734.4598
Calculating additional test metrics...
Accuracy: 0.98
Precision: 0.9615384615384616
Recall: 1.0
F1-Score: 0.9803921568627451

Train Epoch:  9	 Learning rate: 0.0001	Loss: 28027064.562925	 Accuracy: 133/150 (89%)
 
Confusion Matrix:
[[25  1]
 [ 0 24]]

Test set: Average loss: 2429322.3002
Calculating additional test metrics...
Accuracy: 0.98
Precision: 0.9615384615384616
Recall: 1.0
F1-Score: 0.9803921568627451

Train Epoch: 10	 Learning rate: 0.0001	Loss: 53083240.395528	 Accuracy: 131/150 (87%)
 
Confusion Matrix:
[[25  1]
 [ 0 24]]

Test set: Average loss: 2055215.4205
Calculating additional test metrics...
Accuracy: 0.98
Precision: 0.9615384615384616
Recall: 1.0
F1-Score: 0.9803921568627451

Train Epoch: 11	 Learning rate: 0.0001	Loss: 45837927.394009	 Accuracy: 133/150 (89%)
 
Confusion Matrix:
[[25  1]
 [ 0 24]]

Test se

Train Epoch: 30	 Learning rate: 0.0001	Loss: 58861492.615187	 Accuracy: 131/150 (87%)
 
Confusion Matrix:
[[25  1]
 [ 0 24]]

Test set: Average loss: 4278439.7404
Calculating additional test metrics...
Accuracy: 0.98
Precision: 0.9615384615384616
Recall: 1.0
F1-Score: 0.9803921568627451

Train Epoch: 31	 Learning rate: 0.0001	Loss: 234076978.455315	 Accuracy: 125/150 (83%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 30945370.9397
Calculating additional test metrics...
Accuracy: 0.94
Precision: 1.0
Recall: 0.88
F1-Score: 0.9361702127659575

Train Epoch: 32	 Learning rate: 0.0001	Loss: 72548251.212615	 Accuracy: 129/150 (86%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 15996028.2197
Calculating additional test metrics...
Accuracy: 0.94
Precision: 1.0
Recall: 0.88
F1-Score: 0.9361702127659575

Train Epoch: 33	 Learning rate: 0.0001	Loss: 83794078.543918	 Accuracy: 131/150 (87%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 3631335.

Train Epoch: 52	 Learning rate: 0.0001	Loss: 133481683.331474	 Accuracy: 126/150 (84%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 23571644.2197
Calculating additional test metrics...
Accuracy: 0.94
Precision: 1.0
Recall: 0.88
F1-Score: 0.9361702127659575

Train Epoch: 53	 Learning rate: 0.0001	Loss: 95754836.924808	 Accuracy: 130/150 (87%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 7876780.8599
Calculating additional test metrics...
Accuracy: 0.94
Precision: 1.0
Recall: 0.88
F1-Score: 0.9361702127659575

Train Epoch: 54	 Learning rate: 0.0001	Loss: 34873957.305464	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 5203253.8200
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 55	 Learning rate: 0.0001	Loss: 84907699.651899	 Accuracy: 129/150 (86%)
 
Confusion Matrix:
[[22  0]
 [ 3 25]]

Test set: Average loss: 14983955

Train Epoch: 15	 Learning rate: 0.0001	Loss: 19929440.212192	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 60103975.7256
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 16	 Learning rate: 0.0001	Loss: 26586078.146888	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 63376577.3257
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 17	 Learning rate: 0.0001	Loss: 45146675.723972	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[22  2]
 [ 3 23]]

Test set: Average loss: 82802878.0887
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9166666666666666
Recall: 0.88
F1-Score: 0.8979591836734694

Train Epoch: 18	 Learning rate: 0.0001	Loss: 23393270.441765	 Accuracy: 137/150 (91%)
 
Confusion Matrix:
[[22  2]
 [ 3 23]]

Te

Train Epoch: 37	 Learning rate: 0.0001	Loss: 21757988.637124	 Accuracy: 137/150 (91%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 103930057.3275
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 38	 Learning rate: 0.0001	Loss: 19237580.846434	 Accuracy: 139/150 (93%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 98158898.9274
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 39	 Learning rate: 0.0001	Loss: 21869229.109804	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 95425727.4074
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 40	 Learning rate: 0.0001	Loss: 26662166.018184	 Accuracy: 137/150 (91%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]



Train Epoch: 59	 Learning rate: 0.0001	Loss: 12564914.584445	 Accuracy: 135/150 (90%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 81601827.2470
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch:  1	 Learning rate: 0.0001	Loss: 33666022.859640	 Accuracy: 132/150 (88%)
 
Confusion Matrix:
[[25  5]
 [ 0 20]]

Test set: Average loss: 20331054.7284
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.8333333333333334
Recall: 1.0
F1-Score: 0.9090909090909091

Train Epoch:  2	 Learning rate: 0.0001	Loss: 30829921.424023	 Accuracy: 128/150 (85%)
 
Confusion Matrix:
[[24  2]
 [ 1 23]]

Test set: Average loss: 4299711.0497
Calculating additional test metrics...
Accuracy: 0.94
Precision: 0.9230769230769231
Recall: 0.96
F1-Score: 0.9411764705882353

Train Epoch:  3	 Learning rate: 0.0001	Loss: 31847530.485997	 Accuracy: 130/150 (87%)
 
Confusion Matrix:
[[25  9]
 [ 0 16]]

Test 

Train Epoch: 22	 Learning rate: 0.0001	Loss: 47491216.455383	 Accuracy: 132/150 (88%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 4205426.5697
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 23	 Learning rate: 0.0001	Loss: 39051441.842360	 Accuracy: 134/150 (89%)
 
Confusion Matrix:
[[21  1]
 [ 4 24]]

Test set: Average loss: 4390427.2093
Calculating additional test metrics...
Accuracy: 0.9
Precision: 0.9545454545454546
Recall: 0.84
F1-Score: 0.8936170212765958

Train Epoch: 24	 Learning rate: 0.0001	Loss: 63700362.903786	 Accuracy: 134/150 (89%)
 
Confusion Matrix:
[[24  1]
 [ 1 24]]

Test set: Average loss: 4467463.3699
Calculating additional test metrics...
Accuracy: 0.96
Precision: 0.96
Recall: 0.96
F1-Score: 0.96

Train Epoch: 25	 Learning rate: 0.0001	Loss: 41934989.693785	 Accuracy: 138/150 (92%)
 
Confusion Matrix:
[[24  1]
 [ 1 24]]

Test set: Average loss: 4209045.4

Train Epoch: 45	 Learning rate: 0.0001	Loss: 45632773.466794	 Accuracy: 133/150 (89%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test set: Average loss: 5245337.6104
Calculating additional test metrics...
Accuracy: 0.92
Precision: 0.9565217391304348
Recall: 0.88
F1-Score: 0.9166666666666666

Train Epoch: 46	 Learning rate: 0.0001	Loss: 35886128.806322	 Accuracy: 139/150 (93%)
 
Confusion Matrix:
[[24  2]
 [ 1 23]]

Test set: Average loss: 4737685.4502
Calculating additional test metrics...
Accuracy: 0.94
Precision: 0.9230769230769231
Recall: 0.96
F1-Score: 0.9411764705882353

Train Epoch: 47	 Learning rate: 0.0001	Loss: 38086944.352010	 Accuracy: 134/150 (89%)
 
Confusion Matrix:
[[24  2]
 [ 1 23]]

Test set: Average loss: 4063335.6903
Calculating additional test metrics...
Accuracy: 0.94
Precision: 0.9230769230769231
Recall: 0.96
F1-Score: 0.9411764705882353

Train Epoch: 48	 Learning rate: 0.0001	Loss: 30253627.762261	 Accuracy: 134/150 (89%)
 
Confusion Matrix:
[[22  1]
 [ 3 24]]

Test

In [37]:
print("Accuracy: {}".format(np.mean(accs)))
print("Precision: {}".format(np.mean(precs)))
print("Recall: {}".format(np.mean(recs)))
print("F1-Score: {}\n".format(np.mean(f1s)))
print('='*89)

Accuracy: 0.945
Precision: 0.9585360291882032
Recall: 0.9299999999999999
F1-Score: 0.9427670004171882



In [38]:
accs

array([0.9 , 0.98, 0.92, 0.98])