In [10]:
from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Input
from keras.models import Model
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.constraints import unitnorm
from keras.layers.core import Reshape, Flatten, Merge
from keras.layers.convolutional import Convolution2D, MaxPooling2D, Convolution1D, MaxPooling1D
from sklearn.cross_validation import KFold
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
import numpy as np
from sklearn import cross_validation
import math
from keras_input_data import make_idx_data
from load_vai import loadVAI
import _pickle as cPickle
from metrics import continuous_metrics
from keras import backend as K
import csv

In [11]:
def imdb_cnn(W=None):
    """Build a 1-D convolutional regression model over pretrained embeddings.

    Args:
        W: embedding matrix; ``W.shape[0]`` is used as the vocabulary size and
            ``W.shape[1]`` as the embedding width, and ``W`` itself is passed
            as the initial weights of the Embedding layer. The ``None``
            default is kept only for signature compatibility; a matrix is
            required.

    Returns:
        An uncompiled ``Sequential`` model ending in a single linear unit.

    Raises:
        ValueError: if ``W`` is None.
    """
    if W is None:
        # Without the matrix neither the vocabulary size nor the initial
        # weights are known, so fail early with an explicit message.
        raise ValueError('imdb_cnn requires the embedding matrix W, got None')

    # Number of feature maps (outputs of convolutional layer)
    N_fm = 10
    # kernel size of convolutional layer
    kernel_size = 10
    # Vocabulary size and embedding width both come from W so the layer
    # always agrees with the weights it is initialised with.
    max_features, dims = W.shape[0], W.shape[1]
    maxlen = 87  # maxlen of sentence
    hidden_dims = 100
    print('Build model...')
    model = Sequential()

    # Embedding layer mapping vocabulary indices to dims-dimensional vectors,
    # initialised from W.
    model.add(Embedding(max_features, dims, input_length=maxlen, weights=[W]))

    # Convolution1D learning N_fm word-group filters of width kernel_size.
    model.add(Convolution1D(nb_filter=N_fm,
                            filter_length=kernel_size,
                            border_mode='valid',
                            activation='relu',
                            ))
    model.add(Dropout(0.4))
    # Max pooling with pool_length=1 (the pooling window covers a single step).
    model.add(MaxPooling1D(pool_length=1))

    # Flatten the convolutional output so a dense layer can follow.
    model.add(Flatten())

    # Hidden dense layer; dropout is applied before the ReLU activation.
    model.add(Dense(hidden_dims))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))

    # Single-unit output with a linear activation (regression target).
    model.add(Dense(1))
    model.add(Activation('linear'))
    print('----------- imdbcnn----------')
    return model

In [12]:
def lstm(W):
    """Assemble a plain LSTM regression network.

    Only the shape of ``W`` is used (vocabulary size and embedding width);
    no ``weights`` argument is passed to the Embedding layer here. The
    sequence length is read from the module-level name ``maxlen``, which must
    already be defined when this function is called.

    Args:
        W: object exposing ``shape[0]`` and ``shape[1]``.

    Returns:
        An uncompiled ``Sequential`` model ending in a single linear unit.
    """
    vocab_size, embedding_dim = W.shape[0], W.shape[1]

    network = Sequential()
    stack = (
        Embedding(vocab_size, embedding_dim, input_length=maxlen),
        LSTM(200),  # try using a GRU instead, for fun
        Dropout(0.2),
        Dense(1),
        Activation('linear'),
    )
    for layer in stack:
        network.add(layer)

    print('----------- lstm----------')
    return network

In [13]:

def cnn_lstm(W):
    """Assemble a Convolution1D -> LSTM regression network.

    The Embedding layer is sized from ``W.shape`` and initialised with ``W``.
    The sequence length is read from the module-level name ``maxlen``, which
    must already be defined when this function is called.

    Args:
        W: embedding matrix exposing ``shape[0]`` and ``shape[1]``.

    Returns:
        An uncompiled ``Sequential`` model ending in a single linear unit.
    """
    conv_filters = 20
    conv_width = 3
    pool_width = 1
    recurrent_units = 60
    drop_rate = 0.5

    network = Sequential()
    stack = (
        Embedding(W.shape[0], W.shape[1], input_length=maxlen, weights=[W]),
        Dropout(drop_rate),
        Convolution1D(nb_filter=conv_filters,
                      filter_length=conv_width,
                      border_mode='valid',
                      activation='relu',
                      subsample_length=1),
        MaxPooling1D(pool_length=pool_width),
        LSTM(recurrent_units),
        # The dense layer reuses the LSTM width and has no activation argument.
        Dense(recurrent_units),
        Dropout(drop_rate),
        Dense(1),
        Activation('linear'),
    )
    for layer in stack:
        network.add(layer)

    print('----------- cnnlstm----------')
    return network

In [22]:
def lstm_cnn(W):
    """Assemble an LSTM -> Convolution1D regression network (functional API).

    The Embedding layer is sized from ``W.shape`` and initialised with ``W``.
    The sequence length is read from the module-level name ``maxlen``, which
    must already be defined when this function is called.

    Args:
        W: embedding matrix exposing ``shape[0]`` and ``shape[1]``.

    Returns:
        An uncompiled ``Model`` named ``'model'`` mapping the index input to a
        single linear output.
    """
    conv_filters = 10
    conv_width = 2
    pool_width = 1

    # Input is one comma-delimited region (sentence) of the text, i.e. one
    # region of the whole document.
    region_input = Input(shape=(maxlen,), dtype='int32', name='region_input')
    embedded = Embedding(W.shape[0], W.shape[1], weights=[W], input_length=maxlen)(region_input)

    # return_sequences=True keeps one LSTM output per step for the convolution.
    recurrent_states = LSTM(50, return_sequences=True, name='lstm')(embedded)

    convolved = Convolution1D(nb_filter=conv_filters,
                              filter_length=conv_width,
                              border_mode='valid',
                              activation='relu',
                              subsample_length=1)(recurrent_states)
    pooled = MaxPooling1D(pool_length=pool_width)(convolved)
    flattened = Flatten()(pooled)
    hidden = Dense(30, activation='relu')(flattened)
    output = Dense(1, activation='linear')(hidden)

    network = Model(region_input, output, name='model')
    print('----------- lstmcnn----------')
    return network

In [24]:

def _rotate_fold(train, test, fold_size):
    """Advance the rotating hold-out scheme by one fold.

    The first ``fold_size`` items of ``train`` become the new test fold and
    the previous test fold is appended to the remaining training items.

    Args:
        train: list of training items for the current fold.
        test: list of held-out items for the current fold.
        fold_size: number of leading training items moved into the test fold.

    Returns:
        A ``(new_train, new_test)`` tuple built by slicing and concatenation.
    """
    return train[fold_size:] + test, train[:fold_size]


if __name__ == '__main__':
    import os  # only this script section needs it

    # NOTE(review): unpickling can execute arbitrary code; only load "mr.p"
    # from a trusted source.
    with open("mr.p", "rb") as pickle_file:
        x = cPickle.load(pickle_file)
    revs, W, W2, word_idx_map, vocab = x[0], x[1], x[2], x[3], x[4]
    print("data loaded!")
    sentences = [rev['text'] for rev in revs]
    idx_data = make_idx_data(sentences, word_idx_map)

    dim = 'V'
    column = loadVAI(dim)
    # The model builders read `maxlen` from module scope, so it must be set
    # before any of them is called.
    maxlen = 87  # cut texts after this number of words
    batch_size = 8

    Y = [float(value) for value in np.array(column)]

    SEED = 42  # NOTE(review): currently not applied to any random generator
    # 1-based sample IDs, one per target value.
    ID = list(range(1, len(Y) + 1))
    n = 5  # used both as the number of repetitions and as the folds per repetition

    # Create the output directory up front so a missing directory does not
    # surface only after a full training run.
    output_dir = './irony_1005_2'
    os.makedirs(output_dir, exist_ok=True)

    X_train_0, X_test_0, y_train_0, y_test_0, ID_train_0, ID_test_0 = cross_validation.train_test_split(
         idx_data, Y, ID, test_size=.20, random_state=2)
    # Size of one hold-out fold, taken from the split instead of a literal.
    fold_size = len(X_test_0)

    for repetition in range(n):
        # Metric sums restart for every repetition.
        n_MAE = 0
        n_Pearson_r = 0

        for fold in range(n):
            # The first fold of a repetition reuses the current split (which
            # carries over from the previous repetition); later folds rotate
            # the next slice of the training data into the test position.
            if fold > 0:
                X_train_0, X_test_0 = _rotate_fold(X_train_0, X_test_0, fold_size)
                y_train_0, y_test_0 = _rotate_fold(y_train_0, y_test_0, fold_size)
                ID_train_0, ID_test_0 = _rotate_fold(ID_train_0, ID_test_0, fold_size)
            y_train, y_test = y_train_0, y_test_0
            ID_test = ID_test_0

            X_train = sequence.pad_sequences(X_train_0, maxlen=maxlen)
            X_test = sequence.pad_sequences(X_test_0, maxlen=maxlen)

            model = lstm_cnn(W)
            model.compile(loss='mae', optimizer='adagrad')

            # NOTE(review): the held-out fold doubles as validation data, so
            # early stopping is driven by the same samples that are scored.
            early_stopping = EarlyStopping(monitor='val_loss', patience=5)
            result = model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=10,
                               validation_data=(X_test, y_test),
                               callbacks=[early_stopping])
            score = model.evaluate(X_test, y_test, batch_size=batch_size)

            # Flatten the prediction output to one value per test sample.
            predict = model.predict(X_test, batch_size=batch_size).reshape((1, len(X_test)))[0]
            predict = list(predict)

            # One prediction file per repetition. The file is opened in append
            # mode and a header row is written for every fold, so each file
            # holds one header-plus-rows section per fold (and per re-run).
            headers = ['ID_test', 'pred']
            prediction_path = output_dir + '/V_lstmcnn' + str(repetition + 1) + '_prediction.csv'
            with open(prediction_path, 'a+', newline='') as f:
                write = csv.writer(f)
                write.writerow(headers)
                for sample_id, predicted in zip(ID_test, predict):
                    write.writerow([sample_id, predicted])

            # NOTE(review): index 0 is accumulated as MAE and index 1 as
            # Pearson r; confirm this against the return order of
            # metrics.continuous_metrics.
            estimate = continuous_metrics(y_test, predict, 'prediction result:')
            n_MAE += estimate[0]
            n_Pearson_r += estimate[1]

        ndigit = 3

        # Average over the folds actually run.
        avg_MAE = round(n_MAE / n, ndigit)
        avg_Pearson_r = round(n_Pearson_r / n, ndigit)

        print('average evaluate result:')
        print(avg_MAE, avg_Pearson_r)

data loaded!
--------------- now is valence1005------------
----------- lstmcnn----------
Train on 804 samples, validate on 201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MAE, Pearson_r
----------- lstmcnn----------
Train on 804 samples, validate on 201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MAE, Pearson_r
----------- lstmcnn----------
Train on 804 samples, validate on 201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
MAE, Pearson_r
----------- lstmcnn----------
Train on 804 samples, validate on 201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
----------- lstmcnn----------
Train on 804 samples, validate on 201 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
average evaluate result:
0.401 0.323
---