In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils.class_weight import compute_sample_weight, compute_class_weight
from matplotlib import pyplot as plt
import joblib
from sklearn.metrics import f1_score
#from scipy.interpolate import spline
import seaborn

In [None]:
# Load the raw emotion annotations and keep the label column for later cells.
Emotion = pd.read_csv('./trainData/emotion.csv')
emotion = Emotion['emotion']

# Class distribution of the annotations: one bar per emotion value.
emotion_counts = emotion.value_counts()
seaborn.barplot(x=emotion_counts.index, y=emotion_counts)



In [None]:
# Build a class-capped list of row positions: for every emotion keep at most
# `maxsize` rows, in their original order, grouped by emotion in the order
# the emotions first appear.
maxsize = 240000
emotion_list = emotion.unique()

# The previous loop iterated over `labs['emotion']`, but `labs` is never
# defined in this notebook (its load is commented out), so a fresh kernel
# raised NameError here.
# NOTE(review): the `emotion` column read from emotion.csv is used instead;
# this assumes its row order matches the label/embedding pickles -- confirm.
emotion_values = emotion.values

idx_train = []
for emo in emotion_list:
    # Positions of this class, truncated to the per-class cap.
    class_positions = np.flatnonzero(emotion_values == emo)[:maxsize]
    idx_train.extend(class_positions.tolist())

In [None]:
# Locations of the pre-computed label and embedding-index pickles.
label_path = './fea_sel/label/label8_ver2.pkl'
train_features_path = './fea_sel/embed_model/emb_train.pkl'
test_features_path = './fea_sel/embed_model/emb_test.pkl'

y_data = joblib.load(label_path)
#labs = joblib.load('./fea_sel/label/label_final.pkl')

# Keep only the rows selected in idx_train, for features and labels alike.
X_train = joblib.load(train_features_path)[idx_train]
Y_train = pd.Series(y_data)[idx_train]

# The test features are used in full.
X_test = joblib.load(test_features_path)

In [None]:
# Class distribution of the selected training labels.
train_label_counts = Y_train.value_counts()
seaborn.barplot(x=train_label_counts.index, y=train_label_counts)

In [None]:
# Stratified 5-fold splitter with shuffling and a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Binarize the training labels; `lb` is reused later to name the classes and
# to map predictions back to labels.
lb = LabelBinarizer()
y = lb.fit_transform(Y_train)

# Per-sample weights from the 'balanced' class-weight heuristic, followed by a
# quick look at which distinct weights occur and how often.
sample_weight = compute_sample_weight(class_weight='balanced', y=Y_train)
np.unique(sample_weight, return_counts=True)

In [None]:
from keras.layers import (Input, Dense, Embedding, 
                          Conv2D, SeparableConv2D, 
                          MaxPool2D, AvgPool2D, 
                          Add, Multiply, Subtract)
from keras.layers import (Reshape, Flatten, Dropout, 
                          Concatenate, BatchNormalization, 
                          LeakyReLU, Activation, MaxoutDense)
from keras.constraints import max_norm
from keras.activations import softplus, tanh, relu, elu
from keras import regularizers
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam, RMSprop, SGD, Nadam
from keras.models import Model
from keras import backend as K

from keras.callbacks import LearningRateScheduler

In [None]:
# Embedding matrix loaded from disk. The Embedding() helper unpacks its
# .shape as (input_dim, output_dim) and passes it as the layer's initial weights.
embedding = joblib.load('./fea_sel/embed_model/tweet_model_embedding.pkl')

In [None]:
def print_report(y_val, y_val_pred, sample_weight):
    """Print two classification reports for one validation fold.

    Both inputs are reduced to class indices with an argmax over axis 1, and
    the classes are named after the fitted global binarizer ``lb``.

    Args:
        y_val: 2-D array of true labels, one column per class.
        y_val_pred: 2-D array of predicted scores, one column per class.
        sample_weight: per-sample weights used for the second report only.
    """
    true_idx = np.argmax(y_val, axis=1)
    pred_idx = np.argmax(y_val_pred, axis=1)
    class_names = list(lb.classes_)

    # First report: every sample counts equally.
    unweighted = classification_report(y_true=true_idx,
                                       y_pred=pred_idx,
                                       target_names=class_names)
    print(unweighted)

    # Second report: samples weighted by `sample_weight`.
    weighted = classification_report(y_true=true_idx,
                                     y_pred=pred_idx,
                                     target_names=class_names,
                                     sample_weight=sample_weight)
    print(weighted)
def f1(y_true, y_pred):
    """F1 metric built from backend ops.

    Precision and recall are computed over whatever tensors are passed in (a
    batch-wise value, as the original docstrings noted), then combined as
    2 * P * R / (P + R). ``K.epsilon()`` is added to every denominator to
    avoid division by zero.

    Args:
        y_true: tensor of ground-truth indicators.
        y_pred: tensor of predicted scores.

    Returns:
        A scalar tensor holding the F1 value.
    """
    eps = K.epsilon()

    # Counts are taken after clipping to [0, 1] and rounding.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    # Precision: share of selected items that are relevant.
    precision_value = true_positives / (predicted_positives + eps)
    # Recall: share of relevant items that are selected.
    recall_value = true_positives / (possible_positives + eps)

    harmonic = (precision_value * recall_value) / (precision_value + recall_value + eps)
    return 2 * harmonic

In [None]:
def Embedding(inputs, embedding, max_document_length, trainable=False, max_norm_value=25):
    """Look up embeddings for `inputs` and add a trailing channel axis.

    Args:
        inputs: tensor of token indices fed to the embedding layer.
        embedding: 2-D matrix whose shape is (input_dim, output_dim); used as
            the layer's initial weights.
        max_document_length: sequence length passed as `input_length`.
        trainable: whether the embedding weights are updated during training.
        max_norm_value: max-norm constraint applied to the embedding weights.

    Returns:
        Tensor reshaped to (max_document_length, output_dim, 1) per sample, so
        it can be consumed by the 2-D convolutions.
    """
    # This helper shares its name with the Keras layer imported at the top of
    # the notebook and therefore shadows it: the previous body ended up calling
    # itself with layer keyword arguments and raised TypeError. Import the
    # real layer under a local alias so the helper's public name is unchanged.
    from keras.layers import Embedding as KerasEmbedding

    input_dim, output_dim = embedding.shape
    embedded = KerasEmbedding(input_dim=input_dim,
                              output_dim=output_dim,
                              weights=[embedding],
                              input_length=max_document_length,
                              trainable=trainable,
                              embeddings_constraint=max_norm(max_norm_value))(inputs)
    embedded = Reshape((max_document_length, output_dim, 1))(embedded)
    return embedded
def conv2D(inputs, filter_sizes, num_filters, embedding_dim, max_document_length, activation=None):
    """Apply one convolution + max-pool branch per filter size and merge them.

    Args:
        inputs: tensor fed to every convolution branch.
        filter_sizes: iterable of kernel heights, one branch per entry.
        num_filters: number of filters in each convolution.
        embedding_dim: kernel width of each convolution.
        max_document_length: used to size each pooling window as
            (max_document_length - size + 1, 1).
        activation: accepted but not used; every branch applies LeakyReLU.

    Returns:
        The pooled branches concatenated along axis 1 and flattened.
    """
    def _branch(window):
        # Convolution with a (window, embedding_dim) kernel and 'valid' padding.
        features = Conv2D(num_filters,
                          kernel_size=(window, embedding_dim),
                          padding='valid',
                          kernel_initializer='glorot_uniform',
                          kernel_constraint=max_norm(200),
                          bias_initializer='zeros')(inputs)
        features = LeakyReLU()(features)
        # Max-pool with a window of (max_document_length - window + 1, 1).
        return MaxPool2D(pool_size=(max_document_length - window + 1, 1),
                         strides=(1, 1),
                         padding='valid')(features)

    pooled_branches = [_branch(window) for window in filter_sizes]
    merged = Concatenate(axis=1)(pooled_branches)
    return Flatten()(merged)
def scheduler(epoch):
    """Step-decay schedule: multiply the learning rate by 0.1 every 20 epochs.

    Reads and writes the optimizer of the global ``model``, so that name must
    refer to a compiled network by the time the callback fires.

    Args:
        epoch: epoch index passed in by the callback.

    Returns:
        The learning rate currently stored on the optimizer (after any decay).
    """
    lr_variable = model.optimizer.lr

    # Decay on epochs 20, 40, ... but never on epoch 0.
    is_decay_epoch = epoch != 0 and epoch % 20 == 0
    if is_decay_epoch:
        current_lr = K.get_value(lr_variable)
        K.set_value(lr_variable, current_lr * 0.1)
        print("lr changed to {}".format(current_lr * 0.1))

    return K.get_value(lr_variable)


# Wrap scheduler() in a Keras LearningRateScheduler callback; it is passed to
# model.fit in the training loop.
reduce_lr = LearningRateScheduler(scheduler)
def model(filter_sizes=(3, 3, 3), 
                   num_filters=150, 
                   embedd_trainable=False, 
                   activation=None):
    """Build and compile the CNN text classifier.

    Two embedding lookups of the same input (one frozen, one trainable, both
    initialised from the global ``embedding`` matrix) are summed, passed
    through the parallel convolution branches of ``conv2D``, then through
    dropout, a tanh dense layer, a maxout layer and a softmax output.

    Reads the globals ``max_document_length``, ``embedding`` and ``num_label``,
    and clears the Keras session before building.

    Args:
        filter_sizes: kernel heights, one convolution branch per entry.
        num_filters: number of filters per convolution branch.
        embedd_trainable: accepted but not used; the two embedding lookups are
            fixed to trainable=False and trainable=True respectively.
        activation: accepted but not used.

    Returns:
        The compiled Keras model (SGD optimizer, categorical cross-entropy
        loss, ``f1`` metric). A summary is printed as a side effect.
    """
    K.clear_session()
    inputs_2 = Input(shape=(max_document_length, ), dtype='int32')

    # Frozen and trainable copies of the same embedding, summed element-wise.
    embedding_2 = Embedding(inputs_2, 
                            embedding, 
                            max_document_length, 
                            trainable=False)
    embedding_3 = Embedding(inputs_2, 
                            embedding, 
                            max_document_length, 
                            trainable=True)
    embedding_2 = Add()([embedding_2, embedding_3])

    # The kernel width must match the embedding width that Embedding()
    # reshapes to; it used to be hard-coded as 100, which is only right for a
    # 100-column embedding matrix.
    concatenated_tensor_2 = conv2D(embedding_2, 
                                   filter_sizes, 
                                   num_filters, 
                                   embedding.shape[1], 
                                   max_document_length)
    concatenated_tensor = Dropout(0.35)(concatenated_tensor_2)
   
    dense_0 = Dense(units=400, 
                    activation='tanh',
                    kernel_constraint=max_norm(30))(concatenated_tensor)
    dense_1 = MaxoutDense(output_dim=32,
                          nb_feature=4,
                          W_constraint=max_norm(3))(dense_0)
    output  = Dense(units=num_label, 
                    activation='softmax',
                    kernel_constraint=max_norm(3))(dense_1)
    
    # Local name differs from the function name to avoid shadowing it here.
    network = Model(inputs=[inputs_2], outputs=output)
    network.compile(optimizer=SGD(lr=0.1, decay=1e-6, momentum=0.9), 
                    loss=categorical_crossentropy, 
                    metrics=[f1])
    network.summary()
    return network

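Control panel: shared settings (number of labels, sequence length, batch size, embedding width) used by the model builder and the training loop.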

In [None]:
# Mini-batch size passed to model.fit.
batch_size = 128

# Number of output units: one per class known to the fitted binarizer.
num_label = len(lb.classes_)

# Sequence length is the second dimension of the training feature matrix.
max_document_length = X_train.shape[1]

# Width of the embedding matrix (its second dimension).
embedding_dim = embedding.shape[1]

In [None]:
# `model` is the builder function until the first fold rebinds the name to the
# compiled network (scheduler() reads that global to adjust the learning rate).
# The previous `model = model()` therefore failed on the second fold, because
# the name no longer referred to the builder. Keep a separate handle to the
# builder; the isinstance guard leaves the handle intact when this cell is
# re-run after `model` has already been rebound to a network.
if not isinstance(model, Model):
    model_factory = model

# Placeholder feature array of the right length; the folds are stratified on Y_train.
fake = np.zeros(X_train.shape[0])
for i, (train_index, val_index) in enumerate(skf.split(fake, Y_train)):
    x_train = X_train[train_index]
    y_train = y[train_index]
    x_val = X_train[val_index]
    y_val = y[val_index]
    # The per-fold sample weights are split out here but not passed to fit()
    # below (sample_weight=None), matching the previous behaviour.
    (sample_weight_train, 
     sample_weight_val) = (sample_weight[train_index], sample_weight[val_index])

    # One checkpoint file per fold, keeping only the epoch with the best val_f1.
    path = './fea_sel/embed_model/{}_weights.best.hdf5'.format(i)
    checkpoint = ModelCheckpoint(path, 
                                 monitor='val_f1', 
                                 verbose=1, 
                                 save_best_only=True, 
                                 mode='max')
    print("Creating Model...")
    # Rebinding the global `model` is deliberate: scheduler() looks it up.
    model = model_factory()
    model_hist = model.fit([x_train], y_train,
                           sample_weight=None,
                           validation_data=([x_val], y_val, None),
                           batch_size=batch_size,
                           shuffle=True,
                           epochs=40, 
                           verbose=1,
                           callbacks=[checkpoint,reduce_lr])
    # Restore the best checkpoint of this fold before moving on.
    model.load_weights(path)

##### Prediction on the test dataset

In [None]:
import csv
def writer_csv(logPath, logging):
    """Append one row to a CSV file.

    Args:
        logPath: path of the CSV file; it is opened in append mode, so the
            file is created if missing and existing content is kept.
        logging: iterable of field values written as a single row.

    Rows are terminated with a bare carriage return ('\\r').
    """
    # The context manager closes the file even if writerow() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(logPath, 'a') as f:
        w = csv.writer(f, lineterminator='\r')
        w.writerow(logging)

# `path` still holds the checkpoint of the last fold trained in the loop
# above, so these are that fold's best weights.
model.load_weights(path)
y_test_pred = model.predict([X_test])  # class scores for the test dataset
# Map the score rows back to class labels (per the original comment: numbers 0-7).
test_pred = lb.inverse_transform(y_test_pred)
testID = joblib.load('./fea_sel/test/ID.pkl')

# NOTE(review): writer_csv appends, so re-running this cell adds a second
# header and a second copy of the rows to an existing file -- confirm whether
# the file should be truncated first.
logg = './submission12.csv'
writer_csv(logg, ['id', 'emotion'])

# Numeric label -> emotion name.
emotion_by_label = {
    0: 'sadness',
    1: 'disgust',
    2: 'anticipation',
    3: 'joy',
    4: 'trust',
    5: 'anger',
    6: 'fear',
    7: 'surprise',
}

for id_, emo in zip(testID, test_pred):
    # The previous if/elif chain silently reused the emotion of the preceding
    # row (or raised NameError on the first row) for a label outside 0-7;
    # fail loudly instead of writing a wrong prediction.
    if emo not in emotion_by_label:
        raise ValueError('unexpected label {!r} for id {!r}'.format(emo, id_))
    w = emotion_by_label[emo]
    writer_csv(logg, [id_, w])