In [0]:
import random
import tensorflow as tf
import argparse
import zipfile
import sklearn.metrics
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
import datetime

log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

emotions = ["anger", "anticipation", "disgust", "fear", "joy", "love",
            "optimism", "pessimism", "sadness", "surprise", "trust"]
emotion_to_int = {"0": 0, "1": 1, "NONE": -1}
%load_ext tensorboard
callbacks=[EarlyStopping(patience=3, restore_best_weights=True)]
file1='/content/2018-E-c-En-train.txt'
file2='/content/2018-E-c-En-dev.txt'
train_data=pd.read_csv(file1, sep="\t", header=0)
dev_data=pd.read_csv(file2, sep="\t", header=0)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [0]:
# --- Text preprocessing: split text/labels, tokenise, pad to a fixed length ---

# Embedding width and padding/truncation settings shared by all models.
embedding_dim = 100
max_length = 35
trunc_type = 'post'
padding_type = 'post'
# NOTE(review): oov_tok is defined but never passed to Tokenizer() below, so
# words unseen during fitting are skipped rather than mapped to this token.
oov_tok = "<OOV>"

num_sentences = len(train_data)
num_val_sentences = len(dev_data)

# Column 1 is used as the input text and columns 2..12 as the eleven label
# flags. Pulling whole columns at once avoids one pandas lookup per row and
# yields plain numeric lists instead of object-dtype row Series, so
# np.array(labels) becomes a numeric (n, 11) array.
sentences = train_data.iloc[:, 1].tolist()
labels = train_data.iloc[:, 2:13].values.tolist()
val_sen = dev_data.iloc[:, 1].tolist()
val_lab = dev_data.iloc[:, 2:13].values.tolist()

# [text, labels] pairs, kept for any code that works on the paired corpus.
corpus_train = [[text, flags] for text, flags in zip(sentences, labels)]
corpus_val = [[text, flags] for text, flags in zip(val_sen, val_lab)]
# random.shuffle(corpus_train)

# The vocabulary is learned from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
vocab_size = len(word_index)

# Integer-encode, then pad/truncate every sentence to max_length tokens.
sequences = tokenizer.texts_to_sequences(sentences)
training_sequences = pad_sequences(
    sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)

val_seq = tokenizer.texts_to_sequences(val_sen)
val_sequences = pad_sequences(
    val_seq, maxlen=max_length, padding=padding_type, truncating=trunc_type
)


In [0]:
# Note this is the 100 dimension version of GloVe from Stanford
# I unzipped to make this notebook easier
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt \
    -O /tmp/glove.6B.100d.txt
embeddings_index = {};
with open('/tmp/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split();
        word = values[0];
        coefs = np.asarray(values[1:], dtype='float32');
        embeddings_index[word] = coefs;

embeddings_matrix = np.zeros((vocab_size+1, embedding_dim));
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word);
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector;

--2019-12-04 05:07:52--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/glove.6B.100d.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 108.177.126.128, 2a00:1450:4013:c05::80
Connecting to storage.googleapis.com (storage.googleapis.com)|108.177.126.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 347116733 (331M) [text/plain]
Saving to: ‘/tmp/glove.6B.100d.txt’


2019-12-04 05:07:57 (72.5 MB/s) - ‘/tmp/glove.6B.100d.txt’ saved [347116733/347116733]



In [0]:
# Model 1: two wide, valid-padded Conv1D + max-pool stages over the
# non-trainable embedding, averaged over time, then a large dense layer and an
# 11-unit sigmoid output (one unit per emotion column).
label = np.array(labels)
val_labs = np.array(val_lab)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size + 1,
    embedding_dim,
    input_length=max_length,
    weights=[embeddings_matrix],
    trainable=False,
))
model.add(tf.keras.layers.Dropout(.4))
model.add(tf.keras.layers.Conv1D(6400, kernel_size=3, activation='relu', padding='valid'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=3))
model.add(tf.keras.layers.Conv1D(6400, kernel_size=3, activation='relu', padding='valid'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=3))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(12800, activation='relu'))
model.add(tf.keras.layers.Dropout(.40))
model.add(tf.keras.layers.Dense(11, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
#model.summary()

# Upper bound on epochs; the shared callbacks list decides when to stop.
num_epochs = 150
history = model.fit(
    training_sequences,
    label,
    batch_size=64,
    epochs=num_epochs,
    validation_data=(val_sequences, val_labs),
    verbose=1,
    callbacks=callbacks,
)

In [0]:
# Score the dev set and turn the per-emotion scores into 0/1 flags at 0.4.
dev_prediction = model.predict(val_sequences)
dev_predictions = pd.DataFrame(
    (dev_prediction > 0.4).astype(float), columns=emotions
)

# Sample-averaged Jaccard index, computed directly with NumPy because
# sklearn.metrics.jaccard_similarity_score no longer exists in current
# scikit-learn releases. As in that function, a row where both the gold and the
# predicted label sets are empty scores 1.
gold_flags = dev_data[emotions].values.astype(bool)
pred_flags = dev_predictions[emotions].values.astype(bool)
overlap = (gold_flags & pred_flags).sum(axis=1)
union = (gold_flags | pred_flags).sum(axis=1)
sample_jaccard = np.where(union == 0, 1.0, overlap / np.maximum(union, 1))

print("accuracy: {:.3f}".format(sample_jaccard.mean()))
dev_predictions

In [0]:
# Model 2: three same-padded Conv1D + max-pool stages (512 -> 256 -> 64
# filters) over the non-trainable embedding, followed by a bidirectional GRU
# and an 11-unit sigmoid output.
label = np.array(labels)
val_labs = np.array(val_lab)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(
    vocab_size + 1,
    embedding_dim,
    input_length=max_length,
    weights=[embeddings_matrix],
    trainable=False,
))
# Disabled first stage, kept for reference:
# tf.keras.layers.Conv1D(256, kernel_size=3, padding='same', activation='relu'),
# tf.keras.layers.MaxPooling1D(pool_size=2),
# tf.keras.layers.BatchNormalization(axis=-1),
model.add(tf.keras.layers.Conv1D(512, kernel_size=3, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model.add(tf.keras.layers.Dropout(.15))
model.add(tf.keras.layers.Conv1D(256, kernel_size=3, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model.add(tf.keras.layers.Dropout(.15))
model.add(tf.keras.layers.Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(64, dropout=.2, recurrent_dropout=.2)
))
model.add(tf.keras.layers.Dense(11, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
#model.summary()

# Upper bound on epochs; the shared callbacks list decides when to stop.
num_epochs = 50
history = model.fit(
    training_sequences,
    label,
    batch_size=64,
    epochs=num_epochs,
    validation_data=(val_sequences, val_labs),
    verbose=1,
    callbacks=callbacks,
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 6838 samples, validate on 886 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50


In [0]:
# Score the dev set and turn the per-emotion scores into 0/1 flags at 0.33.
dev_prediction = model.predict(val_sequences)
dev_predictions = pd.DataFrame(
    (dev_prediction > 0.33).astype(float), columns=emotions
)

# Sample-averaged Jaccard index, computed directly with NumPy because
# sklearn.metrics.jaccard_similarity_score no longer exists in current
# scikit-learn releases. As in that function, a row where both the gold and the
# predicted label sets are empty scores 1.
gold_flags = dev_data[emotions].values.astype(bool)
pred_flags = dev_predictions[emotions].values.astype(bool)
overlap = (gold_flags & pred_flags).sum(axis=1)
union = (gold_flags | pred_flags).sum(axis=1)
sample_jaccard = np.where(union == 0, 1.0, overlap / np.maximum(union, 1))

print("accuracy: {:.3f}".format(sample_jaccard.mean()))
dev_predictions

accuracy: 0.519




Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
881,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
882,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
883,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
884,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [0]:
# Build the submission table: the dev file with its emotion columns replaced
# by the model's 0/1 predictions.
# Rebinding (instead of assigning through [:]) makes sure the frame really
# ends up with integer columns.
dev_predictions = dev_predictions.astype(int)
dev_res_data = pd.read_csv(file2, sep="\t", header=0)
# Write by column name so all eleven emotion columns, including the first one,
# receive predictions. A positional slice starting at column 3 begins one
# column too late, because the label columns start at position 2.
dev_res_data[emotions] = dev_predictions[emotions].values
dev_res_data

In [0]:
# dev_predictions['ID']=dev_data.ID
# dev_predictions['Tweet']=dev_data.Tweet
# dev_predictions
# cols = dev_predictions.columns.tolist()
# cols = cols[-1:] + cols[:-1]
# cols = cols[-1:] + cols[:-1]
# cols
# dev_predictions = dev_predictions[cols]
# dev_predictions = dev_predictions[cols]
# dev_predictions

In [0]:
# Dump the prediction table as a tab-separated file (no index column) and
# wrap that file in submission.zip.
prediction_file = "E-C_en_pred.txt"
dev_res_data.to_csv(prediction_file, index=False, sep="\t")

submission_zip = zipfile.ZipFile('submission.zip', mode='w')
with submission_zip:
    submission_zip.write(prediction_file)

In [0]:
#%tensorboard --logdir logs/fit


In [0]:
# Model 3: trainable embedding -> Conv1D -> max-pool -> GRU -> dense head,
# with an 11-unit sigmoid output.

# random.seed on its own only seeds Python's stdlib generator. NumPy and
# TensorFlow keep separate random state, so they are seeded as well.
# tf.compat.v1.set_random_seed is used because it is available under both
# TensorFlow 1.x and 2.x.
random.seed(123)
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

label = np.array(labels)
val_labs = np.array(val_lab)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, input_length=max_length,
                              weights=[embeddings_matrix], trainable=True),
    tf.keras.layers.Conv1D(128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(0.2),
    # tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128,dropout=.15,recurrent_dropout=.15,return_sequences=True)),
    tf.keras.layers.GRU(32, dropout=.2, recurrent_dropout=.15),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(11, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.001),
              metrics=['acc'])
#model.summary()

# Upper bound on epochs; the shared callbacks list decides when to stop.
num_epochs = 50
history = model.fit(training_sequences, label, batch_size=128, epochs=num_epochs,
                    validation_data=(val_sequences, val_labs), verbose=1,
                    callbacks=callbacks)


Train on 6838 samples, validate on 886 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50


In [0]:
# Score the dev set and turn the per-emotion scores into 0/1 flags at 0.35.
dev_prediction = model.predict(val_sequences)
dev_predictions = pd.DataFrame(
    (dev_prediction > 0.35).astype(float), columns=emotions
)

# Sample-averaged Jaccard index, computed directly with NumPy because
# sklearn.metrics.jaccard_similarity_score no longer exists in current
# scikit-learn releases. As in that function, a row where both the gold and the
# predicted label sets are empty scores 1.
gold_flags = dev_data[emotions].values.astype(bool)
pred_flags = dev_predictions[emotions].values.astype(bool)
overlap = (gold_flags & pred_flags).sum(axis=1)
union = (gold_flags | pred_flags).sum(axis=1)
sample_jaccard = np.where(union == 0, 1.0, overlap / np.maximum(union, 1))
print("accuracy: {:.3f}".format(sample_jaccard.mean()))

# Build the submission table: the dev file with its emotion columns replaced
# by integer predictions. Columns are addressed by name so all eleven are
# overwritten; a positional slice starting at column 3 begins one column too
# late, because the label columns start at position 2.
dev_predictions = dev_predictions.astype(int)
dev_res_data = pd.read_csv(file2, sep="\t", header=0)
dev_res_data[emotions] = dev_predictions[emotions].values

# Write the tab-separated prediction file and wrap it in submission.zip.
dev_res_data.to_csv("E-C_en_pred.txt", sep="\t", index=False)
with zipfile.ZipFile('submission.zip', mode='w') as submission_zip:
    submission_zip.write("E-C_en_pred.txt")

accuracy: 0.463




In [0]:
 model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Conv1D(256, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Dropout(.15),
    tf.keras.layers.Conv1D(128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(128, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32,dropout=.3,recurrent_dropout=.1)),
    tf.keras.layers.Dense(11, activation='sigmoid')
    ])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
#model.summary()
num_epochs =50
history = model.fit(training_sequences, label,batch_size=128, epochs=num_epochs, validation_data=(val_sequences, val_labs), verbose=1,callbacks=callbacks)


Train on 6838 samples, validate on 886 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50


In [0]:
# Score the dev set and turn the per-emotion scores into 0/1 flags at 0.32.
dev_prediction = model.predict(val_sequences)
dev_predictions = pd.DataFrame(
    (dev_prediction > 0.32).astype(float), columns=emotions
)

# Sample-averaged Jaccard index, computed directly with NumPy because
# sklearn.metrics.jaccard_similarity_score no longer exists in current
# scikit-learn releases. As in that function, a row where both the gold and the
# predicted label sets are empty scores 1.
gold_flags = dev_data[emotions].values.astype(bool)
pred_flags = dev_predictions[emotions].values.astype(bool)
overlap = (gold_flags & pred_flags).sum(axis=1)
union = (gold_flags | pred_flags).sum(axis=1)
sample_jaccard = np.where(union == 0, 1.0, overlap / np.maximum(union, 1))

print("accuracy: {:.3f}".format(sample_jaccard.mean()))


accuracy: 0.533




In [0]:
# Build the submission table: the dev file with its emotion columns replaced
# by integer predictions.
# Rebinding (instead of assigning through [:]) makes sure the frame really
# ends up with integer columns.
dev_predictions = dev_predictions.astype(int)
dev_res_data = pd.read_csv(file2, sep="\t", header=0)
# Write by column name so all eleven emotion columns, including the first one,
# receive predictions. A positional slice starting at column 3 begins one
# column too late, because the label columns start at position 2.
dev_res_data[emotions] = dev_predictions[emotions].values

# Write the tab-separated prediction file and wrap it in submission.zip.
dev_res_data.to_csv("E-C_en_pred.txt", sep="\t", index=False)
with zipfile.ZipFile('submission.zip', mode='w') as submission_zip:
    submission_zip.write("E-C_en_pred.txt")

In [0]:
# Model 5: non-trainable embedding -> Conv1D -> max-pool -> bidirectional LSTM
# -> dense head, with an 11-unit sigmoid output.

# random.seed on its own only seeds Python's stdlib generator. NumPy and
# TensorFlow keep separate random state, so they are seeded as well.
# tf.compat.v1.set_random_seed is used because it is available under both
# TensorFlow 1.x and 2.x.
random.seed(123)
np.random.seed(123)
tf.compat.v1.set_random_seed(123)

label = np.array(labels)
val_labs = np.array(val_lab)
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size + 1, embedding_dim, input_length=max_length,
                              weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Conv1D(160, kernel_size=3, padding='same', activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(120, dropout=.2, recurrent_dropout=.2)),
    #tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(8),input_shape=(10,16)),
    #tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(80, activation='relu'),
    tf.keras.layers.Dropout(0.15),
    tf.keras.layers.Dense(11, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.Adam(0.001),
              metrics=['acc'])
#model.summary()

# Upper bound on epochs; the shared callbacks list decides when to stop.
num_epochs = 50
history = model.fit(training_sequences, label, batch_size=128, epochs=num_epochs,
                    validation_data=(val_sequences, val_labs), verbose=1,
                    callbacks=callbacks)


Train on 6838 samples, validate on 886 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


In [0]:
# Score the dev set and turn the per-emotion scores into 0/1 flags at 0.345.
dev_prediction = model.predict(val_sequences)
dev_predictions = pd.DataFrame(
    (dev_prediction > 0.345).astype(float), columns=emotions
)

# Sample-averaged Jaccard index, computed directly with NumPy because
# sklearn.metrics.jaccard_similarity_score no longer exists in current
# scikit-learn releases. As in that function, a row where both the gold and the
# predicted label sets are empty scores 1.
gold_flags = dev_data[emotions].values.astype(bool)
pred_flags = dev_predictions[emotions].values.astype(bool)
overlap = (gold_flags & pred_flags).sum(axis=1)
union = (gold_flags | pred_flags).sum(axis=1)
sample_jaccard = np.where(union == 0, 1.0, overlap / np.maximum(union, 1))

print("accuracy: {:.3f}".format(sample_jaccard.mean()))


accuracy: 0.533


