In [1]:
# Python 2.x
# How to use pre-trained embeddings:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

In [1]:
import keras
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense, Flatten, LSTM, Input, Embedding


Using TensorFlow backend.


# Preprocessing training, validation & test data.

In [2]:
# Directory holding the tab-separated utterance files.
DATA_DIR = '/home/kunst'


def _read_utterance_file(path, has_label=True):
    """Parse one tab-separated utterance file.

    Every non-blank line is expected to look like
    ``id<TAB>label<TAB>utt;utt;...`` when ``has_label`` is true, or
    ``id<TAB>utt;utt;...`` when it is false.

    Returns a tuple ``(ids, labels, texts)`` where ``ids`` and ``labels`` are
    lists of strings (``labels`` stays empty when ``has_label`` is false) and
    ``texts`` is a numpy array built from the per-line utterance lists.
    """
    ids, labels, texts = [], [], []
    text_column = 2 if has_label else 1
    # The context manager closes the file even if a malformed line raises.
    with open(path, 'r') as handle:
        for line in handle:
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split("\t")
            ids.append(fields[0])
            if has_label:
                labels.append(fields[1])
            utterances = fields[text_column].split(";")
            # The last utterance still carries the line terminator.
            utterances[-1] = utterances[-1].strip()
            texts.append(utterances)
    return ids, labels, np.array(texts)


train_ids, train_class, train_texts = _read_utterance_file(DATA_DIR + '/utterances.train')
valid_ids, valid_class, valid_texts = _read_utterance_file(DATA_DIR + '/utterances.valid')
# The test file has no label column, so the returned label list is discarded.
test_ids, _, test_texts = _read_utterance_file(DATA_DIR + '/utterances.test', has_label=False)

# Creating output classes

In [3]:
# Class names, derived from the training labels; predictions are mapped back
# to names through this array later on.
classification = np.unique(train_class)

# One-hot encode the labels. The binarizer is fitted on the training labels
# only; the validation labels are encoded with the SAME fitted mapping.
# Re-fitting on the validation set (fit_transform) could reorder or drop
# columns whenever the validation set does not contain exactly the same
# classes, silently misaligning y_valid with y_train.
lb = LabelBinarizer()
y_train = lb.fit_transform(train_class)
y_valid = lb.transform(valid_class)

# Creating input.

In [4]:
MAX_WORD_COUNT = 1000
MAX_SEQUENCE_LENGTH = 100

# Build the vocabulary from every utterance of the training and validation sets.
# NOTE(review): per the Keras Tokenizer docs, num_words limits the indices
# produced by texts_to_sequences, while word_index still lists every word seen
# -- confirm that a cap of 1000 is intended.
tokenizer = Tokenizer(num_words=MAX_WORD_COUNT)
corpus = train_texts.flatten().tolist() + valid_texts.flatten().tolist()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index

# Inputs with all four utterances: each sample becomes one padded id matrix
# holding one row per utterance.
x_train = np.array([
    pad_sequences(tokenizer.texts_to_sequences(dialogue), maxlen=MAX_SEQUENCE_LENGTH)
    for dialogue in train_texts
])
x_valid = np.array([
    pad_sequences(tokenizer.texts_to_sequences(dialogue), maxlen=MAX_SEQUENCE_LENGTH)
    for dialogue in valid_texts
])
x_test = np.array([
    pad_sequences(tokenizer.texts_to_sequences(dialogue), maxlen=MAX_SEQUENCE_LENGTH)
    for dialogue in test_texts
])

# Inputs with only the last utterance (row 3 of every sample).
x_train_2 = np.array([padded[3] for padded in x_train])
x_valid_2 = np.array([padded[3] for padded in x_valid])
x_test_2 = np.array([padded[3] for padded in x_test])

# Compute an index mapping words to known embeddings, by parsing the data dump of pre-trained embeddings

In [5]:
# Map every word of the pre-trained dump to its coefficient vector.
# Each line is expected to be "<word> <coef> <coef> ..." separated by whitespace.
embeddings_index = {}
# The context manager closes the dump even if a line fails to parse.
with open('word2vec_embeddings.txt', 'r') as w2v:
    for line in w2v:
        values = line.split()
        if not values:
            # A blank line has no word token; skip it instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Computing embedding matrix.

In [6]:
EMBEDDING_DIM = 300

# One row per tokenizer index (plus one extra row); rows that are never
# assigned below stay all-zero.
# NOTE(review): presumably row 0 is the padding index -- confirm against the tokenizer.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Only accept vectors of the expected length: a shorter vector (for
    # example one parsed from a header line of the dump) would otherwise be
    # broadcast silently across the whole row.
    if embedding_vector is not None and embedding_vector.shape == (EMBEDDING_DIM,):
        embedding_matrix[i] = embedding_vector
    else:
        # Words without a usable pre-trained vector get a small random vector.
        embedding_matrix[i] = np.random.uniform(-0.25, 0.25, EMBEDDING_DIM)

# Models

#### Dense

In [15]:
from keras.layers import Dropout

# Feed-forward classifier over a single padded utterance: the embedded
# sequence is flattened and passed through four 200-unit hidden layers
# (alternating sigmoid / relu, with decreasing dropout) before the softmax
# layer that has one unit per class.
modelDense = Sequential([
    Embedding(len(word_index) + 1, EMBEDDING_DIM,
              input_length=MAX_SEQUENCE_LENGTH, weights=[embedding_matrix]),
    Flatten(),
    Dense(units=200, kernel_initializer='random_uniform', activation='sigmoid'),
    Dropout(0.3),
    Dense(units=200, kernel_initializer='random_uniform', activation='relu'),
    Dropout(0.2),
    Dense(units=200, kernel_initializer='random_uniform', activation='sigmoid'),
    Dropout(0.1),
    Dense(units=200, kernel_initializer='random_uniform', activation='relu'),
    Dense(units=len(classification), kernel_initializer='random_uniform', activation='softmax'),
])
modelDense.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
modelDense.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 300)          7041300   
_________________________________________________________________
flatten_5 (Flatten)          (None, 30000)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 200)               6000200   
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_11 (Dense)             (None, 200)               40200     
__________

#### RNN

In [22]:
from keras.layers import SimpleRNN, Dropout

# Recurrent classifier over a single padded utterance. The embedding weights
# are frozen (trainable=False). Four stacked SimpleRNN layers all return the
# full sequence, so the Dense layers below are applied per time step and the
# result is flattened just before the softmax layer.
modelRNN = Sequential()
modelRNN.add(Embedding(len(word_index) + 1,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, weights=[embedding_matrix], trainable=False))
modelRNN.add(SimpleRNN(units=100, return_sequences=True, activation='relu'))
modelRNN.add(SimpleRNN(units=100, return_sequences=True, activation='relu'))
modelRNN.add(SimpleRNN(units=100, return_sequences=True, activation='relu'))
modelRNN.add(SimpleRNN(units=100, return_sequences=True, activation='relu'))
modelRNN.add(Dense(units=100, kernel_initializer='random_uniform', activation='relu'))
modelRNN.add(Dropout(0.2))
# This layer belongs to modelRNN; adding it to modelDense (as before) appended
# a stray layer to the already compiled dense model and left this stack
# without its second Dense layer.
modelRNN.add(Dense(units=100, kernel_initializer='random_uniform', activation='relu'))
modelRNN.add(Dropout(0.2))
modelRNN.add(Flatten())
# One output unit per class, derived from the data instead of a hard-coded 31.
modelRNN.add(Dense(units=len(classification), kernel_initializer='random_uniform', activation='softmax'))
modelRNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
modelRNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 100, 300)          7041300   
_________________________________________________________________
simple_rnn_9 (SimpleRNN)     (None, 100, 100)          40100     
_________________________________________________________________
simple_rnn_10 (SimpleRNN)    (None, 100, 100)          20100     
_________________________________________________________________
simple_rnn_11 (SimpleRNN)    (None, 100, 100)          20100     
_________________________________________________________________
simple_rnn_12 (SimpleRNN)    (None, 100, 100)          20100     
_________________________________________________________________
dense_16 (Dense)             (None, 100, 100)          10100     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 100)          0         
__________

#### LSTM - RNN

In [18]:
from keras.models import Model
from keras.layers import LSTM, Input

# Input layer: one padded sequence of token ids per sample. Token ids are
# integers, so the input is declared as int32 (the Embedding layer looks rows
# up by index).
input_x = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding layer, initialised with the pre-trained embedding matrix
embeddings = Embedding(len(word_index) + 1,EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH, weights=[embedding_matrix])(input_x)

# LSTM over the embedded sequence; return_sequences is left at its default,
# so only one 100-unit vector per sample is passed on
lstm = LSTM(units=100, activation='tanh')(embeddings)

# Hidden dense layer
dense0 = Dense(units=100, kernel_initializer='random_uniform', activation='relu')(lstm)

# Softmax over the classes
dense = Dense(units=len(classification), kernel_initializer='random_uniform', activation='softmax')(dense0)

# Defining model
modelLSTM = Model(input_x, dense)

# Model compilation, Adam optimizer
modelLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

modelLSTM.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 100, 300)          7041300   
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_15 (Dense)             (None, 31)                3131      
Total params: 7,204,831
Trainable params: 163,531
Non-trainable params: 7,041,300
_________________________________________________________________


#### CNN

In [7]:
from keras.models import Model
from keras.layers import Input, Embedding, Conv2D, MaxPooling2D, Flatten, Reshape
from keras.layers.merge import Concatenate

# Input layer: four padded utterances of token ids per sample. Token ids are
# integers, so the input is declared as int32.
input_x = Input(shape=(4,MAX_SEQUENCE_LENGTH), dtype='int32')

# Embedding layer, initialised with the pre-trained embedding matrix
embeddings = Embedding(len(word_index) + 1,EMBEDDING_DIM, input_shape=(4,MAX_SEQUENCE_LENGTH), weights=[embedding_matrix])(input_x)

num_filters = 100
# One convolution branch per kernel size, all reading the same embeddings.
kernel_sizes = (2, 4)

pooled_outputs = []
for kernel_size in kernel_sizes:
    conv = Conv2D(num_filters, kernel_size=kernel_size, activation='tanh')(embeddings)
    # The pooling window covers the whole first spatial axis of the conv output.
    # NOTE(review): this pools across the utterance axis, not the token axis --
    # confirm that this is the intended max-pooling direction.
    pooling = MaxPooling2D(pool_size=(int(conv.shape[1]), 1))(conv)
    pooled_outputs.append(Flatten()(pooling))

# Concatenation of maxpooling outputs
h_pool = Concatenate()(pooled_outputs)

# Dense
dense = Dense(units=100, kernel_initializer='random_uniform', activation='relu')(h_pool)

# Softmax
output = Dense(len(classification), activation='softmax')(dense)

# Model
modelCNN = Model(input_x, output)

# Model compilation, Adam optimizer
modelCNN.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

modelCNN.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 4, 100)       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 100, 300)  7041300     input_1[0][0]                    
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 3, 99, 100)   120100      embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 1, 97, 100)   480100      embedding_1[0][0]                
__________________________________________________________________________________________________
max_poolin

# Training

#### Dense

In [17]:
# Train the dense model on the last-utterance inputs; validation metrics are
# recorded after every epoch in the returned history object.
history = modelDense.fit(
    x_train_2, y_train,
    validation_data=(x_valid_2, y_valid),
    epochs=5,
    batch_size=4000)
# Score the trained model on the validation set; index 1 holds the accuracy
# metric requested at compile time.
scores = modelDense.evaluate(x_valid_2, y_valid, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)


Train on 196502 samples, validate on 20000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 48.61%


#### RNN

In [23]:
# Train the recurrent model on the last-utterance inputs (no per-epoch validation).
modelRNN.fit(
    x_train_2, y_train,
    epochs=5,
    batch_size=4000)
# Score the trained model on the validation set; index 1 holds the accuracy
# metric requested at compile time.
scores = modelRNN.evaluate(x_valid_2, y_valid, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 53.11%


#### LSTM

In [20]:
# Train the LSTM model on the last-utterance inputs (no per-epoch validation).
modelLSTM.fit(
    x_train_2, y_train,
    epochs=5,
    batch_size=4000)
# Score the trained model on the validation set; index 1 holds the accuracy
# metric requested at compile time.
scores = modelLSTM.evaluate(x_valid_2, y_valid, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 57.02%


#### CNN

In [9]:
# Train the CNN on the four-utterance inputs; validation metrics are recorded
# after every epoch in the returned history object.
history = modelCNN.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    epochs=3,
    batch_size=1024)
# Score the trained model on the validation set; index 1 holds the accuracy
# metric requested at compile time.
scores = modelCNN.evaluate(x_valid, y_valid, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

Train on 196502 samples, validate on 20000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 58.53%


# Prediction

In [10]:
# Run the CNN on the four-utterance test inputs; the argmax over axis 1 of the
# result is later mapped to a class name.
prediction = modelCNN.predict(x_test)

# Writing the test result file

In [12]:
# Map every prediction row to the name of its highest-scoring class.
max_pos = np.argmax(prediction, axis=1)
labels = classification[max_pos]

# Every test id needs exactly one predicted label.
if len(test_ids) != len(labels):
    raise ValueError("number of predictions (%d) does not match number of test ids (%d)"
                     % (len(labels), len(test_ids)))

# Write one "<id>\t<label>" line per test sample. The context manager closes
# the file even if a write fails, and the ids are iterated directly instead of
# being converted to a numpy array on every loop iteration.
filename = "2693266_Hoshaber_Topic1_result.txt"
with open(filename, 'w') as result_file:
    for test_id, label in zip(test_ids, labels):
        result_file.write(str(test_id) + "\t" + str(label) + "\n")
