# RNN Classification of Israeli and Palestinian Narrative Texts

In [19]:
import time, os, pickle
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #get rid of warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.models import load_model

## 1. First, let's look at the data and process it.

Load in word2vec:

In [3]:
vocab_dicts_path = "pickles/vocab_dicts.p"
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a vocab file you produced yourself or otherwise trust.
# The `with` block closes the file handle as soon as the three dicts are loaded.
with open(vocab_dicts_path, 'rb') as vocab_file:
    word2idx, idx2word, word2vec = pickle.load(vocab_file)
# Number of entries in the word -> vector mapping.
len(word2vec)

18798

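Load the training, dev, and test data from text files. The Israeli training lines are oversampled (duplicated once, then truncated) so that both classes contribute the same number of training lines.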

In [4]:
def _read_lines(path):
    """Return all lines of the text file at `path`, trailing newlines included."""
    # `with` guarantees the handle is closed once the lines have been read.
    with open(path, 'r') as text_file:
        return text_file.readlines()

isr_train = _read_lines('processed_data/i_train.txt')
pal_train = _read_lines('processed_data/p_train.txt')
# Balance the training set by oversampling: duplicate the isr lines once, then
# trim to the pal count. This only yields equal sizes when
# len(isr) <= len(pal) <= 2 * len(isr).
isr_train = isr_train + isr_train
isr_train = isr_train[:len(pal_train)]
print("training:", len(isr_train), 'isr lines loaded and', len(pal_train), 'pal lines loaded')

isr_dev = _read_lines('processed_data/i_dev.txt')
pal_dev = _read_lines('processed_data/p_dev.txt')
print("dev:", len(isr_dev), 'isr lines loaded and', len(pal_dev), 'pal lines loaded')

isr_test = _read_lines('processed_data/i_test.txt')
pal_test = _read_lines('processed_data/p_test.txt')
print("test:", len(isr_test), 'isr lines loaded and', len(pal_test), 'pal lines loaded')

sentence_length = 50  # words kept per line when building the input matrices
word2vec_len = 300    # size of the last axis of the input matrices (one word vector)

training: 61705 isr lines loaded and 61705 pal lines loaded
dev: 150 isr lines loaded and 150 pal lines loaded
test: 150 isr lines loaded and 150 pal lines loaded


Each sentence is truncated or zero-padded to `sentence_length` (50) words, and word2vec turns it into a 50 by 300 matrix. Words that are not in the word2vec vocabulary are left as all-zero rows.

In [5]:
def get_matrix_from_lines(num_words, word2vec_len, isr_lines, pal_lines, word2vec):
    """Turn two lists of text lines into an embedding tensor and a label vector.

    Parameters:
        num_words: number of word slots per line; longer lines are truncated,
            shorter lines leave the remaining slots as zeros.
        word2vec_len: length of each word vector (size of the last axis).
        isr_lines: lines for class 0; they fill the first len(isr_lines) rows.
        pal_lines: lines for class 1; they fill the rows after the isr lines.
        word2vec: mapping from a word to its vector; words not in the mapping
            are left as all-zero vectors.

    Returns:
        (x_matrix, y_matrix) where x_matrix has shape
        (len(isr_lines) + len(pal_lines), num_words, word2vec_len) and
        y_matrix has shape (len(isr_lines) + len(pal_lines),) holding 0 for
        isr rows and 1 for pal rows.
    """
    n_isr = len(isr_lines)
    n_pal = len(pal_lines)
    x_matrix = np.zeros((n_isr+n_pal, num_words, word2vec_len))

    def _fill_rows(lines, row_offset):
        # Write the vectors for `lines` into x_matrix starting at row `row_offset`.
        for i, line in enumerate(lines):
            # Strip only a trailing newline. Slicing off the last character
            # unconditionally would eat a real letter on a line that has no
            # newline (e.g. the final line of a file).
            words = line.rstrip('\n').split(' ')[:num_words]
            for j, word in enumerate(words):
                if word in word2vec:
                    x_matrix[row_offset + i, j, :] = word2vec[word]

    #add isr lines first
    _fill_rows(isr_lines, 0)
    #then add pal lines
    _fill_rows(pal_lines, n_isr)

    y_matrix = np.zeros(n_isr+n_pal)
    y_matrix[n_isr:] = 1

    return x_matrix, y_matrix

Get the training, validation, and test data:

In [6]:
def _lines_to_matrices(isr_lines, pal_lines):
    """Vectorize one data split using the notebook-wide sentence length and vector size."""
    return get_matrix_from_lines(sentence_length, word2vec_len, isr_lines, pal_lines, word2vec)

# Rows stay in isr-then-pal order; no shuffle is applied to the arrays here.
x_train, y_train = _lines_to_matrices(isr_train, pal_train)
print("training matrix shapes:", x_train.shape, y_train.shape)

x_dev, y_dev = _lines_to_matrices(isr_dev, pal_dev)
print("dev matrix shapes:", x_dev.shape, y_dev.shape)

x_test, y_test = _lines_to_matrices(isr_test, pal_test)
print("test matrix shapes:", x_test.shape, y_test.shape)

training matrix shapes: (123410, 50, 300) (123410,)
dev matrix shapes: (300, 50, 300) (300,)
test matrix shapes: (300, 50, 300) (300,)


## 2. Build the model in Keras

In [7]:
# Two stacked bidirectional LSTM layers followed by a single sigmoid unit.
# The LSTM width reuses `sentence_length` as its unit count, so each
# bidirectional layer emits 2 * sentence_length features.
model = Sequential()
# First recurrent layer returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(sentence_length, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
model.add(Dropout(0.2))
# Second recurrent layer returns only its final output.
model.add(Bidirectional(LSTM(sentence_length, return_sequences=False)))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 50, 100)           140400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 200,901
Trainable params: 200,901
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
start = time.time()
# `epochs` is the Keras 2 name for the deprecated `nb_epoch` argument.
# Validate on the held-out dev set instead of `validation_split`: the split
# is taken from the END of the arrays before shuffling, and x_train is ordered
# isr-then-pal, so a 10% split would contain only pal (label 1) examples.
model.fit(x_train, y_train, batch_size=1024, epochs=2, validation_data=(x_dev, y_dev), shuffle=True)
print('training time : ', time.time() - start)
model.save('my_model.h5')
# To skip retraining, load the previously saved model instead:
#model = load_model('my_model.h5')



Train on 123410 samples, validate on 300 samples
Epoch 1/2

KeyboardInterrupt: 

In [28]:
def conf_to_pred(y):
    """Convert confidence scores into hard 0/1 class labels.

    Parameters:
        y: array-like of scores, any shape (e.g. (n,) or (n, 1)).

    Returns:
        A float64 array with the same shape as `y`, holding 0.0 where the
        score is below 0.5 and 1.0 everywhere else.
    """
    y = np.asarray(y)
    # Vectorized threshold. Keeping the `< 0.5 -> 0, otherwise 1` form means
    # a score of exactly 0.5 (and NaN) maps to class 1, as the loop version did.
    return np.where(y < 0.5, 0.0, 1.0)
    
def get_accuracy(model, x, y):
    """Return the accuracy of `model` on inputs `x` against true labels `y`.

    The output of `model.predict(x)` is converted to hard labels with
    `conf_to_pred` and then scored against `y` with `accuracy_score`.
    """
    predicted_labels = conf_to_pred(model.predict(x))
    return accuracy_score(y, predicted_labels)
        

0.9006

In [35]:
# Score the trained model on the held-out dev and test sets.
dev_acc = get_accuracy(model, x_dev, y_dev)
test_acc = get_accuracy(model, x_test, y_test)
# Show both numbers; the assignments alone produce no cell output.
print('dev accuracy:', dev_acc)
print('test accuracy:', test_acc)

0.78