# RNN Classification of Israeli and Palestinian Narrative Texts

In [1]:
import time, os, pickle
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #get rid of warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers import Bidirectional
from keras.models import Sequential
from keras.models import load_model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


## 1. First, let's look at the data and process it.

Load in word2vec:

In [2]:
vocab_dicts_path = "pickles/vocab_dicts.p"
# NOTE(review): unpickling can execute arbitrary code -- only load a pickle you produced or trust.
# The `with` block closes the file handle as soon as the three lookups are unpacked.
with open(vocab_dicts_path, 'rb') as vocab_file:
    word2idx, idx2word, word2vec = pickle.load(vocab_file)
len(word2vec)

18798

Load in training, dev, and test data from text files. 

In [3]:
def _read_lines(path):
    """Return every line of the text file at `path` (line endings kept) and close the file."""
    with open(path, 'r') as text_file:
        return text_file.readlines()


isr_train = _read_lines('processed_data/i_train.txt')
pal_train = _read_lines('processed_data/p_train.txt')
# Balance the training set: repeat the isr lines as many times as needed to reach the
# pal count (ceiling division; the max() guards against an empty isr file), then cut
# the repeated list down to exactly len(pal_train) lines.
n_repeats = max(1, -(-len(pal_train) // max(1, len(isr_train))))
isr_train = (isr_train * n_repeats)[:len(pal_train)]
print("training:", len(isr_train), 'isr lines loaded and', len(pal_train), 'pal lines loaded')

isr_dev = _read_lines('processed_data/i_dev.txt')
pal_dev = _read_lines('processed_data/p_dev.txt')
print("dev:", len(isr_dev), 'isr lines loaded and', len(pal_dev), 'pal lines loaded')

isr_test = _read_lines('processed_data/i_test.txt')
pal_test = _read_lines('processed_data/p_test.txt')
print("test:", len(isr_test), 'isr lines loaded and', len(pal_test), 'pal lines loaded')

sentence_length = 50  # number of word slots kept per line (longer lines are cut, shorter ones zero-padded)
word2vec_len = 300    # length of each word vector stored in word2vec

training: 61705 isr lines loaded and 61705 pal lines loaded
dev: 150 isr lines loaded and 150 pal lines loaded
test: 150 isr lines loaded and 150 pal lines loaded


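Each sentence is truncated or zero-padded to `sentence_length` = 50 words, and each word is replaced by its 300-dimensional word2vec vector (words missing from word2vec stay all-zero), giving a 50 by 300 matrix per sentence.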

In [4]:
def get_matrix_from_lines(num_words, word2vec_len, isr_lines, pal_lines, word2vec):
    """Embed isr and pal lines into one zero-padded array and build matching labels.

    Args:
        num_words: number of word slots per line; extra words are dropped and
            unused slots stay zero.
        word2vec_len: length of each word vector.
        isr_lines: sequence of space-separated text lines labelled 0.
        pal_lines: sequence of space-separated text lines labelled 1.
        word2vec: mapping from word to a vector of length `word2vec_len`;
            words not in the mapping leave their slot as zeros.

    Returns:
        (x_matrix, y_matrix): x_matrix has shape
        (len(isr_lines) + len(pal_lines), num_words, word2vec_len) with the isr
        rows first; y_matrix is a 1-D array holding 0 for isr rows and 1 for
        pal rows.
    """
    n_isr = len(isr_lines)
    all_lines = list(isr_lines) + list(pal_lines)  # isr rows first, then pal rows
    x_matrix = np.zeros((len(all_lines), num_words, word2vec_len))

    for row, line in enumerate(all_lines):
        # Drop only a real line terminator: a blind line[:-1] would cut the last
        # character off a line that has no trailing newline (e.g. the end of a file).
        text = line[:-1] if line.endswith('\n') else line
        words = text.split(' ')[:num_words]
        for col, word in enumerate(words):
            if word in word2vec:
                x_matrix[row, col, :] = word2vec[word]

    y_matrix = np.zeros(len(all_lines))
    y_matrix[n_isr:] = 1

    return x_matrix, y_matrix

Get the training, validation, and test data:

In [5]:
# Arguments shared by all three splits; only the isr/pal line lists differ.
embed_args = dict(num_words=sentence_length, word2vec_len=word2vec_len, word2vec=word2vec)

# NOTE: rows are left in class order (all isr rows first, then all pal rows);
# no explicit shuffle is applied here.
x_train, y_train = get_matrix_from_lines(isr_lines=isr_train, pal_lines=pal_train, **embed_args)
print("training matrix shapes:", x_train.shape, y_train.shape)

x_dev, y_dev = get_matrix_from_lines(isr_lines=isr_dev, pal_lines=pal_dev, **embed_args)
print("dev matrix shapes:", x_dev.shape, y_dev.shape)

x_test, y_test = get_matrix_from_lines(isr_lines=isr_test, pal_lines=pal_test, **embed_args)
print("test matrix shapes:", x_test.shape, y_test.shape)

training matrix shapes: (123410, 50, 300) (123410,)
dev matrix shapes: (300, 50, 300) (300,)
test matrix shapes: (300, 50, 300) (300,)


## 2. Build the model in Keras

In [6]:
# Two stacked bidirectional LSTMs followed by a single sigmoid unit for the
# binary isr (0) / pal (1) decision.
model = Sequential()
# The LSTM width is set to sentence_length units per direction; return_sequences=True
# passes the whole sequence on to the second recurrent layer.
model.add(Bidirectional(LSTM(sentence_length, return_sequences=True), input_shape=(sentence_length, word2vec_len)))
model.add(Dropout(0.2))
# The second layer returns only its final output, which feeds the classifier unit.
model.add(Bidirectional(LSTM(sentence_length, return_sequences=False)))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 50, 100)           140400    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 100)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 100)               60400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 200,901
Trainable params: 200,901
Non-trainable params: 0
_________________________________________________________________
None


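Note that since there are many augmented versions of each line, AND we use a sliding window in the data generation, the validation accuracy reported during training (computed on the `validation_split` hold-out) isn't really valid: lines that are near-duplicates of the held-out ones remain in the training portion. That's why we have independent dev and test sets.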

In [7]:
start = time.time()
# `epochs` is the Keras 2 name for the number of passes over the data (`nb_epoch` is the old Keras 1 spelling).
# NOTE(review): Keras takes the validation_split fraction from the END of x_train/y_train
# before shuffling; since the rows are ordered isr-then-pal, the held-out 3% presumably
# contains only pal lines -- confirm before relying on the reported val accuracy.
model.fit(x_train, y_train, batch_size=1024, epochs=2, validation_split=0.03, shuffle=True)
print('training time : ', time.time() - start)
model.save('my_model.h5')
# To skip retraining, reload the saved network instead of running the fit above:
#model = load_model('my_model.h5')



Train on 119707 samples, validate on 3703 samples
Epoch 1/2
Epoch 2/2
training time :  801.0629768371582


## 3. Testing phase

Test the model on independent development and test sets:

In [29]:
def conf_to_pred(y):
    """Turn confidence scores into hard 0/1 class labels.

    Args:
        y: array-like of scores, any shape.

    Returns:
        A float64 array of the same shape as `y`, holding 0.0 where the score
        is below 0.5 and 1.0 everywhere else.
    """
    scores = np.asarray(y)
    # Vectorised threshold: works element-wise for any shape (the row-by-row
    # `if y[i] < 0.5` form raises on rows with more than one column).
    return np.where(scores < 0.5, 0.0, 1.0)
    
def get_accuracy(model, x, y):
    """Return the accuracy of `model` on inputs `x` against the true labels `y`.

    The scores from `model.predict(x)` are converted to 0/1 labels with
    `conf_to_pred` and then compared with `y` via `accuracy_score`.
    """
    predicted_labels = conf_to_pred(model.predict(x))
    return accuracy_score(y, predicted_labels)

# Score the trained model on both held-out sets, then report the two accuracies.
dev_acc = get_accuracy(model, x_dev, y_dev)
test_acc = get_accuracy(model, x_test, y_test)
print("dev_acc", dev_acc)
print("test_acc", test_acc)



dev_acc 0.8066666666666666
test_acc 0.83


Make up your own sentence and test it!

In [62]:
def test_sentence(input_sentence, word2vec, num_words, word2vec_len):
    
    words = input_sentence.split(" ")
    x = np.zeros((1, num_words, word2vec_len))
    for j, word in enumerate(words):
            if word.lower() in word2vec:
                x[0, j, :] = word2vec[word.lower()]
    
    y_predict = model.predict(x)
    return y_predict
    
# Hand-picked isr passages to score with the trained model.
isr_examples = [
    'the terrorists who used its territory as a base from which to launch strikes at israeli soldiers and civilians injuring many israel retaliated by attacking the areas where the attacks the idf retaliation attacks in the village of samua originated hoping to put a stop',
    'shooting incidents between the idf and the arab armies the confrontations resulted mainly from activities initiated by both sides in areas that had been demilitarized as a result of the armistice agreements at first the idf emerged as the weaker party to the conflict one',
    'children were expelled from their schools some months after the nazis came to power they organized a book burning event in which books by jewish authors were burned including works by famous writers such as sigmund freud karl marx and albert einstein next to come',
    'the twentieth century cannot be told without reference to the shoah holocaust as its influence on the jews in the land of israel and around the world was and remains paramount jews are moved into the ghettos about three weeks into world war ii poland',
    'to the arabs established schools and helped raise the literacy rate among the arab population the percentage of illiterates dropped from percent to percent at the start of the mandatory era approximately students were attending arab public schools whereas in just before the british mandate',
]

# Print "<score> <sentence>" for each passage, followed by a blank separator line.
for isr_example in isr_examples:
    isr_score = test_sentence(isr_example, word2vec, sentence_length, word2vec_len)[0][0]
    print(isr_score, isr_example, '\n')

# Score the first ten pal dev lines in the same way.
pal_examples = pal_dev[:10]
for pal_example in pal_examples:
    pal_score = test_sentence(pal_example, word2vec, sentence_length, word2vec_len)[0][0]
    print(pal_score, pal_example)


0.083140954 the terrorists who used its territory as a base from which to launch strikes at israeli soldiers and civilians injuring many israel retaliated by attacking the areas where the attacks the idf retaliation attacks in the village of samua originated hoping to put a stop 

0.0019222595 shooting incidents between the idf and the arab armies the confrontations resulted mainly from activities initiated by both sides in areas that had been demilitarized as a result of the armistice agreements at first the idf emerged as the weaker party to the conflict one 

0.26086769 children were expelled from their schools some months after the nazis came to power they organized a book burning event in which books by jewish authors were burned including works by famous writers such as sigmund freud karl marx and albert einstein next to come 

0.0016804785 the twentieth century cannot be told without reference to the shoah holocaust as its influence on the jews in the land of israel and around t