# 1. Import libraries

In [2]:
import keras
from numpy import asarray
from numpy import zeros
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
import re
import csv
import pandas
import numpy as np
from tensorflow.contrib import rnn
import tensorflow as tf
import matplotlib.pyplot as plt

In [3]:
np.set_printoptions(8, suppress=True)
#These options determine the way floating point numbers, arrays and other NumPy objects are displayed.
# 8 in first param indicates - Number of digits of precision for floating point output (default 8).
# suppressbool, optional:-
# If True, always print floating point numbers using fixed point notation, in which case numbers equal to zero in the current precision will print as zero. If False, then scientific notation is used when absolute value of the smallest number is < 1e-4 or the ratio of the maximum absolute value to the minimum is > 1e3. The default is False.

%matplotlib inline
#%matplotlib inline sets the backend of matplotlib to the 'inline' backend.

# 2. Data Import and Preprocessing

# 2.1 Loading train data

In [10]:
#Loading train data
train_file = 'data/train.csv'
train_corpus = pandas.read_csv(train_file, encoding='latin1').to_numpy()

print(train_corpus.shape)

(7613, 5)


In [12]:
#Head of train data to verify
train_corpus[:5]

array([[1, nan, nan,
        'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
        1],
       [4, nan, nan, 'Forest fire near La Ronge Sask. Canada', 1],
       [5, nan, nan,
        "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
        1],
       [6, nan, nan,
        '13,000 people receive #wildfires evacuation orders in California ',
        1],
       [7, nan, nan,
        'Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school ',
        1]], dtype=object)

In [16]:
#Tail of the train data to verify
train_corpus[-5:]

array([[10869, nan, nan,
        'Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5',
        1],
       [10870, nan, nan,
        '@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.',
        1],
       [10871, nan, nan,
        'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
        1],
       [10872, nan, nan,
        'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
        1],
       [10873, nan, nan,
        'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d',
        1]], dtype=object)

In [18]:
train_label = train_corpus[:,4] #target value (y=0 /1)
train_text = train_corpus[:,3] #id,keyword,location,text

# 2.2 Removing hyperlinks from Train data

In [19]:
#As we observed in the head and tail of the train data, we see there are many hyperlinks in between the data.
#So to simplify our process, we want to remove the hyperlinks from the data
for idx, tweet in enumerate(train_text):
    train_text[idx] = re.sub(r"http\S+", "", tweet)

In [21]:
#Verifying that the hyperlinks are removed.
train_corpus[-5:]

array([[10869, nan, nan,
        'Two giant cranes holding a bridge collapse into nearby homes ',
        1],
       [10870, nan, nan,
        '@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.',
        1],
       [10871, nan, nan, 'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. ',
        1],
       [10872, nan, nan,
        'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
        1],
       [10873, nan, nan,
        'The Latest: More Homes Razed by Northern California Wildfire - ABC News ',
        1]], dtype=object)

In [29]:
np.info(train_corpus)

class:  ndarray
shape:  (7613, 5)
strides:  (8, 60904)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x1a4a3ab000
byteorder:  little
byteswap:  False
type: object


# 2.3 Loading test data

In [46]:
#Loading test data
test_file = 'data/test.csv'
test_corpus = pandas.read_csv(test_file, encoding='latin1').to_numpy()

print(test_corpus.shape)

(3263, 4)


In [47]:
#Head of test data to verify
test_corpus[:5]

array([[0, nan, nan, 'Just happened a terrible car crash'],
       [2, nan, nan,
        'Heard about #earthquake is different cities, stay safe everyone.'],
       [3, nan, nan,
        'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all'],
       [9, nan, nan, 'Apocalypse lighting. #Spokane #wildfires'],
       [11, nan, nan, 'Typhoon Soudelor kills 28 in China and Taiwan']],
      dtype=object)

In [48]:
#Tail of the test data to verify
test_corpus[-5:]

array([[10861, nan, nan,
        'EARTHQUAKE SAFETY LOS ANGELES Â\x89Ã\x9bÃ\x92 SAFETY FASTENERS XrWn'],
       [10865, nan, nan,
        'Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power'],
       [10868, nan, nan,
        'Green Line derailment in Chicago http://t.co/UtbXLcBIuY'],
       [10874, nan, nan,
        'MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3'],
       [10875, nan, nan,
        '#CityofCalgary has activated its Municipal Emergency Plan. #yycstorm']],
      dtype=object)

In [49]:
test_label = np.zeros((3263, 1), dtype=int) #Since this is what is missing and we need to predict, we are initiating it with zeroes
test_text = test_corpus[:,3] #id,keyword,location,text (all the given four attributes)

# 2.4 Removing hyperlinks from Test data


In [50]:
# Remove hyperlinks from strings

for idx, tweet in enumerate(test_text):
    test_text[idx] = re.sub(r"http\S+", "", tweet)

In [51]:
np.info(test_corpus)

class:  ndarray
shape:  (3263, 4)
strides:  (8, 26104)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x13f311000
byteorder:  little
byteswap:  False
type: object


# 2.5. One-Hot encoding of label

In [52]:
from keras.utils.np_utils import to_categorical

# One-hot encode
train_targets = train_label
encoded_train_targets = to_categorical(train_targets)
print(encoded_train_targets)

[[0. 1.]
 [0. 1.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]


In [53]:
labels = encoded_train_targets

In [54]:
labels

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [None]:
#Defining inputs

training_docs = train_text #training 
test_docs = test_text #test

# 2.6 Building a reverse encoder 

In [55]:
def inverseEncoding(encoded):
    ans = np.zeros(encoded.shape[0])
    for idx, vector in enumerate(encoded):
        ans[idx] = np.argmax(vector)
    return ans

print(inverseEncoding(encoded_train_targets))

[1. 1. 1. ... 1. 1. 1.]


# 2.7: Tokenizer - Text pre-processing with Keras

In [58]:
#Document Tokenization
#Use Keras library to create a vector of words for every tweet.
#These vectors are padded up to 50, which can be the limit of number of words possible in a 140 character tweet.

# Prepare tokenizer (t for training set, tt for test set)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


t = Tokenizer() #tokenizer
tt = Tokenizer() #test tokenizer
t.fit_on_texts(training_docs) 
tt.fit_on_texts(test_docs)


# Integer encode the documents
encoded_docs = t.texts_to_sequences(training_docs)
encoded_test_docs = tt.texts_to_sequences(test_docs)


# Pad documents to a max length of 50 words (140 characters, 3 characters a word (including space))
max_length = 50
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

# 3. Develop a model for the data

# 3.1 Word Embeddings

In [13]:
# Word Embedding
#For the embedding matrix we use GloVe’s 50d word vector pre-trained on 2 billion tweets. The
#embedding matrix tabulates how frequently word’s co-occur with one another in a given corpus

# load the whole embedding into memory

#The GLoVe link: https://nlp.stanford.edu/projects/glove/
embeddings_index = dict()
f = open('../glove/glove.twitter.27B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [14]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# create a weight matrix for words in test docs
test_embedding_matrix = zeros((vocab_size, 50))
for word, j in tt.word_index.items():
    test_embedding_vector = embeddings_index.get(word)
    if test_embedding_vector is not None:
        test_embedding_matrix[j] = test_embedding_vector
            
embedding_matrix_transpose = embedding_matrix.transpose()
test_embedding_matrix_transpose = test_embedding_matrix.transpose()

In [15]:
# print(embedding_matrix)
embedding_matrix_transpose = embedding_matrix.transpose()
test_embedding_matrix_transpose = test_embedding_matrix.transpose()

In [16]:
#define constants
#unrolled through 28 time steps in mnist , 50(=max_length) words per tweet ?
time_steps = max_length
#hidden LSTM units = batch size , we can also take 128 for tweets and mnist?
# Total of 4743 word vectors in training set. To ensure constant batch size: 4743 = 3*3*17*31 = 153
num_units= 128
#rows of 28 pixels for mnist , 50 dimension for glove words.
n_input= 50
#learning rate for adam
learning_rate=0.001
#mnist is meant to be classified in 10 classes(0-9).
n_classes= 2
#size of batch = we can take 128 same as number of units.
batch_size= num_units

In [17]:
#weights and biases of appropriate shape to accomplish above task
out_weights=tf.Variable(tf.random_normal([num_units,n_classes]))
out_bias=tf.Variable(tf.random_normal([n_classes]))

#defining placeholders
#input image placeholder
x=tf.placeholder("float",[None,time_steps,n_input])
#input label placeholder
y=tf.placeholder("float",[None,n_classes])

In [18]:
#processing the input tensor from [batch_size,n_steps,n_input] to "time_steps" number of [batch_size,n_input] tensors
input=tf.unstack(x ,time_steps,1)

In [19]:
#defining the network
lstm_layer=rnn.BasicLSTMCell(num_units,forget_bias=1)
outputs,_=rnn.static_rnn(lstm_layer,input,dtype="float32")

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor














In [20]:
#converting last output of dimension [batch_size,num_units] to [batch_size,n_classes] by out_weight multiplication
prediction=tf.matmul(outputs[-1],out_weights)+out_bias

## softmax for probability
prob = tf.nn.softmax(prediction)

In [21]:
#Defining learning_rate for the Adam optimizer
learning_rate=0.001
#loss_function
loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
#optimization
opt=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

#model evaluation
#prediction = output of last LSTM time step x weights + bias
#probability = softmax(prediction)
correct_prediction=tf.equal(tf.argmax(prediction,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [22]:
def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[ i] for i in idx]
    labels_shuffle = [labels[ i] for i in idx]

    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

def change_shape(data,embedding_matrix_transpose):
    '''
    Change shape to batch_x=batch_x.reshape((batch_size,time_steps,n_input))
    '''
    data1 = zeros((batch_size, time_steps, n_input))
    for x in range(0, num_units):
        for y in range(0, time_steps): 
            #print (data[x,y], embedding_matrix(data[x,y]))
            #print (embedding_matrix[data[x,y]])
            for z in range(0,n_input):
                data1[x][y][z]= embedding_matrix_transpose[z,data[x,y]]
    #print(data1)
    return(data1)
                

In [23]:
probabilities = np.zeros((test_label.size, 2))
losses = np.zeros(199)
accuracies = np.zeros(199)
#initialize variables
init=tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    iter=1
    while iter<2000:
        #batch_x,batch_y=mnist.train.next_batch(batch_size=batch_size)
        # use embedding matrix and one hot for batch_x
        batch_x, batch_y = next_batch(batch_size, padded_docs, labels)
        batch_x= change_shape(batch_x, embedding_matrix_transpose)
        sess.run(opt, feed_dict={x: batch_x, y: batch_y})

        if iter %10==0:
            acc=sess.run(accuracy,feed_dict={x:batch_x,y:batch_y})
            los=sess.run(loss,feed_dict={x:batch_x,y:batch_y})
            print("For iter ",iter)
            print("Accuracy ",acc)
            print("Loss ",los)
            print("__________________")

        iter=iter+1
    print("Optimization finished!")
    
    #calculating test accuracy
    # all test labels are 0
    test_data = padded_test_docs
    
    '''
    Change shape to test_data=test_data.reshape((3263,time_steps,n_input))
    '''
    test_data1 = zeros((3263, time_steps, n_input))
    for a in range(0, 1701):
        for b in range(0, time_steps): 
            for c in range(0,n_input):
                test_data1[a][b][c]= test_embedding_matrix_transpose[c,test_data[a,b]]
    
    probabilities = sess.run(prob, feed_dict={x: test_data1})

For iter  10
Accuracy  0.8046875
Loss  0.50922424
__________________
For iter  20
Accuracy  0.796875
Loss  0.4614792
__________________
For iter  30
Accuracy  0.8203125
Loss  0.40784174
__________________
For iter  40
Accuracy  0.7734375
Loss  0.4762222
__________________
For iter  50
Accuracy  0.8046875
Loss  0.42260525
__________________
For iter  60
Accuracy  0.7890625
Loss  0.44487348
__________________
For iter  70
Accuracy  0.8125
Loss  0.45680463
__________________
For iter  80
Accuracy  0.8203125
Loss  0.4178124
__________________
For iter  90
Accuracy  0.765625
Loss  0.47575513
__________________
For iter  100
Accuracy  0.8515625
Loss  0.33627287
__________________
For iter  110
Accuracy  0.78125
Loss  0.5046478
__________________
For iter  120
Accuracy  0.796875
Loss  0.49587548
__________________
For iter  130
Accuracy  0.84375
Loss  0.35237136
__________________
For iter  140
Accuracy  0.875
Loss  0.37459505
__________________
For iter  150
Accuracy  0.859375
Loss  0.347976

In [24]:
    print("Testing Probabilities")
    print("0\t 1")
    print(probabilities)
    probabilities.shape

Testing Probabilities
0	 1
[[0.1505156  0.8494844 ]
 [0.09050537 0.9094946 ]
 [0.01507206 0.984928  ]
 ...
 [0.82812697 0.1718731 ]
 [0.82812697 0.1718731 ]
 [0.82812685 0.1718731 ]]


(3263, 2)

In [26]:
idx = np.arange(test_label.size, dtype=np.int16)
idx.shape
out = np.rec.fromarrays((idx, probabilities[:,0], probabilities[:,1]),  names = ('i','D','ND'))
print(out)

[(   0, 0.1505156 , 0.8494844) (   1, 0.09050537, 0.9094946)
 (   2, 0.01507206, 0.984928 ) ... (3260, 0.82812697, 0.1718731)
 (3261, 0.82812697, 0.1718731) (3262, 0.82812685, 0.1718731)]


In [38]:
idx = np.arange(test_label.size, dtype=np.int16)
idx.shape
results = np.rec.fromarrays((test_corpus[:,0], np.where(probabilities[:,1]>0.5, 1, 0)),  names = ('id','target'))
print(results)

[(0, 1) (2, 1) (3, 1) ... (10868, 0) (10874, 0) (10875, 0)]


In [39]:
with open('results.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)
    wr.writerow(('id','target'))
    wr.writerows(results)

array([0, 2, 3, ..., 10868, 10874, 10875], dtype=object)

# Results

In [59]:
iterations = np.arange(start=10, stop=2000, step=10, dtype=np.int16)
# Plot Loss and Accuracy vs iterations
plt.plot( iterations, losses )
plt.plot( iterations, accuracies )
plt.legend(['Train Log Loss', 'Train Accuracies'])
plt.ylabel('Log Loss, Accuracy')
plt.xlabel('Iteration')
plt.title('Training Loss, Accuracy vs Iterations at 128 Batch')
plt.show()

NameError: name 'losses' is not defined