In [12]:
import keras
from numpy import asarray
from numpy import zeros
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
import tensorflow as tf
import numpy as np
import pandas
from tensorflow.contrib import rnn
import re
import csv
import matplotlib.pyplot as plt


In [13]:
np.set_printoptions(8, suppress=True)
%matplotlib inline

In [14]:
#Data Import and Preprocessing
train_file = 'train.csv'
test_file = 'test.csv'
train_corpus = pandas.read_csv(train_file, encoding='latin1').to_numpy()
test_corpus = pandas.read_csv(test_file, encoding='latin1').to_numpy()
print(test_corpus.shape)
print(train_corpus.shape)

(3263, 4)
(7613, 5)


In [15]:
train_label = train_corpus[:,4]
train_text = train_corpus[:,3]
test_label = np.zeros((3263, 1), dtype=int)
test_text = test_corpus[:,3]

In [16]:
# Remove hyperlinks from strings
for idx, tweet in enumerate(train_text):
    train_text[idx] = re.sub(r"http\S+", "", tweet)
for idx, tweet in enumerate(test_text):
    test_text[idx] = re.sub(r"http\S+", "", tweet)

# One-Hot encoding of label

In [17]:
train_targets = train_label
# One-hot encode
encoded_train_targets = to_categorical(train_targets)
#print(encoded_train_targets)

def inverseEncoding(encoded):
    ans = np.zeros(encoded.shape[0])
    for idx, vector in enumerate(encoded):
        ans[idx] = np.argmax(vector)
    return ans

#print(inverseEncoding(encoded_train_targets))

In [18]:
labels = encoded_train_targets

In [19]:
#Document Tokenization
#Use Keras library to create a vector of words for every tweet.
#These vectors are padded up to 50, which can be the limit of number of words possible in a 140 character tweet.

# Prepare tokenizer (t for training set, tt for test set)
docs = train_text
test_docs = test_text
t = Tokenizer()
tt = Tokenizer()
t.fit_on_texts(docs)
tt.fit_on_texts(test_docs)
vocab_size = len(t.word_index) + 1
test_vocab_size = len(tt.word_index) + 1
# Integer encode the documents
encoded_docs = t.texts_to_sequences(docs)
encoded_test_docs = tt.texts_to_sequences(test_docs)
# Pad documents to a max length of 50 words (140 characters, 3 characters a word (including space))
max_length = 50
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
padded_test_docs = pad_sequences(encoded_test_docs, maxlen=max_length, padding='post')

In [20]:
# Word Embedding
#For the embedding matrix we use GloVe’s 50d word vector pre-trained on 2 billion tweets. The
#embedding matrix tabulates how frequently word’s co-occur with one another in a given corpus

# load the whole embedding into memory

#The GLoVe link: https://nlp.stanford.edu/projects/glove/
embeddings_index = dict()
f = open('glove.twitter.27B.50d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1193514 word vectors.


In [21]:
# create a weight matrix for words in training docs
embedding_matrix = zeros((vocab_size, 50))
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
# create a weight matrix for words in test docs
test_embedding_matrix = zeros((vocab_size, 50))
for word, j in tt.word_index.items():
    test_embedding_vector = embeddings_index.get(word)
    if test_embedding_vector is not None:
        test_embedding_matrix[j] = test_embedding_vector
            
embedding_matrix_transpose = embedding_matrix.transpose()
test_embedding_matrix_transpose = test_embedding_matrix.transpose()

In [22]:
# print(embedding_matrix)
embedding_matrix_transpose = embedding_matrix.transpose()
test_embedding_matrix_transpose = test_embedding_matrix.transpose()

In [23]:
#define constants
#unrolled through 28 time steps in mnist , 50(=max_length) words per tweet ?
time_steps = max_length
#hidden LSTM units = batch size , we can also take 128 for tweets and mnist?
# Total of 4743 word vectors in training set. To ensure constant batch size: 4743 = 3*3*17*31 = 153
num_units= 128
#rows of 28 pixels for mnist , 50 dimension for glove words.
n_input= 50
#learning rate for adam
learning_rate=0.001
#mnist is meant to be classified in 10 classes(0-9).
n_classes= 2
#size of batch = we can take 128 same as number of units.
batch_size= num_units

In [24]:
#weights and biases of appropriate shape to accomplish above task
out_weights=tf.Variable(tf.random_normal([num_units,n_classes]))
out_bias=tf.Variable(tf.random_normal([n_classes]))

#defining placeholders
#input image placeholder
x=tf.placeholder("float",[None,time_steps,n_input])
#input label placeholder
y=tf.placeholder("float",[None,n_classes])

In [25]:
#processing the input tensor from [batch_size,n_steps,n_input] to "time_steps" number of [batch_size,n_input] tensors
input=tf.unstack(x ,time_steps,1)

In [26]:
#defining the network
lstm_layer=rnn.BasicLSTMCell(num_units,forget_bias=1)
outputs,_=rnn.static_rnn(lstm_layer,input,dtype="float32")

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor














In [27]:
#converting last output of dimension [batch_size,num_units] to [batch_size,n_classes] by out_weight multiplication
prediction=tf.matmul(outputs[-1],out_weights)+out_bias

## softmax for probability
prob = tf.nn.softmax(prediction)

In [28]:
#loss_function
loss=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
#optimization
opt=tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

#model evaluation
#prediction = output of last LSTM time step x weights + bias
#probability = softmax(prediction)
correct_prediction=tf.equal(tf.argmax(prediction,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_prediction,tf.float32))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [29]:
def next_batch(num, data, labels):
    '''
    Return a total of `num` random samples and labels. 
    '''
    idx = np.arange(0 , len(data))
    np.random.shuffle(idx)
    idx = idx[:num]
    data_shuffle = [data[ i] for i in idx]
    labels_shuffle = [labels[ i] for i in idx]

    return np.asarray(data_shuffle), np.asarray(labels_shuffle)

def change_shape(data,embedding_matrix_transpose):
    '''
    Change shape to batch_x=batch_x.reshape((batch_size,time_steps,n_input))
    '''
    data1 = zeros((batch_size, time_steps, n_input))
    for x in range(0, num_units):
        for y in range(0, time_steps): 
            #print (data[x,y], embedding_matrix(data[x,y]))
            #print (embedding_matrix[data[x,y]])
            for z in range(0,n_input):
                data1[x][y][z]= embedding_matrix_transpose[z,data[x,y]]
    #print(data1)
    return(data1)
                

In [30]:
probabilities = np.zeros((test_label.size, 2))
#initialize variables
init=tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    iter=1
    while iter<800:
        #batch_x,batch_y=mnist.train.next_batch(batch_size=batch_size)
        # use embedding matrix and one hot for batch_x
        batch_x, batch_y = next_batch(batch_size, padded_docs, labels)
        batch_x= change_shape(batch_x, embedding_matrix_transpose)
        sess.run(opt, feed_dict={x: batch_x, y: batch_y})

        if iter %10==0:
            acc=sess.run(accuracy,feed_dict={x:batch_x,y:batch_y})
            los=sess.run(loss,feed_dict={x:batch_x,y:batch_y})
            print("For iter ",iter)
            print("Accuracy ",acc)
            print("Loss ",los)
            print("__________________")

        iter=iter+1
    print("Optimization finished!")
    
    #calculating test accuracy
    # all test labels are 0
    test_data = padded_test_docs
    
    '''
    Change shape to test_data=test_data.reshape((3263,time_steps,n_input))
    '''
    test_data1 = zeros((3263, time_steps, n_input))
    for a in range(0, 1701):
        for b in range(0, time_steps): 
            for c in range(0,n_input):
                test_data1[a][b][c]= test_embedding_matrix_transpose[c,test_data[a,b]]
    
    probabilities = sess.run(prob, feed_dict={x: test_data1})

For iter  10
Accuracy  0.625
Loss  0.66835
__________________
For iter  20
Accuracy  0.6796875
Loss  0.6086925
__________________
For iter  30
Accuracy  0.8125
Loss  0.45201832
__________________
For iter  40
Accuracy  0.7109375
Loss  0.6025829
__________________
For iter  50
Accuracy  0.8125
Loss  0.42705977
__________________
For iter  60
Accuracy  0.734375
Loss  0.49522856
__________________
For iter  70
Accuracy  0.8203125
Loss  0.4370807
__________________
For iter  80
Accuracy  0.7734375
Loss  0.5165398
__________________
For iter  90
Accuracy  0.8359375
Loss  0.3547323
__________________
For iter  100
Accuracy  0.75
Loss  0.47695732
__________________
For iter  110
Accuracy  0.75
Loss  0.50231296
__________________
For iter  120
Accuracy  0.7890625
Loss  0.4764795
__________________
For iter  130
Accuracy  0.8125
Loss  0.41109008
__________________
For iter  140
Accuracy  0.8046875
Loss  0.4756549
__________________
For iter  150
Accuracy  0.796875
Loss  0.45462367
_____________

In [31]:
    print("Testing Probabilities")
    print("0\t 1")
    print(probabilities)
    probabilities.shape

Testing Probabilities
0	 1
[[0.28840914 0.71159077]
 [0.02939575 0.9706042 ]
 [0.02672152 0.97327846]
 ...
 [0.8687839  0.13121611]
 [0.8687839  0.13121611]
 [0.8687839  0.13121608]]


(3263, 2)

In [45]:
idx = np.arange(test_label.size, dtype=np.int16)
idx.shape
out = np.rec.fromarrays((idx, probabilities[:,0], probabilities[:,1]),  names = ('i','D','ND'))
print(out)

[(   0, 0.48068073, 0.5193193 ) (   1, 0.0104261 , 0.9895739 )
 (   2, 0.00804476, 0.9919552 ) ... (3260, 0.8786945 , 0.12130552)
 (3261, 0.8786945 , 0.12130552) (3262, 0.8786945 , 0.1213055 )]


In [None]:
with open('pred.csv', 'w') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_NONE)
    wr.writerow(('id','Not a Disaster','Disaster'))
    wr.writerows(out)