In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
input_df = pd.read_csv('../dataset/train.csv',usecols=['id','text','target'])
input_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
from sklearn import preprocessing
from keras.layers import Input,Dense,Embedding,LSTM,Dropout,Activation
from keras.layers import Bidirectional,GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
input_df.text.values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [5]:
embedded_size = 100
max_features = 10000
maxlen = 100

In [6]:
test = pd.read_csv('../dataset/test.csv')
x_test = test.text.values

In [7]:
# Split the data to train and validation
from sklearn.model_selection import train_test_split
train_df ,val_df = train_test_split(input_df,test_size = 0.1 , random_state = 43)
train_X = train_df.text.values
val_X = val_df.text.values

In [8]:
# tokenizing the text data using Tokenizer function
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

NameError: name 'test_X' is not defined

In [None]:
# padding the values
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

In [36]:
type(test_X)

numpy.ndarray

In [21]:
train_y = train_df.target.values
val_y = val_df.target.values

In [None]:

#Building layers of embedding
model = tf.keras.Sequential([
     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length), 
     tf.keras.layers.Flatten(),
     tf.keras.layers.Dense(6, activation='relu'),
     tf.keras.layers.Dense(1, activation='sigmoid')
])

In [22]:
from keras.models import Model

inp = Input(shape = (maxlen,))
x = Embedding(max_features,embedded_size)(inp)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16,activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1,activation = 'sigmoid')(x)
model = Model(inputs = inp,outputs = x)
model.compile(loss = 'binary_crossentropy',optimizer = 'adam',metrics = ['accuracy'])

print(model.summary())

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          84480     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropo

In [24]:
# training the model
model.fit(train_X, train_y, batch_size=512, epochs=10, validation_data=(val_X, val_y))

Train on 6851 samples, validate on 762 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f4260569a50>

In [25]:
preds = model.predict([test_X],batch_size = 1024,verbose = 1)



In [28]:
probabilities = model.predict(test_X)

In [29]:
predictions = (preds > 0.5).astype(int)
predictions = np.ndarray.flatten(predictions)

In [30]:
original_test_df = pd.read_csv("../dataset/test.csv")
df = pd.DataFrame({'text' : original_test_df['text'],'prediction' : predictions,'probabilities' : np.ndarray.flatten(probabilities)})
#df.to_csv("test_df.csv", index=False)

In [27]:
len(test_X)

3263

In [31]:
df.values[50:60]

array([["Stop saying 'I Wish' and start saying 'I Will'. \x89ÛÒ Unknown",
        0, 0.006142735481262207],
       ["I want to go to Aftershock in October because it has all the bands I listen to and #NXT! Can't afford it yet though. #gradschoolapps",
        0, 0.0007947683334350586],
       ["'We are still living in the aftershock of Hiroshima people are still the scars of history.' - Edward Bond http://t.co/engTl5wrGp",
        1, 0.976668119430542],
       ['320 [IR] ICEMOON [AFTERSHOCK] | http://t.co/THyzOMVWU0 | @djicemoon | #Dubstep #TrapMusic #DnB #EDM #Dance #Ices\x89Û_ http://t.co/83jOO0xk29',
        0, 0.00025957822799682617],
       ['Aftershock https://t.co/Ecy4U623nO', 0, 0.018789947032928467],
       ["'There is no victory at bargain basement prices.' Dwight David Eisenhower",
        0, 0.007542967796325684],
       ['Bo2 had by far the best competitive maps imo hope bo3 is the same #InVahnWeTrust',
        0, 0.008750826120376587],
       ['Brass and Copper in Catacly