In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [2]:
# Load only the three columns used by the rest of the notebook.
input_df = pd.read_csv('../dataset/train.csv', usecols=['id', 'text', 'target'])

# Rows whose `id` is listed here get their target overridden to 0
# (the notebook's original note for this step: "fixing conflicts").
target_error = [328, 443, 513, 2619, 3640, 3900, 4342, 5781, 6552, 6554, 6570, 6701, 6702, 6729, 6861, 7226]

# .loc is the label/boolean-mask indexer; .at is a scalar accessor and is not
# meant to receive a boolean Series as the row selector.
input_df.loc[input_df['id'].isin(target_error), 'target'] = 0
input_df.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
from sklearn import preprocessing
from keras.layers import Input,Dense,Embedding,LSTM,Dropout,Activation
from keras.layers import Bidirectional,GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
# Peek at the raw text column as an array.
input_df['text'].values

array(['Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all',
       'Forest fire near La Ronge Sask. Canada',
       "All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected",
       ...,
       'M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ',
       'Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.',
       'The Latest: More Homes Razed by Northern California Wildfire - ABC News http://t.co/YmY4rSkQ3d'],
      dtype=object)

In [5]:
# Settings shared by the tokenizer, the padding step and the model cells.
max_features = 10000  # vocabulary cap: Tokenizer(num_words=...) and Embedding input size
maxlen = 100          # sequence length passed to pad_sequences and to the model Input
embedded_size = 100   # Embedding output dimension

In [6]:
# Read the test file and keep its text column as an array for tokenization.
test = pd.read_csv('../dataset/test.csv')
test_X = test['text'].values


In [7]:
# Hold out 10% of the labelled rows for validation; the fixed random_state
# keeps the split the same from run to run.
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(input_df, test_size=0.1, random_state=43)
train_X, val_X = train_df['text'].values, val_df['text'].values

In [8]:
# Fit the vocabulary on the training texts only, then convert every split to
# sequences of integer word indices.
#
# Each split is tokenized from its source frame instead of from
# train_X / val_X / test_X. Those names are overwritten by this cell, so reading
# from them made a second run feed token-id lists back into the tokenizer.
# Reading from the frames keeps the cell safe to re-run.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_df.text.values))
train_X = tokenizer.texts_to_sequences(train_df.text.values)
val_X = tokenizer.texts_to_sequences(val_df.text.values)
test_X = tokenizer.texts_to_sequences(test.text.values)

In [9]:
# Bring every split to the same fixed length (maxlen).
train_X, val_X, test_X = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_X, val_X, test_X)
)

In [10]:
# Quick check of the container type of test_X after the padding step.
type(test_X)

numpy.ndarray

In [11]:
# Target arrays taken from the same frames that produced train_X / val_X.
train_y, val_y = train_df['target'].values, val_df['target'].values

In [None]:

# Simple baseline: embedding -> flatten -> small dense head with a sigmoid output.
#
# This cell previously referenced vocab_size / embedding_dim / max_length, which
# are not defined anywhere in the visible notebook, so running it raised NameError.
# It now uses the shared max_features / embedded_size / maxlen settings.
#
# NOTE: `model` is rebound by the BiLSTM cell that follows, so this baseline is
# not the network that gets trained unless that cell is skipped.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, embedded_size, input_length=maxlen),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [12]:
from keras.models import Model

# Bidirectional LSTM classifier over the padded token-id sequences.
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embedded_size)(inp)
# return_sequences=True keeps the per-timestep outputs so they can be max-pooled.
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation='relu')(x)
x = Dropout(0.1)(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          84480     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_1 (Dropo

In [13]:
import os

from keras.callbacks import ModelCheckpoint

# The checkpoint (and later the submission CSV) is written under ./temp_files;
# create it up front so a fresh checkout does not fail on the first save.
os.makedirs('./temp_files', exist_ok=True)

# Keep only the weights from the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(
    './temp_files/best_model2_16bz.h5',
    monitor='val_accuracy',
    save_best_only=True,
)

In [14]:
# Fit for 10 epochs in mini-batches of 16, scoring on the validation split after
# each epoch; the checkpoint callback saves the best epoch to disk.
model.fit(
    train_X,
    train_y,
    batch_size=16,
    epochs=10,
    validation_data=(val_X, val_y),
    callbacks=[checkpoint],
)


Train on 6851 samples, validate on 762 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f50d44cfc50>

In [19]:
#preds = model.predict([test_X],batch_size = 1024,verbose = 1)
#preds



array([[0.8224608 ],
       [0.9142288 ],
       [0.9885963 ],
       ...,
       [0.9785872 ],
       [0.95936173],
       [0.9243155 ]], dtype=float32)

In [15]:
# Restore the weights written by the checkpoint callback, then score the test set.
model.load_weights('./temp_files/best_model2_16bz.h5')
probabilities = model.predict(test_X)
probabilities

array([[0.4685789 ],
       [0.6791415 ],
       [0.93585503],
       ...,
       [0.9365703 ],
       [0.8783996 ],
       [0.54677826]], dtype=float32)

In [16]:
# Threshold the sigmoid outputs at 0.5 and collapse the result to a flat
# vector of 0/1 labels.
predictions = (probabilities > 0.5).astype(int).flatten()
predictions

array([0, 1, 1, ..., 1, 1, 1])

In [30]:
# Table pairing each test text with its predicted label and raw model score.
original_test_df = pd.read_csv("../dataset/test.csv")
df = pd.DataFrame(
    {
        'text': original_test_df['text'],
        'prediction': predictions,
        'probabilities': probabilities.flatten(),
    }
)
# Optional export, left disabled: df.to_csv("test_df.csv", index=False)

In [None]:
# Plain Python list of the predicted labels; it fills the submission's
# target column further down.
result = predictions.tolist()

# Show a short preview rather than echoing every prediction into the cell output.
result[:10]

In [18]:
# Load the submission template and show its first rows.
submit = pd.read_csv("../dataset/sample_submission.csv")
submit.head(n=5)

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [19]:
# Replace the template's target column with the model's predictions (by row position).
submit = submit.assign(target=result)
print(submit)

         id  target
0         0       0
1         2       1
2         3       1
3         9       0
4        11       1
...     ...     ...
3258  10861       1
3259  10865       1
3260  10868       1
3261  10874       1
3262  10875       1

[3263 rows x 2 columns]


In [20]:
# Write the finished submission without the DataFrame index column.
submit.to_csv(
    "./temp_files/lstm_submit_callbacks_corrected_16bz.csv",
    index=False,
)