In [1]:
from keras import Model
from keras import Input
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense
from keras.layers import Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import numpy as np
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
sample = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

In [3]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# MODEL
BATCH_SIZE  = 128
EPOCH       = 3
VAL_SPLIT   = 0.15  #15%

# TOKENIZER
# The maximum number of words to be used. (most frequent)
MAX_WORDS = 20000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 150

## Preprocessing

In [5]:
Y = pd.get_dummies(train_df.target)
CLASS_NUM = Y.shape[1]

### Tokenize

In [6]:
all_text = train_df.text.append(test_df.text, ignore_index=True)

In [7]:
%%time
tokenize = Tokenizer(num_words=MAX_WORDS)
tokenize.fit_on_texts(all_text)

CPU times: user 377 ms, sys: 8.5 ms, total: 385 ms
Wall time: 413 ms


In [8]:
%%time
sequences = tokenize.texts_to_sequences(train_df.text)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=MAX_SEQUENCE_LENGTH)
print(sequences_matrix.shape)

(7613, 150)
CPU times: user 203 ms, sys: 1.73 ms, total: 205 ms
Wall time: 208 ms


In [9]:
print(train_df.text[1])
print(sequences_matrix[1])

Forest fire near La Ronge Sask. Canada
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  168   43  212  927 8813 8814 1442]


In [10]:
def RNN():
    inputs = Input(name='inputs',shape=[MAX_SEQUENCE_LENGTH])
    layer = Embedding(MAX_WORDS,50,input_length=MAX_SEQUENCE_LENGTH)(inputs)
    layer = LSTM(100)(layer)
    layer = Dense(256, activation='relu', name='FC1')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(CLASS_NUM, activation='sigmoid', name='out_layer')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [11]:
model = RNN()
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          [(None, 150)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 150, 50)           1000000   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               60400     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               25856     
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 2)                 514       
Total params: 1,086,770
Trainable params: 1,086,770
Non-trainable params: 0
____________________________________________

In [12]:
history = model.fit(sequences_matrix,Y,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCH,
                    validation_split=VAL_SPLIT)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Submission

In [13]:
test_sequences = tokenize.texts_to_sequences(test_df.text)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
%%time
predict_proba = model.predict(test_sequences_matrix)

CPU times: user 657 ms, sys: 34.2 ms, total: 691 ms
Wall time: 647 ms


In [15]:
predict_proba = pd.DataFrame(predict_proba, columns=Y.columns)
predict_proba.to_csv('predict_proba.csv', index=False)
predict_proba.head()

Unnamed: 0,0,1
0,0.345722,0.576475
1,0.096031,0.816121
2,0.018097,0.936158
3,0.057417,0.871084
4,0.003003,0.981461


## Output

In [16]:
predict_target = Y.columns[np.argmax(predict_proba.values, axis=1)]
submission = pd.DataFrame({'id':sample['id'], 
                           'target':predict_target})

submission.to_csv('submission.csv', index=False)