### Loading data and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nlp_utils.model import train_model

In [6]:
data_folder = 'data/'

In [7]:
## using the cleaned files
train_data = pd.read_csv(data_folder+'train_clean.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test_clean.csv'); print(test_data.shape)

(7613, 6)
(3263, 5)


In [8]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,text_clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order cali...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [9]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

'Arnhem Weather - &lt;p&gt;An unrelenting and dangerous heat wave will expand across the South Central United States\x89Û_ http://t.co/yhAqa5WXoK'

In [10]:
# train_data.location.value_counts()

In [11]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

### Word Embeddings

#### Pre-processing

In [12]:
from tensorflow.keras.preprocessing import text, sequence

In [13]:
## creating a tokenizer
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

In [14]:
word_index = tokenizer.word_index

In [15]:
len(word_index)

22700

In [16]:
# convering tweets to sequences and padding them
train_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=70,padding='post')
test_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=70,padding='post')

In [17]:
train_seq_x

array([[ 119, 4633,   24, ...,    0,    0,    0],
       [ 189,   45,  229, ...,    0,    0,    0],
       [  40, 1751, 1620, ...,    0,    0,    0],
       ...,
       [2824, 2401,  709, ...,    0,    0,    0],
       [  78, 1145,   41, ...,    0,    0,    0],
       [   4,  209,   54, ...,    0,    0,    0]], dtype=int32)

In [18]:
train_x, valid_x, train_y,valid_y = train_test_split(train_seq_x, train_data['target'], test_size = 0.15, random_state = 42 )

In [19]:
### loading pretrained word-embeddings
embeddings_index = {}

for i, line in enumerate(open('data/wiki-news-300d-1M.vec', encoding='utf-8')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [20]:
len(embeddings_index.keys())

999995

In [21]:
### token embedding mapping
embedding_matrix = np.zeros((len(word_index)+1, 300))

x = 0
unfound_words = []
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        x+=1
        embedding_matrix[i] = vec
    else:
        unfound_words.append(word)

print("Embeddings found for {} words out of {}".format(x, len(word_index)))

Embeddings found for 12084 words out of 22700


In [22]:
# unfound_words[400:450]

In [23]:
# embedding_matrix[-1]

In [24]:
# embedding_matrix.sum(axis=1)

### LSTM

In [25]:
from tensorflow.keras import layers

In [26]:
from tensorflow.keras import optimizers

In [27]:
from tensorflow.keras import models

In [28]:
def create_lstm_model():
    
    input_layer = layers.Input(shape=(70,)) # input layer
    embedding_layer = layers.Embedding(len(word_index)+1, 300, trainable = False, weights = [embedding_matrix])(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    lstm_layer = layers.LSTM(100 )(embedding_layer)
#     pooling_layer = layers.GlobalMaxPool1D()(lstm_layer)

    output_layer0 = layers.Dense(128, activation = 'relu')(lstm_layer)
#     output_layer0 = layers.BatchNormalization()(output_layer0)
    output_layer0 = layers.Dropout(0.25)(output_layer0)

#     output_layer1 = layers.Dense(64, activation = 'relu')(output_layer0)
# #     output_layer1 = layers.BatchNormalization()(output_layer1)
#     output_layer1 = layers.Dropout(0.25)(output_layer1)
    
#     output_layer2 = layers.Dense(32, activation = 'relu')(output_layer1)
# #     output_layer2 = layers.BatchNormalization()(output_layer2)
#     output_layer2 = layers.Dropout(0.25)(output_layer2)
    
    output_layer3 = layers.Dense(16, activation = 'relu')(output_layer0)
#     output_layer3 = layers.BatchNormalization()(output_layer3)
#     output_layer3 = layers.Dropout(0.25)(output_layer3)
    
    output_layer4 = layers.Dense(1, activation = 'sigmoid')(output_layer3)
    
    model= models.Model(inputs = input_layer, outputs = output_layer4)
    model.compile(optimizer = optimizers.Adam(learning_rate=0.0001) ,loss = "binary_crossentropy")
    
    return model

In [29]:
### training the model

model = create_lstm_model()


In [30]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 70)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 70, 300)           6810300   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 70, 300)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 128)               12928     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064  

In [31]:
train_model(model, train_x, train_y, valid_x, valid_y, 
            neural_network = True, epochs = 200,
            test_vectors = test_seq_x, submissions_data = sample_submission, submissions_file_prefix="lstm_submission" 
           )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Classification report : 

              precision    recall  f1-score   support

           1       0.85      0.76      0.80       491
           0       0.83      0.90      0.86       651

   micro avg       0.84      0.84      0.84      1142
   macro avg       0.84      0.83      0.83      1142
weighted avg       0.84      0.84      0.84      1142

Exporting data to: 

	 data/lstm_submission_20210226162831.csv


In [32]:
#0.76, 0.85

### BiLSTM

In [31]:
def create_bilstm_model():
    
    input_layer = layers.Input(shape=(70,)) # input layer
    embedding_layer = layers.Embedding(len(word_index)+1, 300, trainable = False, weights = [embedding_matrix])(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    bilstm_layer = layers.Bidirectional(layers.LSTM(100 ))(embedding_layer)
#     pooling_layer = layers.GlobalMaxPool1D()(lstm_layer)

    output_layer0 = layers.Dense(128, activation = 'relu')(bilstm_layer)
#     output_layer0 = layers.BatchNormalization()(output_layer0)
    output_layer0 = layers.Dropout(0.25)(output_layer0)

    output_layer1 = layers.Dense(64, activation = 'relu')(output_layer0)
#     output_layer1 = layers.BatchNormalization()(output_layer1)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    
#     output_layer2 = layers.Dense(32, activation = 'relu')(output_layer1)
# #     output_layer2 = layers.BatchNormalization()(output_layer2)
#     output_layer2 = layers.Dropout(0.25)(output_layer2)
    
    output_layer3 = layers.Dense(16, activation = 'relu')(output_layer1)
#     output_layer3 = layers.BatchNormalization()(output_layer3)
#     output_layer3 = layers.Dropout(0.25)(output_layer3)
    
    output_layer4 = layers.Dense(1, activation = 'sigmoid')(output_layer3)
    
    model= models.Model(inputs = input_layer, outputs = output_layer4)
    model.compile(optimizer = optimizers.Adam(learning_rate=0.00001) ,loss = "binary_crossentropy")
    
    return model

In [32]:
### training the model

model = create_bilstm_model()


In [33]:
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 70)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 70, 300)           6810300   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 70, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               320800    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               25728     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256

In [34]:
train_model(model, train_x, train_y, valid_x, valid_y, 
            neural_network = True, epochs = 200,
            test_vectors = test_seq_x, submissions_data = sample_submission, submissions_file_prefix="bilstm_submission" 
           )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Classification report : 

              precision    recall  f1-score   support

           1       0.88      0.68      0.77       491
           0       0.80      0.93      0.86       651

   micro avg       0.82      0.82      0.82      1142
   macro avg       0.84      0.81      0.81      1142
weighted avg       0.83      0.82      0.82      1142

Exporting data to: 

	 data/bilstm_submission

In [41]:
# 0.8, 0.86

### RCNN

In [61]:
def create_rcnn():
    input_layer = layers.Input(shape = (70,))
    
    embedding_layer = layers.Embedding(len(word_index)+1,300, trainable = False, weights = [embedding_matrix])(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.25)(embedding_layer)
    
    bilstm_layer = layers.Bidirectional(layers.LSTM(100, return_sequences = True))(embedding_layer)
    
    cnn_layer = layers.Convolution1D(100,3, activation = 'relu')(bilstm_layer)
    
    pooling_layer = layers.GlobalMaxPool1D()(cnn_layer)
    
    output_layer0 = layers.Dense(32, activation = 'relu')(pooling_layer)
    output_layer0 = layers.Dropout(0.25)(output_layer0)
    
    output_layer1 = layers.Dense(16, activation = 'relu')(output_layer0)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    
    output_layer = layers.Dense(1, activation = 'sigmoid')(output_layer1)
    
    model = models.Model(inputs = input_layer, outputs = output_layer)
    model.compile(optimizer = optimizers.Adam(lr = 0.0001), loss = 'binary_crossentropy')
    
    return model

In [62]:
model = create_rcnn()

In [63]:
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 70)]              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 70, 300)           6810300   
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 70, 300)           0         
_________________________________________________________________
bidirectional_8 (Bidirection (None, 70, 200)           320800    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 68, 100)           60100     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 100)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 32)                3232

In [64]:
train_model(model, train_x, train_y, valid_x, valid_y, 
            neural_network = True, epochs = 200,
            test_vectors = test_seq_x, submissions_data = sample_submission, submissions_file_prefix="rcnn_submission" 
           )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Classification report : 

              precision    recall  f1-score   support

           1       0.90      0.68      0.77       491
           0       0.80      0.94      0.86       651

   micro avg       0.83      0.83      0.83      1142
   macro avg       0.85      0.81      0.82      1142
weighted avg       0.84      0.83      0.83      1142

Exporting data to: 

	 data/rcnn_submission_20210308131237.csv
