### Loading data and libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model, preprocessing, model_selection,feature_extraction

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [3]:
import matplotlib.pyplot as plt

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from nlp_utils.model import train_model

In [6]:
data_folder = 'data/'

In [7]:
## using the cleaned files
train_data = pd.read_csv(data_folder+'train_clean.csv'); print(train_data.shape)
test_data = pd.read_csv(data_folder+'test_clean.csv'); print(test_data.shape)

(7613, 6)
(3263, 5)


In [8]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target,text_clean
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order cali...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


In [9]:
train_data[train_data['target'] == 1].sample()['text'].values[0]

'Haha South Tampa is getting flooded hah- WAIT A SECOND I LIVE IN SOUTH TAMPA WHAT AM I GONNA DO WHAT AM I GONNA DO FVCK #flooding'

In [10]:
# train_data.location.value_counts()

In [11]:
sample_submission = pd.read_csv(data_folder+'sample_submission.csv')
# sample_submission.head()

### Word Embeddings

#### Pre-processing

In [12]:
from tensorflow.keras.preprocessing import text, sequence

In [13]:
# text.Tokenizer?

#### Trained with both `text` and `text_clean` columns
`text` performed slightly better than `text_clean`

In [14]:
## creating a tokenizer
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

In [15]:
word_index = tokenizer.word_index

In [16]:
len(word_index)

22700

In [17]:
# convering tweets to sequences and padding them
train_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(train_data['text_clean']), maxlen=70,padding='post')
test_seq_x = sequence.pad_sequences(tokenizer.texts_to_sequences(test_data['text_clean']), maxlen=70,padding='post')

In [18]:
train_seq_x

array([[ 868,  263,  138, ...,    0,    0,    0],
       [ 189,   45,  229, ...,    0,    0,    0],
       [1620, 2158,  714, ...,    0,    0,    0],
       ...,
       [ 283,  590, 1609, ...,    0,    0,    0],
       [  78, 1145,  342, ...,    0,    0,    0],
       [ 209,  153,  546, ...,    0,    0,    0]], dtype=int32)

In [19]:
train_x, valid_x, train_y,valid_y = train_test_split(train_seq_x, train_data['target'], test_size = 0.15, random_state = 42 )

In [20]:
### loading pretrained word-embeddings
embeddings_index = {}

for i, line in enumerate(open('data/wiki-news-300d-1M.vec', encoding='utf-8')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [21]:
len(embeddings_index.keys())

999995

In [22]:
### token embedding mapping
embedding_matrix = np.zeros((len(word_index)+1, 300))

x = 0
unfound_words = []
for word, i in word_index.items():
    vec = embeddings_index.get(word)
    if vec is not None:
        x+=1
        embedding_matrix[i] = vec
    else:
        unfound_words.append(word)

print("Embeddings found for {} words out of {}".format(x, len(word_index)))

Embeddings found for 12084 words out of 22700


In [23]:
# unfound_words[400:450]

In [24]:
# embedding_matrix[-1]

In [25]:
# embedding_matrix.sum(axis=1)

### CNN

In [26]:
from tensorflow.keras import layers

In [27]:
from tensorflow.keras import optimizers

In [28]:
from tensorflow.keras import models

In [29]:
# layers.Conv1D?

In [30]:
# layers.GlobalMaxPool1D?

In [31]:
def create_cnn_model():
    
    input_layer = layers.Input(shape=(70,)) # input layer
    embedding_layer = layers.Embedding(len(word_index)+1, 300, trainable = False, weights = [embedding_matrix])(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)
    
    conv_layer = layers.Conv1D(100,3,activation = 'relu')(embedding_layer)
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

#     output_layer0 = layers.Dense(128, activation = 'relu')(pooling_layer)
# #     output_layer0 = layers.BatchNormalization()(output_layer0)
#     output_layer0 = layers.Dropout(0.25)(output_layer0)

#     output_layer1 = layers.Dense(64, activation = 'relu')(output_layer0)
# #     output_layer1 = layers.BatchNormalization()(output_layer1)
#     output_layer1 = layers.Dropout(0.25)(output_layer1)
    
#     output_layer2 = layers.Dense(32, activation = 'relu')(output_layer1)
# #     output_layer2 = layers.BatchNormalization()(output_layer2)
#     output_layer2 = layers.Dropout(0.25)(output_layer2)
    
    output_layer3 = layers.Dense(32, activation = 'relu')(pooling_layer)
#     output_layer3 = layers.BatchNormalization()(output_layer3)
#     output_layer3 = layers.Dropout(0.25)(output_layer3)
    
    output_layer4 = layers.Dense(1, activation = 'sigmoid')(output_layer3)
    
    model= models.Model(inputs = input_layer, outputs = output_layer4)
    model.compile(optimizer = optimizers.Adam(learning_rate=0.00001) ,loss = "binary_crossentropy")
    
    return model

In [32]:
### training the model

model = create_cnn_model()


In [33]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 70)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 70, 300)           6810300   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 70, 300)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 68, 100)           90100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                3232      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33    

In [34]:
train_model(model, train_x, train_y, valid_x, valid_y, 
            neural_network = True, epochs = 200,
            test_vectors = test_seq_x, submissions_data = sample_submission, submissions_file_prefix="cnn_submission" 
           )

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78