In [1]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wtoken
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import Input
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import (Activation, Dropout, Dense, Embedding, Flatten, BatchNormalization, GlobalMaxPooling1D, Lambda, concatenate, LSTM, Concatenate)
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend as K

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os, string, gc, re

from tqdm import tqdm


import tldextract as tld


from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Sanity check that TensorFlow can see a GPU before any deep-learning training starts.
print(tf.__version__)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

2.0.0
Num GPUs Available:  1


In [3]:
# Directory that holds the CSV files and the GloVe vectors read by this notebook.
# Set the DISASTER_TWEETS_DATA environment variable to point somewhere else; the
# original machine-specific location is kept as the default. os.path.join(..., '')
# guarantees a trailing separator because file names are built as `path + name`.
path = os.path.join(
    os.environ.get('DISASTER_TWEETS_DATA')
    or '/media/moh/abunayla/Disaster_Tweets/Notebooks/data/',
    '',
)

In [75]:
# Load the competition files. train.csv is parsed once and then split into the
# tweet text and the target column, instead of being read from disk twice.
train_full = pd.read_csv(path + 'train.csv', usecols = ['text', 'target'])
train = train_full[['text']]
Y = train_full[['target']]

test = pd.read_csv(path + 'test.csv', usecols = ['text'])

# Indexed by tweet id so the id column is carried along when the frame is saved.
samsub = pd.read_csv(path +'sample_submission.csv', index_col = 'id')

In [5]:
# Preview the first rows of the training text.
train.head()

Unnamed: 0,text
0,Our Deeds are the Reason of this #earthquake M...
1,Forest fire near La Ronge Sask. Canada
2,All residents asked to 'shelter in place' are ...
3,"13,000 people receive #wildfires evacuation or..."
4,Just got sent this photo from Ruby #Alaska as ...


In [6]:
# (rows, columns) of the training frame; only the `text` column was loaded.
train.shape

(7613, 1)

In [7]:
# Length every token sequence is brought to by pad_sequences.
maxlen =300
# Number of shuffled rows used for training; the remaining rows form the validation set.
training_samples =6000
# Vocabulary cap given to the Tokenizer; also the number of rows in the embedding matrix.
max_words = 15000

In [8]:
# Keras tokenizer capped at `max_words`; 'OOV' is the placeholder token for out-of-vocabulary words.
tokenizer = Tokenizer(num_words = max_words, oov_token= 'OOV')

In [9]:
# Pull the raw tweet strings out of both frames as plain Python lists.
texts = train['text'].tolist()
test_text = test['text'].tolist()

In [10]:
# Build the vocabulary from the test tweets first and then the training tweets.
# NOTE(review): the test text therefore influences the word indices used for training.
for corpus in (test_text, texts):
    tokenizer.fit_on_texts(corpus)

In [11]:
# Convert every tweet into a list of integer word indices using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts)
test_sequences = tokenizer.texts_to_sequences(test_text)

In [12]:
# Token -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index

In [13]:
# Size of the full mapping; this can exceed `max_words` (see the count printed below).
print('Found %s unique tokens.' %len( word_index))

Found 29320 unique tokens.


In [14]:
# Bring every training sequence to exactly `maxlen` entries. With the Keras
# defaults, padding and truncation both happen at the start of a sequence.
data = pad_sequences(sequences, maxlen=maxlen)

In [15]:
# Same padding for the test sequences so both arrays share the model's input width.
test_data = pad_sequences(test_sequences, maxlen=maxlen)

In [16]:
# Expect (number of training tweets, maxlen).
print('Shape of data tensor:', data.shape)

Shape of data tensor: (7613, 300)


In [17]:
# Expect (number of test tweets, maxlen).
print('Shape of test data tensor:', test_data.shape)

Shape of test data tensor: (3263, 300)


In [18]:
# Target column as a NumPy array, in the same row order as `data`.
labels = np.asarray(Y.target)

In [19]:
# One label per training tweet.
print('Shape of labels tensor:', labels.shape)

Shape of labels tensor: (7613,)


In [20]:
# Row indices 0..n-1; they are shuffled next and used to reorder the data before
# it is split into training and validation sets.
indices = np.arange(data.shape[0])

In [21]:
# Shuffle in place with a fixed seed so the train/validation split is identical
# on every run. A private RandomState is used so NumPy's global random state is
# left untouched.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(indices)

In [22]:
# Reorder the padded sequences with the shuffled indices. This overwrites `data`,
# so re-running the cell permutes it again; always run it together with the
# matching `labels` cell to keep rows and labels aligned.
data = data[indices]

In [23]:
# Apply the same permutation to the labels so each label stays with its tweet.
labels = labels[indices]

In [24]:
# First `training_samples` shuffled rows form the training inputs.
x_train = data[:training_samples]

In [25]:
# Expect (training_samples, maxlen).
x_train.shape

(6000, 300)

In [26]:
# Labels for the training rows (same slice as x_train).
y_train = labels[:training_samples]

In [27]:
# Everything after the first `training_samples` rows is held out for validation.
x_val = data[training_samples:]

In [28]:
# Expect (total rows - training_samples, maxlen).
x_val.shape

(1613, 300)

In [29]:
# Labels for the validation rows (same slice as x_val).
y_val = labels[training_samples:]

In [30]:
# Guard against the validation inputs and labels getting out of step.
assert len(y_val) == len(x_val)

In [31]:
# Pre-trained GloVe word vectors. Despite the `_dir` suffix this is the path of
# the vector FILE inside the data directory, not a directory.
glove_dir = path + 'glove.6B.300d.txt'

In [32]:
# word -> GloVe vector lookup, filled in while the GloVe file is parsed.
embeddings_index = {}

In [33]:
# Parse the GloVe text file into `embeddings_index`: each line is a token
# followed by its vector components.
#
# The file is opened and read in the same cell inside a `with` block, so the
# handle is closed even if parsing fails and the cell can be re-run on its own.
# The encoding is set explicitly rather than relying on the platform default.
# The former bare `%timeit` line was a no-op (a line magic with no statement);
# the tqdm progress bar already reports the elapsed time.
with open(glove_dir, encoding='utf-8') as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs

400000it [00:21, 18286.72it/s]


In [35]:
# Number of distinct tokens read from the GloVe file.
print('Found word vectors:', len(embeddings_index))

Found word vectors: 400000


In [36]:
# Width of one embedding vector. It must equal the length of the vectors stored
# in `embeddings_index` (the GloVe file name suggests 300-d vectors -- confirm
# if the file is swapped).
embedding_dim = 300

In [37]:
# One row per tokenizer index below `max_words`. Rows start as zeros and keep
# that value for any index that is not given a GloVe vector afterwards.
embedding_matrix = np.zeros((max_words, embedding_dim))

In [38]:
# Copy the GloVe vector of every in-vocabulary token into its row of the matrix.
for token, idx in word_index.items():
    # Indices at or beyond max_words have no row in the matrix.
    if idx >= max_words:
        continue
    vector = embeddings_index.get(token)
    # Tokens unknown to GloVe keep their all-zero row.
    if vector is None:
        continue
    embedding_matrix[idx] = vector

In [39]:
# Feed-forward classifier: embedding lookup -> flatten -> three tanh Dense
# layers (128, 64, 32 units) -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length= maxlen),
    Flatten(),
    *[Dense(units, activation = 'tanh') for units in (128, 64, 32)],
    Dense(1, activation = 'sigmoid'),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 300)          4500000   
_________________________________________________________________
flatten (Flatten)            (None, 90000)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               11520128  
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 16,030,497
Trainable params: 16,030,497
Non-trainable params: 0
____________________________________________

In [40]:
# Load the GloVe matrix into the embedding layer (the first layer of the model)
# and freeze it so training does not update the pre-trained vectors.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [41]:
# Binary classification setup. The keyword was previously misspelled
# ("optimizser"), so the optimizer argument was never actually set: the model
# only used RMSprop because that is compile()'s default, and Keras versions that
# validate keyword arguments reject the misspelled name outright.
model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['acc'])

In [42]:
# Train for 10 epochs and evaluate on the held-out rows after each epoch.
# validation_data is passed as an (inputs, targets) tuple, the form documented by
# Keras; a list is not reliably unpacked into inputs and targets by newer versions.
history = model.fit(x_train, y_train, epochs =10, batch_size = 32, validation_data = (x_val, y_val))

Train on 6000 samples, validate on 1613 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [43]:
# Store the trained weights next to the data files.
model.save_weights(path+'pre_trained_glove_model.h5')

In [62]:
# Predict on the test tweets, round the sigmoid outputs to the nearest integer
# and cast to int to obtain the 0/1 class labels.
preds = np.around(model.predict(test_data)).astype(int)

In [76]:
# Write the predictions into the submission frame. Item assignment always
# targets the column (attribute assignment would silently create a plain
# attribute if the column were missing), and np.ravel turns the model's
# column-vector output into the 1-D array a single column expects.
samsub['target'] = np.ravel(preds)

In [77]:
# Spot-check the first predictions before writing the file.
samsub.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,0
9,1
11,1


In [80]:
# Write the submission to the current working directory; the `id` index is
# written out as the first column.
samsub.to_csv('submission.csv')