In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Problem Statement

>### This project aims to identify whether a person's tweet on Twitter is announcing a real disaster. It uses an RNN with LSTM layers to train on and predict the data.
>### There are 7613 entries in the training data with three key attributes: "keyword", "location" and "text". The "keyword" and "location" could be null. The "target" denotes whether a tweet is about a real disaster (1) or not (0). 4342 are not disasters and 3271 are real. Since the test set is not labeled, 20% of the training set will be selected for validation.

In [None]:
# Directory prefixes of the competition data (not referenced by the reads below).
dir_train = '/kaggle/input/nlp-getting-started/train/'
dir_test = '/kaggle/input/nlp-getting-started/test/'

# Load the training tweets (with a 'target' column) and the test tweets.
text_train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
text_test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Non-null count per column, then the class balance of the label.
label_column = text_train['target']
print(text_train.count())
print(label_column.value_counts())
plt.hist(label_column)

# 2. Exploratory Data Analysis (EDA) 
> ### We can see that most of the frequent words in "text" are meaningless stopwords. They might affect the training and analysis, so the stopwords are going to be removed. In addition, "location" is assumed to be irrelevant, so this column is left out as well. Meanwhile, the "keyword" is added at the beginning of the "text" for convenience.

In [None]:
import seaborn as sns
def wordCourtBar(dataSet, index):
    """Print and plot the 20 most frequent words found in one text column.

    Parameters
    ----------
    dataSet : DataFrame-like object holding the column to analyse.
    index : label of the string column whose words are counted.

    Words are extracted with the regex ``(\\w+)``; the resulting counts are
    printed and drawn as a horizontal bar chart. Nothing is returned.
    """
    # One row per extracted word, across every entry of the column.
    words = dataSet[index].str.findall(r"(\w+)").explode()
    # value_counts() sorts by descending frequency, so the head is the top 20.
    top_words = words.value_counts().head(20)
    print(top_words)
    top_words.plot(kind='barh')

# Top-20 word frequencies of the raw tweet text (before any cleaning).
wordCourtBar(text_train,'text')

In [None]:
# Top-20 word frequencies of the 'keyword' column.
wordCourtBar(text_train,'keyword')

In [None]:
#cleaning data
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Extra tokens to discard on top of NLTK's English stopword list: URL fragments,
# stray symbols, and capitalised forms (the filter below compares tokens
# case-sensitively). 'nan' is what a missing keyword becomes after astype(str).
extra_stop_words = {'t', 'co', 'I', 's', '#', 'A', '2', 'The', 'Û_', 'n', 'In', 'nan', 'http', 'https'}
# Negations are kept in the text, so they are taken out of the stopword set.
kept_negations = {'no', 'not'}
stop_words = (set(stopwords.words('english')) | extra_stop_words) - kept_negations

def dataCleaning(text):
    """Tokenize ``text`` and drop every token that appears in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The tokens produced by ``word_tokenize`` that are not stopwords, in
        their original order and original casing.

    Notes
    -----
    Tokens are compared against ``stop_words`` case-sensitively (no lower-case
    conversion), which is why that set also lists capitalised forms.
    The previous version first built a lower-cased filter result and then
    immediately discarded it; that dead pass over the tokens has been removed
    without changing the returned value.
    """
    return [token for token in word_tokenize(text) if token not in stop_words]

# Apply the same preparation to both frames:
#   'joint' = keyword (stringified, so a missing keyword becomes 'nan') + " " + tweet text
#   'clean' = 'joint' with stopwords removed, re-joined into one space-separated string
for frame in (text_train, text_test):
    frame['joint'] = frame['keyword'].astype(str) + " " + frame['text']
    kept_tokens = frame['joint'].apply(dataCleaning)
    frame['clean'] = kept_tokens.apply(" ".join)

text_train.head()

> ### After removing the stopwords (while keeping the negations "no" and "not"), we can see some differences in wording between real-disaster and non-disaster entries in both the text and keyword columns.

In [None]:
# Split the training tweets by label: T holds target == 1 (real disaster),
# F holds target == 0 (not a disaster).
is_disaster = text_train['target'] == 1
is_not_disaster = text_train['target'] == 0
T = text_train[is_disaster]
F = text_train[is_not_disaster]
# Most frequent cleaned words among the real-disaster tweets.
wordCourtBar(T,'clean')

In [None]:
# Most frequent cleaned words among the tweets with target == 0.
wordCourtBar(F,'clean')

In [None]:
# Sort once so both the keyword axis and the hue come from the same ordered frame.
train_by_target = text_train.sort_values(by='target', ascending=False)

# Very tall figure: there is one pair of bars per distinct keyword.
fig = plt.figure(figsize=(8, 72), dpi=100)
sns.countplot(y=train_by_target['keyword'], hue=train_by_target['target'])
plt.tick_params(axis='x', labelsize=15)
plt.tick_params(axis='y', labelsize=12)
plt.legend(loc=1)
plt.title('Target Distribution in Keywords')

plt.show()


# 3. Model Architecture 
> ### Because the meaning of a text depends on the order of its words, an LSTM model is going to be used.
> ### In the first model 
LSTM unit is set to 4. tanh is going to be used as activation functions for the output layer. 

> ### In the second model 
LSTM unit is set to 256. tanh is going to be used as activation functions for the output layer. Normalization is used.

In [None]:
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def word_count(str):
    """Return the number of whitespace-separated words in ``str``.

    An empty or all-whitespace string yields 0.
    """
    return len(str.split())

# Number of words in each cleaned training tweet (kept as a column for inspection).
text_train['wordTotal'] = text_train['clean'].apply(word_count)

# Fit the tokenizer on the full training vocabulary.
# NOTE: the previous code passed the *longest tweet's word count* as `num_words`,
# which capped the vocabulary at a few dozen words and mapped nearly every token
# to nothing. No cap is applied here so every training word gets an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_train['clean'])

# Vocabulary size used as `input_dim` of the Embedding layers: one row per known
# word plus one for index 0, which the tokenizer never assigns and padding uses.
total_words_train = len(tokenizer.word_index) + 1

# Integer-encode the tweets and bring them all to a fixed length of 40 tokens.
train_sequences = tokenizer.texts_to_sequences(text_train['clean'])
padded_train = pad_sequences(train_sequences,maxlen = 40, padding = 'post', truncating = 'post')

y_train = np.asarray(text_train['target'])

In [None]:
import keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop
# Training hyper-parameters shared by both models.
batchSize = 128 
validationSplit = 0.2 
epochs = 100

# Model 1: embedding -> LSTM(4) -> Dense(4) -> single-unit output.
model1 = keras.Sequential()
model1.add(layers.Embedding(input_dim=total_words_train, output_dim=40))
model1.add(layers.LSTM(4))
model1.add(layers.Dense(4))
# The output must be a probability in [0, 1] for binary_crossentropy and for the
# 0.5 decision threshold used at submission time, so sigmoid is used here.
# (tanh, used previously, produces values in [-1, 1] and is not a valid
# probability for this loss.)
model1.add(layers.Dense(1, activation='sigmoid'))

model1.build()
model1.summary()
opt = RMSprop(learning_rate=0.01)
model1.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
# Labels as a NumPy array; recomputed here so this cell does not depend on y_train
# having been left untouched since the tokenization cell.
y_train = np.asarray(text_train['target'])
hist1 = model1.fit(padded_train, y_train, batch_size = batchSize, validation_split = validationSplit, epochs = epochs)

In [None]:
# Model 2: embedding -> LSTM(256) -> Dense(256) -> BatchNormalization -> single-unit output.
model2 = keras.Sequential()
model2.add(layers.Embedding(input_dim=total_words_train, output_dim=40))
model2.add(layers.LSTM(256))
model2.add(layers.Dense(256))
model2.add(layers.BatchNormalization())
# Sigmoid keeps the output in [0, 1], which is what binary_crossentropy and the
# 0.5 decision threshold at submission time expect. (tanh, used previously,
# ranges over [-1, 1] and is not a valid probability for this loss.)
model2.add(layers.Dense(1, activation='sigmoid'))

model2.build()
model2.summary()
opt = RMSprop(learning_rate=0.01)
model2.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
hist2 = model2.fit(padded_train, y_train, batch_size = batchSize, validation_split = validationSplit, epochs = epochs)

# 4. Results and Analysis 

> ### In the first model,
The final training accuracy was 0.5793 and the validation accuracy was 0.5345. It is a fair model. It is probably overfitting, and the accuracy is consistent.


> ### In the second model,
The final training accuracy was 0.7724 and the validation accuracy was 0.6704. It is a good-fit model with a nice training curve. However, it shows overfitting.


In [None]:
# Draw one training-vs-validation accuracy figure per fitted model.
for plot_title, fit_result in (("Model 1 Evaluation", hist1), ("Model 2 Evaluation", hist2)):
    accuracy_log = fit_result.history
    plt.plot(accuracy_log["accuracy"])
    plt.plot(accuracy_log['val_accuracy'])
    plt.title(plot_title)
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend(["Training Accuracy","Validation Accuracy"])

    plt.show()

# 5. Conclusion 
> ### Comparing two models,
The first model performs fairly in terms of accuracy; however, it is overfitting. After adding normalization, the second model improved the accuracy, but it is still overfitting.

> ### Future Improvement
It is suggested to add a dropout layer to address the overfitting, and to consider other RNN techniques such as GRU. SGD or Adam could also be tried as alternative optimization methods.

In [None]:
#Submission
import os

# Number of words in each cleaned test tweet (informational column only).
text_test['wordTotal'] = text_test['clean'].apply(word_count)

# Encode the test tweets with the tokenizer fitted on the TRAINING text.
# Fitting a fresh tokenizer on the test set (as was done before) assigns different
# integer ids to the same words, so the model's embedding rows no longer line up
# with the words it was trained on. It also overwrote `tokenizer` and
# `total_words_train`, which the training cells rely on.
test_sequences = tokenizer.texts_to_sequences(text_test['clean'])
# Use the same length, padding side and truncation side as padded_train;
# the default padding side ('pre') would not match the 'post' padding used in training.
padded_test = pad_sequences(test_sequences,maxlen = 40, padding = 'post', truncating = 'post')
predictions = model2.predict(padded_test, verbose=1)
print(predictions)
# predictions has one column (the single output unit); take it as a flat vector.
pred = np.transpose(predictions)[0]

print(pred)

# Threshold the scores at 0.5 to obtain the 0/1 labels required by the competition.
submission_df = pd.DataFrame()
submission_df['id'] = text_test['id']
submission_df['target'] = np.where(pred < 0.5, 0, 1)
submission_df.to_csv('/kaggle/working/submission.csv', index=False)
submission_df.head()