In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "darkgrid" style to every plot drawn below.
sns.set(style="darkgrid")

# Reading train and test files

In [None]:
# Load the train and test splits from the local data/ directory.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
# Report the (rows, columns) shape of each split.
print(f"Train shape = {train.shape}\nTest shape = {test.shape}")

In [None]:
# Preview the first rows of the training split.
train.head()

In [None]:
# Preview the first rows of the test split.
test.head()

# EDA

In [None]:
# Count missing values per column in each split, then print both summaries.
train_nulls = train.isnull().sum()
test_nulls = test.isnull().sum()
print(f"NULL values\n\nTrain\n{train_nulls}\n\nTest\n{test_nulls}")

In [None]:
# Summary statistics of the training split; include="all" also covers
# the non-numeric columns.
train.describe(include="all")

In [None]:
# Summary statistics of the test split; include="all" also covers
# the non-numeric columns.
test.describe(include="all")

In [None]:
# Class balance: how many tweets are labelled disaster (1) vs. not (0).
target_counts = train["target"].value_counts()
print(target_counts)

# Share of each class as a percentage, rounded to two decimals.
n_disaster = target_counts[1]
n_tweets = sum(target_counts)
p1 = round(100 * (n_disaster / n_tweets), 2)
p2 = round(100 - p1, 2)
print(f"\nPercentage of Disaster Tweets: {p1}%\nPercentage of Non Disaster Tweets: {p2}%")

In [None]:
# Bar chart of the class balance in the training split.
# The column is passed as `x=` explicitly: seaborn's first positional
# parameter is `data`, so a bare positional Series is not reliably
# interpreted as the variable to count.
ax = sns.countplot(x=train["target"])
ax.set_title("Disaster (1) or not (0) barplot")
plt.show()

In [None]:
# Character length of every tweet, computed once per split.
train_lengths = train["text"].apply(len)
test_lengths = test["text"].apply(len)

# Longest tweet in each split
print("Train max length tweet:", train_lengths.max())
print("Test max length tweet:", test_lengths.max(), "\n")

# Shortest tweet in each split
print("Train min length tweet:", train_lengths.min())
print("Test min length tweet:", test_lengths.min())


In [None]:
# Split the training tweets by label and list the ten most frequent
# keywords within each group.
disaster_rows = train[train["target"] == 1]
regular_rows = train[train["target"] == 0]

top_disaster_keywords = disaster_rows["keyword"].value_counts().head(10)
top_regular_keywords = regular_rows["keyword"].value_counts().head(10)

print("Disaster Keywords\n\n", top_disaster_keywords, "\n")
print("Non Disaster Keywords\n\n", top_regular_keywords)

In [None]:
# Side-by-side horizontal bar charts of the ten most frequent keywords,
# disaster tweets on the left and non-disaster tweets on the right.
plt.figure(figsize=(18, 6))

# (subplot position, target label, panel title)
keyword_panels = (
    (121, 1, "Most commom keywords of disaster tweets"),
    (122, 0, "Most commom keywords of non disaster tweets"),
)
for position, label, title in keyword_panels:
    top_keywords = train[train["target"] == label]["keyword"].value_counts()[:10]
    plt.subplot(position)
    plt.title(title)
    sns.barplot(x=top_keywords, y=top_keywords.index, orient="h")

plt.show()

In [None]:
# Split the training tweets by label and list the ten most frequent
# locations within each group. The printed headings name the column actually
# being summarised (location, not keyword).
print("Disaster Locations\n\n",train[train["target"]==1]["location"].value_counts()[:10],"\n")
print("Non Disaster Locations\n\n",train[train["target"]==0]["location"].value_counts()[:10])

In [None]:
# Side-by-side horizontal bar charts of the ten most frequent locations,
# disaster tweets on the left and non-disaster tweets on the right.
plt.figure(figsize=(18, 6))

# (subplot position, target label, panel title)
location_panels = (
    (121, 1, "Most commom locations of disaster tweets"),
    (122, 0, "Most commom locations of non disaster tweets"),
)
for position, label, title in location_panels:
    top_locations = train[train["target"] == label]["location"].value_counts()[:10]
    plt.subplot(position)
    plt.title(title)
    sns.barplot(x=top_locations, y=top_locations.index, orient="h")

plt.show()

In [None]:
#checking for line break in tweets
# Counts how many training tweets do (True) / do not (False) contain "\n".
train['text'].str.contains("\n").value_counts()

In [None]:
#checking for links in tweets
# Match the same URL shapes that remove_links() strips (http://, https://
# and www.), so this count reflects what the cleaning step will remove.
# Searching for the literal "https" alone misses plain http:// and www. links.
train['text'].str.contains(r"https?://|www\.", regex=True).value_counts()

# Feature Engineering

In [None]:
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.stem.snowball import SnowballStemmer
import string
import re

# English Snowball stemmer instance.
stemmer_snowball = SnowballStemmer("english")
# Keras tokenizer that strips the listed punctuation characters plus tabs and
# newlines before splitting; the apostrophe is not in the filter list, so it
# is kept inside tokens.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

In [None]:
def remove_links(tweet):
    """Return ``tweet`` with every URL removed.

    A URL is anything starting with ``http://`` or ``https://``, or with
    ``www.``, running up to the next whitespace character. Each match is
    replaced by an empty string; surrounding whitespace is left untouched.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, "", tweet)

In [None]:
# Strip URLs from the tweet text of both splits.
for frame in (train, test):
    frame["text"] = frame["text"].map(remove_links)

In [None]:
# Lower-case the tweet text of both splits.
for frame in (train, test):
    frame["text"] = frame["text"].map(str.lower)

In [None]:
#Tokenization
# Build the vocabulary from the training tweets only.
tokenizer.fit_on_texts(train["text"])
# Encode each tweet as a list of word indices, then decode back to text so the
# "text" column holds the tokenizer-normalised form of each tweet.
# sequences_train / sequences_test are kept: they are padded later for the LSTM.
sequences_train=tokenizer.texts_to_sequences(train["text"]) 
train["text"] = tokenizer.sequences_to_texts(sequences_train)

# Test tweets are encoded with the vocabulary fitted on train.
# NOTE(review): the tokenizer was created without an oov_token, so words unseen
# in train are presumably dropped here — confirm against the Keras docs.
sequences_test=tokenizer.texts_to_sequences(test["text"]) 
test["text"] = tokenizer.sequences_to_texts(sequences_test)


# Validation with ML models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# One TF-IDF + classifier pipeline per candidate model. Each pipeline re-fits
# its own vectorizer inside every CV fold, so no vocabulary leaks from the
# validation part.
# The tree-based models are seeded so the cross-validated scores are
# repeatable across kernel restarts (same seed as the CV splitter below).
vec_rf = Pipeline([("Tfid",TfidfVectorizer()),("rfclass",RandomForestClassifier(random_state=10))])
vec_xg = Pipeline([("Tfid",TfidfVectorizer()),("xgboost",XGBClassifier(random_state=10))])
vec_nb = Pipeline([("Tfidf",TfidfVectorizer()),("clf",MultinomialNB())])



In [None]:
# Text features and labels used for cross-validation. `y` is also reused by
# the later train/validation split of the LSTM section.
X = train["text"]
y = train["target"]
results_nb = []
results_xg = []
results_rf = []
results_lstm = []

# 2-fold cross-validation repeated 5 times (10 fits per model), seeded so the
# folds are the same on every run.
kf = RepeatedKFold(n_splits=2,n_repeats=5,random_state=10)

# Pair each pipeline with the list that collects its per-fold F1 scores, so
# the fit / predict / score step is written once.
cv_runs = [(vec_nb, results_nb), (vec_xg, results_xg), (vec_rf, results_rf)]

for train_l, valid_l in kf.split(X):
    X_train, X_valid = X.iloc[train_l],X.iloc[valid_l]
    y_train, y_valid = y.iloc[train_l],y.iloc[valid_l]

    for pipeline, scores in cv_runs:
        pipeline.fit(X_train,y_train)
        p = pipeline.predict(X_valid)
        # f1_score expects (y_true, y_pred) in that order.
        scores.append(f1_score(y_valid,p))
    
    

In [None]:
# Mean F1 across all CV folds for each model.
cv_summary = (
    ("Random Forest = ", results_rf),
    ("XGB = ", results_xg),
    ("Naive Baiyes = ", results_nb),
)
for label, fold_scores in cv_summary:
    print(label, np.mean(fold_scores))

# LSTM Model

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [None]:
# Number of embedding rows: one per word known to the tokenizer, plus one.
# NOTE(review): the +1 presumably reserves index 0 for padding — confirm.
VOCAB_SIZE= len(tokenizer.word_index)+1
# Upper bound on training epochs passed to model.fit (early stopping may end
# training sooner).
EPOCHS=30
# Length every tokenized tweet is padded or truncated to.
MAXLEN=40

In [None]:
# Pad / truncate every index sequence at its end ('post') to MAXLEN entries.
# Note: this rebinds `X`, which previously held the raw text column used for
# cross-validation, to the padded train sequences.
X = pad_sequences(sequences_train,maxlen=MAXLEN, padding='post', truncating='post')
test_pad = pad_sequences(sequences_test,maxlen=MAXLEN, padding='post', truncating='post')

In [None]:
# Hold out 20% of the padded training tweets for validation.
# random_state makes the split repeatable across kernel restarts, and
# stratify=y keeps the disaster / non-disaster ratio equal in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=10, stratify=y
)

In [None]:
# Binary classifier: embedding -> spatial dropout -> bidirectional LSTM ->
# single-unit output.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE,32,input_length=MAXLEN),
    tf.keras.layers.SpatialDropout1D(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    # Sigmoid, not ReLU: binary_crossentropy expects a probability in [0, 1].
    # A ReLU output is unbounded above and can sit at exactly 0, which makes
    # the loss ill-defined and the 0.5 decision threshold meaningless.
    tf.keras.layers.Dense(1,activation="sigmoid")
])
# Stop once validation accuracy has not improved for 5 epochs and roll back to
# the best weights seen. 'val_acc' matches the 'acc' metric name used in
# compile() below.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_acc', 
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True)

model.summary()
model.compile(loss="binary_crossentropy",optimizer="Adam", metrics=['acc'])

In [None]:
# Train the LSTM, evaluating on the held-out split after every epoch; the
# early-stopping callback may end training before EPOCHS is reached.
history = model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Create Submission

In [None]:
#LSTM submission
# The model ends in a single-unit layer, so predict() yields one score per
# row in an (n, 1) array. Flatten it and threshold in one vectorised step:
# iterating over the rows and calling int() on each 1-element array relies on
# array-to-scalar conversion that NumPy has deprecated.
y_test = model.predict(test_pad)
p = (np.ravel(y_test) > 0.5).astype(int).tolist()

In [None]:
# Build the submission: predicted labels indexed by the test tweet id, in a
# column named "target".
sub = pd.Series(p,index=test["id"],name="target")
# header=True writes the column names as the first line of the CSV.
sub.to_csv("submission.csv",header=True)