In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import time
%matplotlib inline

### Reading the Data

In [2]:
train = pd.read_csv("../input/nlp-getting-started/train.csv")
train = train[['text','target']]
train.drop_duplicates(inplace=True)
test = pd.read_csv("../input/nlp-getting-started/test.csv")
test = test[['id','text']]

### Cleaning the tweets

In [3]:
import string
import re
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def clean(text):
    # only aplhabest
    text =  re.sub(r'[^A-Za-z ]+', '', text) 
    # converting all the teweets to lower case
    text = text.lower()
    # remove urls and html tags
    url = re.compile(r"https?://\s+|www\.\S+")
    text = url.sub(r"", text)
    html = re.compile(r"<.*?>")
    text = html.sub(r"",text)
    # removing emojis
    emoji = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    text = emoji.sub(r'',text)
    #removing punctutions
    punctuations = str.maketrans("","", string.punctuation)
    text = text.translate(punctuations)
    #lemmatization
    text = text.split(' ')
    text = [lemmatizer.lemmatize(i) for i in text]
    text = ' '.join(text) 
    return text

In [4]:
train['text'] = train.text.apply(lambda x: clean(x))

In [5]:
def tweet_cleaner(tweet):
    # Acronyms and miswritten words
    tweet = re.sub(r"Typhoon-Devastated", "typhoon devastated", tweet)
    tweet = re.sub(r"TyphoonDevastated", "typhoon devastated", tweet)
    tweet = re.sub(r"typhoondevastated", "typhoon devastated", tweet)
    tweet = re.sub(r"MH370", "Malaysia Airlines Flight", tweet)
    tweet = re.sub(r"MH", "Malaysia Airlines Flight", tweet)
    tweet = re.sub(r"mh370", "Malaysia Airlines Flight", tweet)
    tweet = re.sub(r"year-old", "years old", tweet)
    tweet = re.sub(r"yearold", "years old", tweet)
    tweet = re.sub(r"yr old", "years old", tweet)
    tweet = re.sub(r"PKK", "Kurdistan Workers Party", tweet)
    tweet = re.sub(r"MP", "madhya pradesh", tweet)
    tweet = re.sub(r"rly", "railway", tweet)
    tweet = re.sub(r"CDT", "Central Daylight Time", tweet)
    tweet = re.sub(r"sensorsenso", "sensor senso", tweet)
    tweet = re.sub(r"pm", "", tweet)
    tweet = re.sub(r"PM", "", tweet)
    tweet = re.sub(r"nan", " ", tweet)
    tweet = re.sub(r"terrorismturn", "terrorism turn", tweet)
    tweet = re.sub(r"epicente", "epicenter", tweet)
    tweet = re.sub(r"epicenterr", "epicenter", tweet)
    tweet = re.sub(r"WAwildfire", "Washington Wildfire", tweet)
    tweet = re.sub(r"prebreak", "pre break", tweet)
    tweet = re.sub(r"nowplaying", "now playing", tweet)
    tweet = re.sub(r"RT", "retweet", tweet)
    tweet = re.sub(r"EbolaOutbreak", "Ebola Outbreak", tweet)
    tweet = re.sub(r"LondonFire", "London Fire", tweet)
    tweet = re.sub(r"IDFire", "Idaho Fire", tweet)
    tweet = re.sub(r"withBioterrorism&use", "with Bioterrorism & use", tweet)
    tweet = re.sub(r"NASAHurricane", "NASA Hurricane", tweet)
    tweet = re.sub(r"withweapons", "with weapons", tweet)
    tweet = re.sub(r"NuclearPower", "Nuclear Power", tweet)
    tweet = re.sub(r"WhiteTerrorism", "White Terrorism", tweet)
    tweet = re.sub(r"MyanmarFlood", "Myanmar Flood", tweet)
    tweet = re.sub(r"ExtremeWeather", "Extreme Weather", tweet)

    # Special characters
    tweet = re.sub(r"%20", " ", tweet)
    tweet = re.sub(r"%", " ", tweet)
    tweet = re.sub(r"@", " ", tweet)
    tweet = re.sub(r"#", " ", tweet)
    tweet = re.sub(r"'", " ", tweet)
    tweet = re.sub(r"\x89û_", " ", tweet)
    tweet = re.sub(r"\x89ûò", " ", tweet)
    tweet = re.sub(r"16yr", "16 year", tweet)
    tweet = re.sub(r"re\x89û_", " ", tweet)
    tweet = re.sub(r"\x89û", " ", tweet)
    tweet = re.sub(r"\x89Û", " ", tweet)
    tweet = re.sub(r"re\x89Û", "re ", tweet)
    tweet = re.sub(r"re\x89û", "re ", tweet)
    tweet = re.sub(r"\x89ûª", "'", tweet)
    tweet = re.sub(r"\x89û", " ", tweet)
    tweet = re.sub(r"\x89ûò", " ", tweet)
    tweet = re.sub(r"\x89Û_", "", tweet)
    tweet = re.sub(r"\x89ÛÒ", "", tweet)
    tweet = re.sub(r"\x89ÛÓ", "", tweet)
    tweet = re.sub(r"\x89ÛÏWhen", "When", tweet)
    tweet = re.sub(r"\x89ÛÏ", "", tweet)
    tweet = re.sub(r"China\x89Ûªs", "China's", tweet)
    tweet = re.sub(r"let\x89Ûªs", "let's", tweet)
    tweet = re.sub(r"\x89Û÷", "", tweet)
    tweet = re.sub(r"\x89Ûª", "", tweet)
    tweet = re.sub(r"\x89Û\x9d", "", tweet)
    tweet = re.sub(r"å_", "", tweet)
    tweet = re.sub(r"\x89Û¢", "", tweet)
    tweet = re.sub(r"\x89Û¢åÊ", "", tweet)
    tweet = re.sub(r"fromåÊwounds", "from wounds", tweet)
    tweet = re.sub(r"åÊ", "", tweet)
    tweet = re.sub(r"åÈ", "", tweet)
    tweet = re.sub(r"JapÌ_n", "Japan", tweet)    
    tweet = re.sub(r"Ì©", "e", tweet)
    tweet = re.sub(r"å¨", "", tweet)
    tweet = re.sub(r"SuruÌ¤", "Suruc", tweet)
    tweet = re.sub(r"åÇ", "", tweet)
    tweet = re.sub(r"å£3million", "3 million", tweet)
    tweet = re.sub(r"åÀ", "", tweet)

    # Contractions
    tweet = re.sub(r"he's", "he is", tweet)
    tweet = re.sub(r"there's", "there is", tweet)
    tweet = re.sub(r"We're", "We are", tweet)
    tweet = re.sub(r"That's", "That is", tweet)
    tweet = re.sub(r"won't", "will not", tweet)
    tweet = re.sub(r"they're", "they are", tweet)
    tweet = re.sub(r"Can't", "Cannot", tweet)
    tweet = re.sub(r"wasn't", "was not", tweet)
    tweet = re.sub(r"don\x89Ûªt", "do not", tweet)
    tweet = re.sub(r"aren't", "are not", tweet)
    tweet = re.sub(r"isn't", "is not", tweet)
    tweet = re.sub(r"What's", "What is", tweet)
    tweet = re.sub(r"haven't", "have not", tweet)
    tweet = re.sub(r"hasn't", "has not", tweet)
    tweet = re.sub(r"There's", "There is", tweet)
    tweet = re.sub(r"He's", "He is", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"You're", "You are", tweet)
    tweet = re.sub(r"I'M", "I am", tweet)
    tweet = re.sub(r"Im", "I am", tweet)
    tweet = re.sub(r"shouldn't", "should not", tweet)
    tweet = re.sub(r"wouldn't", "would not", tweet)
    tweet = re.sub(r"i'm", "I am", tweet)
    tweet = re.sub(r"I\x89Ûªm", "I am", tweet)
    tweet = re.sub(r"I'm", "I am", tweet)
    tweet = re.sub(r"Isn't", "is not", tweet)
    tweet = re.sub(r"Here's", "Here is", tweet)
    tweet = re.sub(r"you've", "you have", tweet)
    tweet = re.sub(r"you\x89Ûªve", "you have", tweet)
    tweet = re.sub(r"we're", "we are", tweet)
    tweet = re.sub(r"what's", "what is", tweet)
    tweet = re.sub(r"couldn't", "could not", tweet)
    tweet = re.sub(r"we've", "we have", tweet)
    tweet = re.sub(r"it\x89Ûªs", "it is", tweet)
    tweet = re.sub(r"doesn\x89Ûªt", "does not", tweet)
    tweet = re.sub(r"It\x89Ûªs", "It is", tweet)
    tweet = re.sub(r"Here\x89Ûªs", "Here is", tweet)
    tweet = re.sub(r"who's", "who is", tweet)
    tweet = re.sub(r"I\x89Ûªve", "I have", tweet)
    tweet = re.sub(r"y'all", "you all", tweet)
    tweet = re.sub(r"can\x89Ûªt", "cannot", tweet)
    tweet = re.sub(r"would've", "would have", tweet)
    tweet = re.sub(r"it'll", "it will", tweet)
    tweet = re.sub(r"we'll", "we will", tweet)
    tweet = re.sub(r"wouldn\x89Ûªt", "would not", tweet)
    tweet = re.sub(r"We've", "We have", tweet)
    tweet = re.sub(r"he'll", "he will", tweet)
    tweet = re.sub(r"Y'all", "You all", tweet)
    tweet = re.sub(r"Weren't", "Were not", tweet)
    tweet = re.sub(r"Didn't", "Did not", tweet)
    tweet = re.sub(r"they'll", "they will", tweet)
    tweet = re.sub(r"they'd", "they would", tweet)
    tweet = re.sub(r"DON'T", "DO NOT", tweet)
    tweet = re.sub(r"That\x89Ûªs", "That is", tweet)
    tweet = re.sub(r"they've", "they have", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"should've", "should have", tweet)
    tweet = re.sub(r"You\x89Ûªre", "You are", tweet)
    tweet = re.sub(r"where's", "where is", tweet)
    tweet = re.sub(r"Don\x89Ûªt", "Do not", tweet)
    tweet = re.sub(r"we'd", "we would", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"weren't", "were not", tweet)
    tweet = re.sub(r"They're", "They are", tweet)
    tweet = re.sub(r"Can\x89Ûªt", "Cannot", tweet)
    tweet = re.sub(r"you\x89Ûªll", "you will", tweet)
    tweet = re.sub(r"I\x89Ûªd", "I would", tweet)
    tweet = re.sub(r"let's", "let us", tweet)
    tweet = re.sub(r"it's", "it is", tweet)
    tweet = re.sub(r"can't", "can not", tweet)
    tweet = re.sub(r"cant", "can not", tweet)
    tweet = re.sub(r"don't", "do not", tweet)
    tweet = re.sub(r"dont", "do not", tweet)
    tweet = re.sub(r"you're", "you are", tweet)
    tweet = re.sub(r"i've", "I have", tweet)
    tweet = re.sub(r"that's", "that is", tweet)
    tweet = re.sub(r"i'll", "I will", tweet)
    tweet = re.sub(r"doesn't", "does not", tweet)
    tweet = re.sub(r"i'd", "I would", tweet)
    tweet = re.sub(r"didn't", "did not", tweet)
    tweet = re.sub(r"ain't", "am not", tweet)
    tweet = re.sub(r"you'll", "you will", tweet)
    tweet = re.sub(r"I've", "I have", tweet)
    tweet = re.sub(r"Don't", "do not", tweet)
    tweet = re.sub(r"I'll", "I will", tweet)
    tweet = re.sub(r"I'd", "I would", tweet)
    tweet = re.sub(r"Let's", "Let us", tweet)
    tweet = re.sub(r"you'd", "You would", tweet)
    tweet = re.sub(r"It's", "It is", tweet)
    tweet = re.sub(r"Ain't", "am not", tweet)
    tweet = re.sub(r"Haven't", "Have not", tweet)
    tweet = re.sub(r"Could've", "Could have", tweet)
    tweet = re.sub(r"youve", "you have", tweet)  
    tweet = re.sub(r"donå«t", "do not", tweet)
    return tweet

In [6]:
train['text'] = train.text.apply(lambda x: tweet_cleaner(x))

### Removing stop words

In [7]:
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))
def remove_stopwords(text):
    text = [word.lower() for word in text.split() if word.lower() not in stop]
    return " ".join(text)
train["text"] = train["text"].map(remove_stopwords)

In [8]:
train.text

0              deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3       people receive wildfire evacuation order calif...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7604    worldnews fallen powerlines glink tram update ...
7605    flip side im walmart bomb everyone evacuate st...
7606    suicide bomber kill saudi security site mosque...
7608    two giant crane holding bridge collapse nearby...
7612    latest home razed northern california wildfire...
Name: text, Length: 7521, dtype: object

### Embeddings using GloVe

In [9]:
from nltk.tokenize import word_tokenize
import nltk

def create_corpus_tk(df):
    corpus=[]
    for text in train["text"]:
        words = [word.lower() for word in word_tokenize(text)]
        corpus.append(words)
    return corpus

In [10]:
corpus = create_corpus_tk(train)

In [11]:
num_words = len(corpus)
print(num_words)

7521


### Train/Test Split

In [12]:
train_size = int(train.shape[0]*0.8)

train_sentences = train.text[:train_size]
train_labels = train.target[:train_size]

test_sentences = train.text[train_size:]
test_labels = train.target[train_size:]

### Convert texts to Sequence of numbers

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
max_len = 50

In [14]:
tokenizer = Tokenizer(num_words = num_words)
tokenizer.fit_on_texts(train_sentences)

In [15]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)

In [16]:
train_sentences.head()

0           deed reason earthquake may allah forgive u
1                forest fire near la ronge sask canada
2    resident asked shelter place notified officer ...
3    people receive wildfire evacuation order calif...
4    got sent photo ruby alaska smoke wildfire pour...
Name: text, dtype: object

In [17]:
train_sequences[1:5]

[[82, 2, 147, 449, 5015, 5016, 1133],
 [1371, 1134, 1772, 450, 5017, 971, 138, 1772, 450, 308, 1135],
 [11, 3365, 104, 138, 308, 33],
 [38, 1052, 117, 5018, 1223, 879, 104, 5019, 98]]

In [18]:
# padding our sequences to pass it in keras model (inputs should be of the same size)
train_padded = pad_sequences(
    train_sequences, maxlen=max_len, truncating="post", padding="post"
)

In [19]:
train_padded

array([[3363,  495,  248, ...,    0,    0,    0],
       [  82,    2,  147, ...,    0,    0,    0],
       [1371, 1134, 1772, ...,    0,    0,    0],
       ...,
       [  20,   72,  312, ...,    0,    0,    0],
       [ 118,    3, 1318, ...,    0,    0,    0],
       [1318,  444,   17, ...,    0,    0,    0]], dtype=int32)

In [20]:
# padding the test data
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(
    test_sequences, maxlen=max_len, padding="post", truncating="post"
)

In [21]:
test_padded

array([[ 118,  234,  615, ...,    0,    0,    0],
       [ 291, 1318,  679, ...,    0,    0,    0],
       [ 381, 1318,  653, ...,    0,    0,    0],
       ...,
       [1083,  171, 1842, ...,    0,    0,    0],
       [  50,  570,  865, ...,    0,    0,    0],
       [ 106,   23,  342, ...,    0,    0,    0]], dtype=int32)

In [22]:
print(train.text[0])
print(train_sequences[0])

deed reason earthquake may allah forgive u
[3363, 495, 248, 57, 1532, 3364, 7]


In [23]:
# creating a dictionary out of the words and their respective sequences
word_index = tokenizer.word_index
print(len(word_index))

17160


In [24]:
word_index["like"]

3

### Creating the Embedding dict

In [25]:
embedding_dict = {}
with open("../input/glovetwitter27b100dtxt/glove.twitter.27B.100d.txt") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.array(values[1:],"float32")
        embedding_dict[word] = vectors
f.close()

In [26]:
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words,100))

for word, i in word_index.items():
    if i<num_words:
        emb_vec = embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i] = emb_vec

In [27]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.26750001,  0.77982002, -0.87296999, ...,  0.77863997,
        -0.35301   ,  1.02509999],
       [ 0.066373  ,  1.09249997, -0.59674001, ...,  0.040076  ,
        -0.12083   , -0.1785    ],
       ...,
       [ 0.048457  , -0.54645997, -0.05239   , ..., -0.32775   ,
        -0.22183   ,  0.18402   ],
       [-0.02567   ,  0.67838001, -0.26210001, ...,  0.11029   ,
         0.92426997, -0.6354    ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [28]:
print(train_padded.shape)
print(train_labels.shape)

(6016, 50)
(6016,)


In [29]:
print(test_padded.shape)
print(test_labels.shape)

(1505, 50)
(1505,)


### Baseline Model with GloVe

In [30]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from keras.initializers import Constant
from keras.optimizers import adam_v2

model = Sequential()

model.add(
    Embedding(
        num_words,
        100,
        embeddings_initializer = Constant(embedding_matrix),
        input_length = max_len,
        trainable = False
    )
)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2)) 
# model.add(LSTM(100,dropout=0.1))
model.add(Dense(1, activation="sigmoid"))

optimizer = adam_v2.Adam(learning_rate=5e-3)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
model.summary() 


User settings:

   KMP_AFFINITY=granularity=fine,verbose,compact,1,0
   KMP_BLOCKTIME=0
   KMP_DUPLICATE_LIB_OK=True
   KMP_INIT_AT_FORK=FALSE
   KMP_SETTINGS=1

Effective settings:

   KMP_ABORT_DELAY=0
   KMP_ADAPTIVE_LOCK_PROPS='1,1024'
   KMP_ALIGN_ALLOC=64
   KMP_ALL_THREADPRIVATE=128
   KMP_ATOMIC_MODE=2
   KMP_BLOCKTIME=0
   KMP_CPUINFO_FILE: value is not defined
   KMP_DETERMINISTIC_REDUCTION=false
   KMP_DEVICE_THREAD_LIMIT=2147483647
   KMP_DISP_NUM_BUFFERS=7
   KMP_DUPLICATE_LIB_OK=true
   KMP_ENABLE_TASK_THROTTLING=true
   KMP_FORCE_REDUCTION: value is not defined
   KMP_FOREIGN_THREADS_THREADPRIVATE=true
   KMP_FORKJOIN_BARRIER='2,2'
   KMP_FORKJOIN_BARRIER_PATTERN='hyper,hyper'
   KMP_GTID_MODE=3
   KMP_HANDLE_SIGNALS=false
   KMP_HOT_TEAMS_MAX_LEVEL=1
   KMP_HOT_TEAMS_MODE=0
   KMP_INIT_AT_FORK=true
   KMP_LIBRARY=throughput
   KMP_LOCK_KIND=queuing
   KMP_MALLOC_POOL_INCR=1M
   KMP_NUM_LOCKS_IN_BLOCK=1
   KMP_PLAIN_BARRIER='2,2'
   KMP_PLAIN_BARRIER_PATTERN='hyper,hype

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 100)           1716100   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 50, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 1,796,601
Trainable params: 80,501
Non-trainable params: 1,716,100
_________________________________________________________________


In [31]:
history  = model.fit(
    train_padded,
    train_labels,
    epochs=15,
    validation_data=(test_padded, test_labels),
    verbose=1,
    batch_size=32
)

2021-12-31 11:44:46.439460: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [32]:
# test data for submission
test['text'] = test.text.apply(lambda x: clean(x))
test["text"] = test["text"].map(remove_stopwords)
test['text'] = test.text.apply(lambda x: tweet_cleaner(x))

In [33]:
sequences = tokenizer.texts_to_sequences(test.text)
padded = pad_sequences(sequences, maxlen=max_len, padding="post", truncating="post")

In [34]:
pred = model.predict(padded)
pred_int = pred.round().astype("int")

In [35]:
pred

array([[0.6889759 ],
       [0.603262  ],
       [0.8279537 ],
       ...,
       [0.8037715 ],
       [0.81113076],
       [0.4241085 ]], dtype=float32)

In [36]:
pred_int.shape

(3263, 1)

In [37]:
test.shape

(3263, 2)

In [38]:
test["target"] = pred_int
submission = test[['id','target']]

In [39]:
submission.to_csv("submission.csv", index=False)

In [40]:
test.head(100)

Unnamed: 0,id,text,target
0,0,happened terrible car crash,1
1,2,heard earthquake different city stay safe ever...,1
2,3,forest fire spot pond goose fleeing across str...,1
3,9,apocalypse lighting spokane wildfire,1
4,11,typhoon soudelor kill china taiwan,1
...,...,...,...
95,323,nature appropriates love burn become annihilat...,0
96,324,ninahoag shred psych work friendship would ann...,0
97,325,thehill example conservative annihilated burto...,0
98,326,aug kaiserjaegers wiped francis joseph crack r...,0
