In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import time

In [3]:
df = pd.read_csv('/content/train.csv')

In [4]:
# df.shape
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
print((df.target == 1).sum())
print((df.target == 0).sum())

3271
4342


In [6]:
# Preprocessing
# To remove URL's and Punctuations
import re
import string

def remove_url(text):
  url = re.compile(r'https?://\S+|www\.\S+')
  return url.sub(r'',text)

def remove_punct(text):
  translator = str.maketrans('','',string.punctuation)
  return text.translate(translator)

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
pattern = re.compile(r'https?://\S+|www\.\S+')
for t in df.text:
  matches = pattern.findall(t)
  for match_new in matches:
    print(t)
    print(match_new)
    print(pattern.sub(r'',t))
  if len(matches) > 0:
    break


@bbcmtd Wholesale Markets ablaze http://t.co/lHYXEOHY6C
http://t.co/lHYXEOHY6C
@bbcmtd Wholesale Markets ablaze 


In [8]:
df['text'] = df.text.map(remove_url)
df['text'] = df.text.map(remove_punct)

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop = set(stopwords.words('english'))

def remove_stopwords(text):
  filtered_words = [word.lower() for word in text.split() if word.lower() not in stop]
  return " ".join(filtered_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
# stop
df['text'] = df.text.map(remove_stopwords)

In [11]:
df.text

0            deeds reason earthquake may allah forgive us
1                   forest fire near la ronge sask canada
2       residents asked shelter place notified officer...
3       13000 people receive wildfires evacuation orde...
4       got sent photo ruby alaska smoke wildfires pou...
                              ...                        
7608    two giant cranes holding bridge collapse nearb...
7609    ariaahrary thetawniest control wild fires cali...
7610                      m194 0104 utc5km volcano hawaii
7611    police investigating ebike collided car little...
7612    latest homes razed northern california wildfir...
Name: text, Length: 7613, dtype: object

In [12]:
from collections import Counter

def counter_word(text_col):
  count = Counter()
  for text in text_col.values:
    for word in text.split():
      count[word]+=1
  return count

counter = counter_word(df.text)

In [13]:
len(counter)

17971

In [14]:
# counter
counter.most_common(10)

[('like', 345),
 ('im', 299),
 ('amp', 298),
 ('fire', 250),
 ('get', 229),
 ('new', 224),
 ('via', 220),
 ('people', 196),
 ('one', 193),
 ('news', 193)]

In [15]:
num_unique_words = len(counter)

In [16]:
# Spliting the Dataset for training and validation
train_size = int(df.shape[0] * 0.8)

train_df = df[:train_size]
valid_df = df[train_size:]

# Spliting the text and labels
train_sentences = train_df.text.to_numpy()
train_labels = train_df.target.to_numpy()
val_sentences = valid_df.text.to_numpy()
val_labels = valid_df.target.to_numpy()

In [17]:
train_sentences.shape, val_sentences.shape

((6090,), (1523,))

In [18]:
# For Tokenizations of the text And Training

from tensorflow.keras.preprocessing.text import Tokenizer

token = Tokenizer(num_words=num_unique_words)
token.fit_on_texts(train_sentences)

In [19]:
word_index = token.word_index

In [20]:
# word_index
train_sequences = token.texts_to_sequences(train_sentences)
val_sequences = token.texts_to_sequences(val_sentences)

In [21]:
print(train_sentences[10:20])
print(train_sequences[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[[520, 8, 395, 156, 297, 411], [749, 470, 2248, 138, 2249, 2813, 521, 611, 188, 470, 2248, 189, 189, 5679, 117], [2814, 117, 1884, 5680, 2248, 1285, 1450, 522, 256, 644, 2815], [99, 3742, 612, 1451, 3742], [111, 91, 336, 3743, 3744, 52, 22, 312], [433, 26], [42, 5681], [237, 1286], [52, 698], [5682]]


In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_length = 40

train_padded = pad_sequences(train_sequences,maxlen=max_length,padding='post',truncating='post')
val_padded = pad_sequences(val_sequences,maxlen=max_length,padding='post',truncating='post')

train_padded.shape,val_padded.shape

((6090, 40), (1523, 40))

In [23]:
# train_padded[10]
print(train_sentences[10])
print(train_sequences[10])
print(train_padded[10])

three people died heat wave far
[520, 8, 395, 156, 297, 411]
[520   8 395 156 297 411   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0]


In [24]:
# Check reversing the indices
# Flip the key and values

reverse_word_index = dict([idx,word] for (word,idx) in word_index.items())

reverse_word_index

{1: 'like',
 2: 'amp',
 3: 'fire',
 4: 'im',
 5: 'get',
 6: 'via',
 7: 'new',
 8: 'people',
 9: 'news',
 10: 'dont',
 11: 'emergency',
 12: 'one',
 13: '2',
 14: 'us',
 15: 'video',
 16: 'disaster',
 17: 'burning',
 18: 'body',
 19: 'would',
 20: 'buildings',
 21: 'police',
 22: 'crash',
 23: 'first',
 24: 'california',
 25: 'still',
 26: 'man',
 27: 'got',
 28: 'know',
 29: 'day',
 30: 'back',
 31: 'going',
 32: 'two',
 33: 'time',
 34: 'full',
 35: 'accident',
 36: 'see',
 37: 'world',
 38: 'attack',
 39: 'nuclear',
 40: 'youtube',
 41: 'may',
 42: 'love',
 43: 'go',
 44: 'rt',
 45: 'many',
 46: 'cant',
 47: '3',
 48: 'watch',
 49: 'collapse',
 50: 'dead',
 51: 'today',
 52: 'car',
 53: 'mass',
 54: 'want',
 55: 'years',
 56: 'work',
 57: 'train',
 58: 'last',
 59: 'good',
 60: 'think',
 61: 'families',
 62: 'hiroshima',
 63: 'life',
 64: 'fires',
 65: 'best',
 66: 'could',
 67: 'say',
 68: 'u',
 69: 'death',
 70: 'hot',
 71: 'forest',
 72: 'way',
 73: 'killed',
 74: 'need',
 75: 'le

In [25]:
def decode(sequence):
  return ' '.join([reverse_word_index.get(idx, "?") for idx in sequence])

In [26]:
decoded_txt = decode(train_sequences[40])
print(train_sequences[40])
print(decoded_txt)

[171, 2819]
check nsfw


In [27]:
# Creating LSTM Model for training. Using the RNN approach

from tensorflow.keras import layers,models

model = keras.models.Sequential()
model.add(layers.Embedding(num_unique_words,40,input_length=max_length))
model.add(layers.LSTM(70,dropout=0.2))
model.add(layers.Dense(1,activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 40)            718840    
                                                                 
 lstm (LSTM)                 (None, 70)                31080     
                                                                 
 dense (Dense)               (None, 1)                 71        
                                                                 
Total params: 749991 (2.86 MB)
Trainable params: 749991 (2.86 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
loss = keras.losses.BinaryCrossentropy(from_logits=True)
optim = keras.optimizers.Adam(lr=0.001)
metrics = ['accuracy','mse']

model.compile(loss=loss,optimizer=optim,metrics=metrics)

# model.fit(train_padded,train_labels,epochs=40,validation_data=(val_padded,val_labels,verbose=2))



In [29]:
model.fit(train_padded,train_labels,epochs=40,validation_data=(val_padded,val_labels),verbose='auto')

Epoch 1/40


  output, from_logits = _get_logits(


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x787ca5ca91b0>

In [30]:
import pickle

nlp_file = 'nlp_basics.pkl'

with open(nlp_file,'wb') as file:
  pickle.dump(model,file)

In [31]:
prediction = model.predict(train_padded)
predictions = [1 if p > 0.5 else 0 for p in prediction]



In [32]:
print(train_sentences[10:20])

print(train_labels[10:20])
print(predictions[10:20])

['three people died heat wave far'
 'haha south tampa getting flooded hah wait second live south tampa gonna gonna fvck flooding'
 'raining flooding florida tampabay tampa 18 19 days ive lost count'
 'flood bago myanmar arrived bago'
 'damage school bus 80 multi car crash breaking' 'whats man' 'love fruits'
 'summer lovely' 'car fast' 'goooooooaaaaaal']
[1 1 1 1 1 0 0 0 0 0]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
