<a href="https://colab.research.google.com/github/imkunals726/NLP_Disaster_tweets/blob/master/NLP_Disaster_Tweets_using_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install kaggle

In [0]:
from google.colab import files
# files.upload()

In [0]:

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [10]:
# !kaggle datasets list -s nlp-gett
# Download the competition data as a zip archive into the working directory.
!kaggle datasets download -d misakrug/nlpgettingstarted 

Downloading nlpgettingstarted.zip to /content
  0% 0.00/593k [00:00<?, ?B/s]
100% 593k/593k [00:00<00:00, 37.6MB/s]


In [0]:
# !unzip nlpgettingstarted.zip

Helper functions

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer , TfidfTransformer
from sklearn.preprocessing import StandardScaler

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

import numpy as np
import pandas as pd

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Shared word normalisers used by clean_text: each word is lemmatized, then stemmed.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer( 'english')

def num_remover( val):
    """Drop every whitespace-separated token that begins with a digit 0-9.

    Parameters
    ----------
    val : str
        Text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (runs of whitespace in
        the input are therefore collapsed).
    """
    digit_prefixes = tuple(str(digit) for digit in range(10))
    kept = [word for word in val.split() if not word.startswith(digit_prefixes)]
    return ' '.join(kept)


def replace_urls(text):
    """Swap links and @-mentions for fixed placeholder words.

    A token starting with ``http`` (any letter case) becomes ``url``; a token
    starting with ``@`` becomes ``taggeduser``; every other token is kept
    unchanged.

    Parameters
    ----------
    text : str
        Text to process; it is split on whitespace.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    def _placeholder(token):
        lowered = token.lower()
        if lowered.startswith('http'):
            return 'url'
        if lowered.startswith('@'):
            return 'taggeduser'
        return token

    return ' '.join(_placeholder(token) for token in text.split())


def clean_text(df):
    """Normalise the ``text`` column of a tweets frame before tokenisation.

    Steps, in order:

    1. Links and @-mentions are replaced by placeholders (``replace_urls``).
    2. The HTML entity ``&amp`` (with its optional ``;``), the standalone
       word ``and``, and ``#`` characters are removed.
    3. Every space-separated word is lemmatized and then stemmed.
    4. ``keyword`` gets NaN -> ``''`` and ``%20`` -> space; a non-empty
       keyword is appended to the text.
    5. Tokens starting with a digit are dropped (``num_remover``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``text`` and ``keyword``.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``. The frame passed in is left untouched.
    """
    # Work on a copy so the caller's frame is not silently modified.
    df = df.copy()

    df['text'] = df['text'].apply(replace_urls)

    # \band\b matches only the standalone word; a plain substring replace
    # would also eat the inside of words such as "island" or "thousand".
    # '&amp;?' additionally removes the ';' that terminates the entity.
    # regex=True is explicit because the pandas default differs between versions.
    noise_pattern = r'&amp;?|\band\b|#'
    df['text'] = df['text'].str.replace(noise_pattern, '', regex=True)

    df['text'] = df['text'].apply(
        lambda txt: ' '.join(
            stemmer.stem(lemmatizer.lemmatize(word)) for word in txt.split(' ')
        )
    )

    # '%20' is a literal (URL-encoded space), not a pattern.
    df['keyword'] = df['keyword'].fillna('').str.replace('%20', ' ', regex=False)
    df['text'] = df.apply(
        lambda row: str(row['text']) + ' ' + str(row['keyword']) if row['keyword'] else row['text'],
        axis=1,
    )

    df['text'] = df['text'].apply(num_remover)

    return df

In [2]:
# Load the labelled training tweets and preview the first rows.
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


Let's clean the text

In [3]:
import nltk
# Fetch the WordNet corpus; presumably required by the WordNetLemmatizer used in clean_text.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Rebinds train_df to its cleaned form. Not idempotent: clean_text appends the
# keyword to the text on every call, so run this cell only once per load.
train_df = clean_text(train_df)

In [0]:
from sklearn.model_selection  import train_test_split
# Hold out a validation split (scikit-learn's default proportions). A fixed
# random_state keeps the split, and so the validation metrics, reproducible.
X_train, X_val , y_train, y_val = train_test_split(train_df[['text']] , train_df['target'], random_state=42)

In [0]:
import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# Tokenisation / model hyper-parameters shared by the cells below.
vocab_size = 10000  # Tokenizer num_words and Embedding input dimension
embedding_dim = 100  # size of each embedding vector
max_length = 100  # maxlen passed to pad_sequences and Embedding input_length
trunc_type='post'  # `truncating` argument of pad_sequences
padding_type='post'  # `padding` argument of pad_sequences
oov_tok = "<OOV>"  # Tokenizer oov_token
training_size = 20000  # NOTE(review): not referenced in the visible cells

In [0]:
def prepare_data(sentences_to_convert, tokenizer):
  """Convert sentences into a padded matrix of token ids.

  Args:
    sentences_to_convert: texts to encode.
    tokenizer: an already-fitted tokenizer providing ``texts_to_sequences``.

  Returns:
    The result of ``pad_sequences`` on the encoded texts, using the
    notebook-level ``max_length``, ``padding_type`` and ``trunc_type``.
  """
  sequences = tokenizer.texts_to_sequences(sentences_to_convert)
  return pad_sequences(
      sequences,
      maxlen=max_length,
      padding=padding_type,
      truncating=trunc_type,
  )

In [0]:
training_sentences = X_train['text']

# The vocabulary is fitted on the training split only; the same tokenizer is
# then reused to encode the validation split.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

testing_sentences = X_val['text']

# Note: "testing_*" here holds the validation split (X_val), not test.csv.
training_padded = prepare_data(training_sentences, tokenizer)
testing_padded = prepare_data(testing_sentences, tokenizer)


In [0]:
def create_model():
  """Build and compile a fresh binary classifier.

  Architecture: Embedding -> GlobalAveragePooling1D -> Dense(24, relu) ->
  Dense(1, sigmoid), sized by the notebook-level ``vocab_size``,
  ``embedding_dim`` and ``max_length``.

  Returns:
    A compiled ``tf.keras.Sequential`` model (binary cross-entropy loss,
    Adam optimizer, accuracy metric).
  """
  layers = [
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(24, activation='relu'),
      tf.keras.layers.Dense(1, activation='sigmoid'),
  ]
  classifier = tf.keras.Sequential(layers)
  classifier.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return classifier

In [11]:
# Build the model used for the train/validation experiment and print its layers.
model = create_model()
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1000000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 24)                2424      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 1,002,449
Trainable params: 1,002,449
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train on the training split; testing_padded / y_val are the held-out validation data.
num_epochs = 4
history = model.fit(training_padded, y_train, epochs=num_epochs, validation_data=(testing_padded, y_val), verbose=2)

Epoch 1/4
179/179 - 2s - loss: 0.6818 - accuracy: 0.5638 - val_loss: 0.6691 - val_accuracy: 0.5804
Epoch 2/4
179/179 - 2s - loss: 0.6409 - accuracy: 0.6192 - val_loss: 0.5764 - val_accuracy: 0.7516
Epoch 3/4
179/179 - 2s - loss: 0.4844 - accuracy: 0.8049 - val_loss: 0.4438 - val_accuracy: 0.8162
Epoch 4/4
179/179 - 2s - loss: 0.3749 - accuracy: 0.8473 - val_loss: 0.4112 - val_accuracy: 0.8277


In [0]:
# Load the test tweets that predictions are generated for below.
test_df = pd.read_csv('test.csv')

In [0]:
# Apply the same cleaning to the test tweets as was applied to the training tweets.
test_df = clean_text(test_df)

In [0]:
# Rebuild the cleaned training frame from the raw CSV. train_df was already
# cleaned once above, and running clean_text on it again would append each
# keyword a second time, so the final model would be trained on text shaped
# differently from test_df.
train_df = clean_text(pd.read_csv('train.csv'))

In [0]:
# Refit the vocabulary on the full training set for the final model.
# This replaces the tokenizer that was fitted on the training split only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_df['text'])

In [0]:
# Overwrites the earlier training_padded: now the full training set, not just the split.
training_padded = prepare_data(train_df['text'] , tokenizer)

In [0]:
# Overwrites the earlier testing_padded: now the rows of test.csv, not the validation split.
testing_padded = prepare_data(test_df['text'] , tokenizer)

In [0]:
# Start from a freshly built model rather than continuing from the one trained above.
model = create_model()

In [20]:
# Train on the full training set; no validation data is passed in this run.
num_epochs = 4
history = model.fit(training_padded, train_df['target'], epochs=num_epochs, verbose=2)

Epoch 1/4
238/238 - 3s - loss: 0.6638 - accuracy: 0.5873
Epoch 2/4
238/238 - 3s - loss: 0.4979 - accuracy: 0.7960
Epoch 3/4
238/238 - 3s - loss: 0.3884 - accuracy: 0.8380
Epoch 4/4
238/238 - 3s - loss: 0.3373 - accuracy: 0.8644


In [0]:
# Outputs of the final sigmoid unit, one row per test tweet.
results = model.predict(testing_padded)

In [0]:
 # Threshold the predictions at 0.5 to obtain hard 0/1 labels.
 # NOTE(review): this cell starts with a stray leading space; presumably
 # tolerated by IPython, but confirm before exporting to a plain script.
 results = (results > 0.5).astype(int)

In [26]:
# Eyeball the thresholded labels.
results

array([[1],
       [0],
       [1],
       ...,
       [1],
       [1],
       [1]])

In [0]:
# Attach the predicted labels to the test frame.
test_df['target'] = results

In [28]:
# Confirm the new target column is present.
test_df.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [29]:
# Peek at the expected submission layout; `sub` is not used by the visible cells below.
sub = pd.read_csv('sample_submission.csv')
sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [0]:
# Keep only the two columns shown in the sample submission.
submission = test_df[['id' , 'target']]

In [0]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('using_embeddings.csv', index=False)