# Disaster tweets

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import re
import string
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

import spacy
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Concatenate, BatchNormalization, Bidirectional, Flatten
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import SGD

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [3]:
# Notebook-wide settings, read by the cells below.
config = dict(
    # When True, helper functions print progress messages.
    verbose=True,
    # Which model section runs: 'XGB', 'LSTM' or 'BERT'.
    method='BERT',
    # Fraction passed as `test_size` to train_test_split.
    test_split=0.25,
    # Presumably the validation fraction used while fitting — confirm against the training cells.
    val_split=0.2,
    # Per-method switch: should get_X turn the text into word2vec document vectors?
    model_params=dict(
        XGB=dict(word2vec=True),
        LSTM=dict(word2vec=False),
        BERT=dict(word2vec=False),
    ),
    preprocessing=dict(
        # Drop stop words in the spaCy lemmatization branch.
        remove_stop_words=True,
        # Default lemmatizer backend for text_preprocessing ('spacy' or 'wordnet').
        lemmatization_func='spacy',
    ),
)

In [4]:
# Load the competition files: labelled training tweets, unlabelled test tweets,
# and the sample submission that shows the expected output layout.
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample_sub = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

# Preview the first training rows (last expression of the cell is displayed).
data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Preview the submission layout (an `id` column and a `target` column).
sample_sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [6]:
def clean_text(text):
    '''Normalize a raw tweet into lowercase text without markup or noise.

    Steps, in order: lowercase; drop text in square brackets; drop links
    (http(s)://... and www....); drop <...> tags; drop punctuation; turn
    newlines into spaces; drop every word that contains a digit.

    Args:
        text: Any object; it is converted with str() first, so None
            becomes the string 'none'.

    Returns:
        The cleaned string. Removed spans are not collapsed, so runs of
        several spaces can remain.
    '''
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space instead of deleting them, so the last word
    # of one line is not glued to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def _lem_wordnet(word):
    """Lemmatize a single word with WordNet, guided by its POS tag.

    The word is tagged on its own with nltk.pos_tag. The first letter of the
    resulting tag picks the WordNet part of speech (J, N, V or R); any other
    tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    _, pos_tag = nltk.pos_tag([word])[0]
    wordnet_pos = pos_by_initial.get(pos_tag[0].upper(), pos_by_initial['N'])
    return WordNetLemmatizer().lemmatize(word, wordnet_pos)

def lemmatize(text, nlp, func):
    """Lemmatize `text` and return the lemmas joined by single spaces.

    Args:
        text: The (already cleaned) string to lemmatize.
        nlp: A loaded spaCy pipeline. Required when func == 'spacy';
            ignored when func == 'wordnet'.
        func: Backend to use, either 'wordnet' or 'spacy'.

    Returns:
        A string of space-separated lemmas. In the 'spacy' branch, stop
        words are dropped when config['preprocessing']['remove_stop_words']
        is true.

    Raises:
        ValueError: If `func` is not a supported backend, or if the spaCy
            backend is requested without an `nlp` pipeline.
    """
    # Validate up front: previously an unknown backend fell through both
    # branches and failed on the unbound `lemmatized_words` at the return.
    if func not in ('wordnet', 'spacy'):
        raise ValueError(
            f"Unknown lemmatization function {func!r}; expected 'wordnet' or 'spacy'"
        )

    if func == 'wordnet':
        lemmatized_words = [_lem_wordnet(word) for word in nltk.word_tokenize(text)]

    else:
        if nlp is None:
            raise ValueError("lemmatize(func='spacy') requires a loaded spaCy pipeline in `nlp`")

        # NOTE(review): only the tokenizer runs here, not the full pipeline —
        # confirm that token.lemma_ is populated for the installed spaCy version.
        doc = nlp.tokenizer(text)
        lemmatized_words = [token.lemma_ for token in doc]
        if config['preprocessing']['remove_stop_words']:
            # The stop-word lookup is done on the lemma, not the surface form.
            lemmatized_words = [lemma for lemma in lemmatized_words if not nlp.vocab[lemma].is_stop]

    return ' '.join(lemmatized_words)

def text_preprocessing(text, nlp=None, func=config['preprocessing']['lemmatization_func']):
    """Clean a raw text with clean_text, then lemmatize the result.

    `func` defaults to the backend named in config at the time this function
    is defined; `nlp` is forwarded unchanged to lemmatize.
    """
    return lemmatize(clean_text(text), nlp, func)

## Word2vec

In [7]:
def word2vec(X):
    """Turn every text in X into its spaCy document vector.

    Loads the 'en_core_web_lg' model, runs each text through it inside a
    `disable_pipes()` context, and stacks the `.vector` of every resulting
    document into one numpy array.
    """
    nlp = spacy.load('en_core_web_lg')

    doc_vectors = []
    # disable_pipes() is called without naming any pipeline component.
    with nlp.disable_pipes():
        for txt in X:
            doc_vectors.append(nlp(txt).vector)

    return np.array(doc_vectors)

In [8]:
def get_X(data):
    """Build the model input from the 'text' column of `data`.

    Every text goes through text_preprocessing with a spaCy pipeline loaded
    without the parser and NER components. If the method selected in config
    is flagged with 'word2vec', the preprocessed texts are then converted to
    document vectors. Progress is printed when config['verbose'] is set.
    """
    raw_text = data['text']
    if config['verbose']:
        print('Raw data extracted')

    nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
    features = raw_text.apply(lambda tweet: text_preprocessing(tweet, nlp=nlp))
    if config['verbose']:
        print('Text preprocessing done')

    # Text-based models consume the preprocessed strings directly.
    method_params = config['model_params'][config['method']]
    if not method_params['word2vec']:
        return features

    features = word2vec(features)
    if config['verbose']:
        print('Word2vec done')
    return features
    
def get_Y(data):
    """Return the 'target' column of `data` (the labels)."""
    return data['target']

In [9]:
# Build the model inputs (preprocessed text or vectors, depending on config) and the labels from the training frame.
X, Y = get_X(data), get_Y(data)

Raw data extracted
Text preprocessing done


In [10]:
def get_preds(X, model):
    """Predict with `model` and return hard integer predictions as a list.

    Args:
        X: Input passed unchanged to `model.predict`.
        model: Any object exposing `predict(X)` that returns scores or
            labels, e.g. shape (n,) or (n, 1).

    Returns:
        A list of Python ints, one per sample, obtained by rounding each
        predicted value to the nearest integer (ties go to the even
        integer, as with the built-in round).
    """
    raw = np.asarray(model.predict(X))
    # squeeze() drops the trailing unit axis of (n, 1) outputs; atleast_1d
    # keeps a single-sample prediction iterable instead of collapsing to 0-d.
    flat = np.atleast_1d(np.squeeze(raw))
    return np.rint(flat).astype(int).tolist()

## XGBoost

In [11]:
if config['method'] == 'XGB':
    def score(params):
        """Hyperopt objective for XGBoost.

        Splits X/Y into a fresh train/validation pair, fits an XGBClassifier
        built from `params`, and reports 1 - validation accuracy as the loss.
        """
        split = train_test_split(X, Y, test_size=config['test_split'])
        X_train, X_val, Y_train, Y_val = split

        model = xgb.XGBClassifier(**params)
        model.fit(X_train, Y_train)

        accuracy = accuracy_score(Y_val, get_preds(X_val, model))
        if config['verbose']:
            print(f'\tScore: {accuracy}')
        return {'loss': 1 - accuracy, 'status': STATUS_OK}

    # Settings that stay constant across every trial.
    params_fixed = {
        'eval_metric': 'logloss',
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'tree_method': 'exact',
        'use_label_encoder' : False
    }

    # Tunable hyperparameters first, then the constants merged in.
    space = {
        'n_estimators': hp.choice('n_estimators', np.arange(50, 200, dtype=int)),
        'eta': hp.quniform('eta', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.choice('min_child_weight', np.arange(1, 8, 1, dtype=int)),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.3, 0.9, 0.05),
        **params_fixed,
    }

In [12]:
if config['method'] == 'XGB':
    from hyperopt import space_eval

    # Run the TPE search over `space`, minimising the loss returned by `score`.
    best_trial = fmin(score, space, algo=tpe.suggest, max_evals=100)

    # For hp.choice parameters fmin reports the index of the chosen option,
    # not its value; space_eval maps the result back to real hyperparameters.
    best_params = space_eval(space, best_trial)
    best_params.update(params_fixed)

    # Refit on the full training data with the best configuration found.
    model_final = xgb.XGBClassifier(**best_params)
    model_final.fit(X, Y)

## LSTM

In [13]:
if config['method'] == 'LSTM':
    from hyperopt import space_eval

    def score(params, give_model=False):
        """Train a bidirectional-LSTM text classifier and score it.

        Args:
            params: Hyperparameter dict with the keys defined in `space`
                below (actual values, not hp.choice indices).
            give_model: When False (hyperopt objective mode) the model is
                trained on the training split only. When True it is trained
                on all of X/Y and returned to the caller.

        Returns:
            give_model=False: {'loss': 1 - val_accuracy, 'status': STATUS_OK}.
            give_model=True: (loss, model). Here the validation rows are part
            of the training data, so this loss is optimistic.
        """
        X_train, X_val, Y_train, Y_val = train_test_split(
            X,
            Y,
            test_size=config['test_split']
        )

        if give_model:
            X_train, Y_train = X, Y

        # The vocabulary is learned from the training texts only.
        encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
            max_tokens=int(params['vocab_size'])
        )
        encoder.adapt(list(X_train))

        # text -> token ids -> embeddings -> BiLSTM -> dense stack -> 1 output unit
        layers = [
            encoder,
            tf.keras.layers.Embedding(
                input_dim=len(encoder.get_vocabulary()),
                output_dim=int(params['embed_out_dim']),
                mask_zero=True
            ),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(int(params['embed_out_dim'])))
        ]

        # Only the first `dense_layers` of the dense_N_dim entries are used.
        layers += [
            tf.keras.layers.Dense(int(params[f'dense_{n+1}_dim']), activation=params['dense_activation'])
            for n in range(int(params['dense_layers']))
        ]

        layers.append(tf.keras.layers.Dense(1, activation=params['output_activation']))

        model = tf.keras.Sequential(layers)
        model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                      optimizer=tf.keras.optimizers.Adam(1e-4),
                      metrics=params['metrics'])

        # Validation fraction comes from config instead of a hard-coded 0.2.
        model.fit(
            X_train,
            Y_train,
            validation_split=config['val_split'],
            epochs=int(params['epochs']),
            verbose=0
        )

        preds_val = get_preds(X_val, model)
        accuracy = accuracy_score(Y_val, preds_val)
        if config['verbose']:
            print(f'\tScore: {accuracy}')
        loss = 1 - accuracy

        if give_model:
            return loss, model

        return {'loss': loss, 'status': STATUS_OK}


    space = {
        'vocab_size' : hp.choice('vocab_size', np.arange(1000, 50000, 1000, dtype=int)),
        'embed_out_dim' : hp.choice('embed_out_dim', np.arange(32, 128, dtype=int)),
        'dense_layers' : hp.choice('dense_layers', np.arange(1, 4, dtype=int)),
        'dense_1_dim' : hp.choice('dense_1_dim', np.arange(4, 64, dtype=int)),
        'dense_2_dim' : hp.choice('dense_2_dim', np.arange(4, 64, dtype=int)),
        'dense_3_dim' : hp.choice('dense_3_dim', np.arange(4, 64, dtype=int)),
        'dense_activation' : hp.choice('dense_activation', ['sigmoid', 'relu']),
        'epochs' : hp.choice('epochs', np.arange(20, 80, 10, dtype=int))
    }
    params_fixed = {
        'output_activation' : 'sigmoid',
        'metrics' : ['accuracy']
    }
    space.update(params_fixed)

    best_trial = fmin(score, space, algo=tpe.suggest, max_evals=100)

    # Every tunable here is an hp.choice, for which fmin reports option
    # indices. Without space_eval the refit would receive e.g. 0/1 as the
    # activation and a small index as the vocabulary size.
    best_params = space_eval(space, best_trial)
    best_params.update(params_fixed)

    loss, model_final = score(best_params, give_model=True)

## BERT

In [14]:
if config['method'] == 'BERT':
    # Load the pretrained "bert-base-uncased" weights with a sequence-classification
    # head, plus the matching tokenizer.
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # Print the layer / parameter-count summary.
    model.summary()

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


In [15]:
if config['method'] == 'BERT':
    # Hold out a validation split and rebuild two-column frames ('text', 'target').
    X_train, X_val, Y_train, Y_val = train_test_split(
        X,
        Y,
        test_size=config['test_split']
    )
    data_train = pd.DataFrame([X_train, Y_train]).transpose()
    data_val = pd.DataFrame([X_val, Y_val]).transpose()

    def convert_data_to_examples(train, val, DATA_COLUMN, LABEL_COLUMN):
        """Wrap every row of `train` and `val` in a transformers InputExample.

        Args:
            train, val: DataFrames holding the text and label columns.
            DATA_COLUMN: Name of the column with the text.
            LABEL_COLUMN: Name of the column with the label.

        Returns:
            (train_examples, validation_examples), each a pandas Series of
            InputExample objects.
        """
        def to_example(row):
            # guid is a bookkeeping ID that is unused here; text_b is for sentence pairs.
            return InputExample(guid=None,
                                text_a=row[DATA_COLUMN],
                                text_b=None,
                                label=row[LABEL_COLUMN])

        train_InputExamples = train.apply(to_example, axis=1)
        validation_InputExamples = val.apply(to_example, axis=1)
        return train_InputExamples, validation_InputExamples

    def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
        """Tokenize InputExamples and expose them as a tf.data.Dataset.

        Each example is encoded to exactly `max_length` tokens (truncated or
        padded). The dataset yields (features_dict, label) pairs, where
        features_dict has 'input_ids', 'attention_mask' and 'token_type_ids'.
        """
        features = []  # InputFeatures, converted to tensors lazily by gen()

        for e in examples:
            input_dict = tokenizer.encode_plus(
                e.text_a,
                add_special_tokens=True,
                max_length=max_length,  # upper bound; longer inputs are truncated
                return_token_type_ids=True,
                return_attention_mask=True,
                padding='max_length',  # shorter inputs are padded up to max_length
                truncation=True
            )

            input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
                input_dict["token_type_ids"], input_dict['attention_mask'])

            features.append(
                InputFeatures(
                    input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
                )
            )

        def gen():
            for f in features:
                yield (
                    {
                        "input_ids": f.input_ids,
                        "attention_mask": f.attention_mask,
                        "token_type_ids": f.token_type_ids,
                    },
                    f.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )

    DATA_COLUMN = 'text'
    LABEL_COLUMN = 'target'

    train_InputExamples, validation_InputExamples = convert_data_to_examples(data_train, data_val, DATA_COLUMN, LABEL_COLUMN)

    train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    # repeat(2) after batching: the training dataset contains every batch twice.
    train_data = train_data.shuffle(100).batch(32).repeat(2)

    validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
    validation_data = validation_data.batch(32)

    # The classification head outputs raw logits, hence from_logits=True.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

    model.fit(train_data, epochs=10, validation_data=validation_data)

    class BERT_Model:
        """Adapter giving the fine-tuned BERT model a `predict(X)` method.

        `predict` takes an iterable of raw strings and returns a numpy array
        of predicted class indices, which is the shape get_preds expects.
        """

        def __init__(self, model):
            self.model = model

        def predict(self, X, batch_size=32):
            """Return the predicted class index for every text in X.

            Texts are tokenized and run through the model in chunks of
            `batch_size`, rather than in one forward pass over the whole
            input, so memory use does not grow with the number of texts.
            """
            texts = list(X)
            labels = []

            for start in range(0, len(texts), batch_size):
                chunk = texts[start:start + batch_size]
                tf_batch = tokenizer(chunk, max_length=128, padding=True, truncation=True, return_tensors='tf')
                tf_outputs = self.model(tf_batch)
                # argmax over the logits; softmax is monotonic, so it would
                # not change which class wins.
                labels.append(tf.argmax(tf_outputs[0], axis=1).numpy())

            if not labels:
                return np.array([], dtype=np.int64)
            return np.concatenate(labels)

    model_final = BERT_Model(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Submitting results

In [16]:
def save_submission(data, Y):
    """Write 'submission.csv' with the ids from `data` and predictions `Y`.

    The file has two columns, 'id' and 'target', and no index column.
    """
    submission = data[['id']].assign(target=Y)
    submission.to_csv('submission.csv', index=False)

In [17]:
# Run the test tweets through the same get_X pipeline as the training data,
# predict with the model selected in config, and write submission.csv.
X_test = get_X(test_data)
preds_test = get_preds(X_test, model_final)
save_submission(test_data, preds_test)

Raw data extracted
Text preprocessing done
