In [1]:
# Use the official tokenization script created by the Google team
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

# text processing libraries
import re
import string
import nltk
from nltk.corpus import stopwords

# sklearn 
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV,StratifiedKFold,RandomizedSearchCV

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
 
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load training and testing data
df = pd.read_csv('../input/nlp-getting-started/train.csv',index_col=0)
df_test = pd.read_csv('../input/nlp-getting-started/test.csv',index_col=0)
tweets = df['text']
y = df['target']
y = np.array(y).astype('float32')
tweets_test = df_test['text']

In [4]:
def clean_text(text):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = text.lower() # make text lower case
    text = re.sub('\[.*?\]', '', text) # remove text in square brackets
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove URLs
    text = re.sub('<.*?>+', '', text) # remove html tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text) # remove words conatinaing numbers
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)

    return text

In [5]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Applying the de=emojifying function to both test and training datasets
tweets2 = tweets.apply(lambda x: remove_emoji(x))
tweets_test2 = tweets_test.apply(lambda x: remove_emoji(x))

In [6]:
def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer_reg = nltk.tokenize.RegexpTokenizer(r'\w+')
    
    nopunc = clean_text(text)
    tokenized_text = tokenizer_reg.tokenize(nopunc)
    remove_stopwords = [w for w in tokenized_text if w not in stopwords.words('english')]
    combined_text = ' '.join(remove_stopwords)
    return combined_text

# Applying the cleaning function to both test and training datasets
tweets2 = tweets2.apply(lambda x: text_preprocessing(x))
tweets_test2 = tweets_test2.apply(lambda x: text_preprocessing(x))

# Let's take a look at the updated text
tweets.head()

id
1    Our Deeds are the Reason of this #earthquake M...
4               Forest fire near La Ronge Sask. Canada
5    All residents asked to 'shelter in place' are ...
6    13,000 people receive #wildfires evacuation or...
7    Just got sent this photo from Ruby #Alaska as ...
Name: text, dtype: object

## Bag of Words Vectorizer

In [7]:
#count_vectorizer = CountVectorizer()
count_vectorizer = CountVectorizer(ngram_range = (1,1), min_df = 1)
train_vectors = count_vectorizer.fit_transform(tweets2)
test_vectors = count_vectorizer.transform(tweets_test2)

## Keeping only non-zero elements to preserve space 
train_vectors.shape

(7613, 16412)

## Tf-IDF Vectorizer

In [8]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df = 2, max_df = 0.5)
train_tfidf = tfidf.fit_transform(tweets2)
test_tfidf = tfidf.transform(tweets_test2)

train_tfidf.shape

(7613, 11077)

### Using Naives Bayes on Bag of Words

In [9]:
# Fitting a simple Naive Bayes on BoW
NB_bow = MultinomialNB()
scores = model_selection.cross_val_score(NB_bow, train_vectors, y, cv=5, scoring="f1")
scores.mean()

0.6584930948850116

### Using Naives Bayes on TF-IDF

In [10]:
# Fitting a simple Naive Bayes on TFIDF
NB_tfidf = MultinomialNB()
scores = model_selection.cross_val_score(NB_tfidf, train_tfidf, y, cv=5, scoring="f1")
scores.mean()

0.6187711183101462

### Naives Bayes using a Grid Search Model on Bag of Words

In [11]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model = GridSearchCV(estimator=clf, param_grid=param_grid, 
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model.fit(train_vectors, y)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Best score: 0.731
Best parameters set:
	nb__alpha: 10


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.6s finished


### Naives Bayes using a Grid Search on TF-IDF

In [12]:
nb_model = MultinomialNB()

# Create the pipeline 
clf = pipeline.Pipeline([('nb', nb_model)])

# parameter grid
param_grid = {'nb__alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Initialize Grid Search Model
model2 = GridSearchCV(estimator=clf, param_grid=param_grid, 
                                 verbose=10, n_jobs=-1, iid=True, refit=True, cv=2)

# Fit Grid Search Model
model2.fit(train_tfidf, y)  # we can use the full data here but im only using xtrain. 
print("Best score: %0.3f" % model2.best_score_)
print("Best parameters set:")
best_parameters = model2.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0157s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0362s.) Setting batch_size=4.


Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best score: 0.721
Best parameters set:
	nb__alpha: 1


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished


In [13]:
submission = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission["target"] = model.predict(test_vectors)

import os
os.chdir('/kaggle/working')
    
submission.to_csv("submission1.csv", index=False)

submission2 = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
submission2["target"] = model2.predict(test_tfidf)

submission2.to_csv("submission2.csv", index=False)


# USING BERT MODEL

In [14]:
# The Encoding function takes the text column from train or test dataframe, the tokenizer,
# and the maximum length of text string as input.

# Outputs:
# Tokens
# Pad masks - BERT learns by masking certain tokens in each sequence.
# Segment id

def bert_encode(texts, tokenizer, max_len = 512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [15]:
def build_model(bert_layer, max_len = 512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [16]:
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [17]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [18]:
train_input = bert_encode(tweets2.values, tokenizer, max_len=160)
test_input = bert_encode(tweets_test2.values, tokenizer, max_len=160)
train_labels = df.target.values

In [19]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [20]:
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', save_best_only=True)

train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    callbacks=[checkpoint],
    batch_size=16
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
