In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import re
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from sklearn.metrics import accuracy_score

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer

import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import wordcloud
from wordcloud import WordCloud

import tensorflow_hub as hub
import bert
from bert import tokenization

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

init_notebook_mode(connected=True)

log = logging.getLogger(name=__name__)
log.setLevel(logging.INFO)
logging.captureWarnings(True)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

ch.setFormatter(formatter)
log.addHandler(ch)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 160

SEED = 1
tf.random.set_seed(
    SEED
)
#tf.keras.backend.set_floatx('float64')

log.info(f"Python version: {sys.version}")
log.info(f"Numpy version: {np.__version__}")
log.info(f"Pandas version: {pd.__version__}")
log.info(f"Scikit-learn version: {sklearn.__version__}")
log.info(f"TensorFlow version: {tf.__version__}")
log.info(f"Plotly version: {plotly.__version__}")
log.info(f"WordCloud version: {wordcloud.__version__}")
log.info(f"tensorflow.random seed: {SEED}")

UNK = "UNK"
NUM = "QNUM"
AT = "QAT"
SUCCESS = 0
stopwords = (nltk.corpus.stopwords.words("english") 
    #+ ["u", "im", "us", "th", "st", "nd", "r", "rt", "f", "v", "x"]
)

old_text = "text"
text = "t"
hashtag = "hashtag"
at = "at"
href = "href"
target = "target"
keyword = "keyword"
location = "location"

y_cols = [target+"_0", target+"_1"]

2020-11-09 19:26:40,154 - __main__ - INFO - Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]
2020-11-09 19:26:40,154 - __main__ - INFO - Numpy version: 1.18.5
2020-11-09 19:26:40,155 - __main__ - INFO - Pandas version: 1.1.4
2020-11-09 19:26:40,155 - __main__ - INFO - Scikit-learn version: 0.23.2
2020-11-09 19:26:40,156 - __main__ - INFO - TensorFlow version: 2.3.0
2020-11-09 19:26:40,157 - __main__ - INFO - Plotly version: 4.12.0
2020-11-09 19:26:40,157 - __main__ - INFO - WordCloud version: 1.8.0
2020-11-09 19:26:40,157 - __main__ - INFO - tensorflow.random seed: 1


In [2]:
class LabelEncoderExt(preprocessing.LabelEncoder):
    '''
    '''
    def __init__(self):

        super().__init__()

    def fit(self, y):

        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([UNK])))
        super().fit(y)

    def transform(self, y):

        y[~np.isin(y, self.classes_, assume_unique=True)] = UNK
        return super().transform(y)

    def fit_transform(self, y):

        self.fit(y)
        return self.transform(y)

In [3]:
data_bn = "data"
data_dir = os.path.abspath(
    os.path.join(__name__, os.pardir, os.pardir, data_bn)
)

log.info(f"Data directory: {data_dir}")

2020-11-09 19:26:40,185 - __main__ - INFO - Data directory: /home/jimmy/github/kaggle/nlp_disaster_tweets/data


In [4]:
train_bn = "train.csv"
test_bn = "test.csv"
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

In [5]:
df_train = pd.read_csv(train_fn)
df_test = pd.read_csv(test_fn)

log.info(f"Training data shape: {df_train.shape}")
log.info(f"Test data shape: {df_test.shape}")

2020-11-09 19:26:40,266 - __main__ - INFO - Training data shape: (7613, 5)
2020-11-09 19:26:40,266 - __main__ - INFO - Test data shape: (3263, 4)


In [6]:
solution_fn = os.path.join(data_dir, "socialmedia-disaster-tweets-DFE.csv")
df_X = pd.read_csv(solution_fn, sep=',', header=0, encoding = "ISO-8859-1")
df_X = df_X.rename({"tweetid": "id"}, axis=1).astype({"id": int})
df_X[target] = df_X["choose_one"].apply(lambda x: 1 if x=="Relevant" else 0)

In [7]:
df_test = df_test.merge(df_X[["id", "target"]], how="inner", left_on="id", right_on=df_X.index).rename({"target_y": target}, axis=1)

In [8]:
train_pts = df_train.shape[0]

In [9]:
df_train = pd.concat([df_train, df_test], ignore_index=True)
df_train = df_train.drop(["id_x", "id_y"], axis=1)

In [10]:
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"
hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"

tf.io.gfile.listdir(gs_folder_bert)

tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
    do_lower_case=True)

print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [11]:
bert_token = lambda x: tokenizer.convert_tokens_to_ids(['[CLS]'] + tokenizer.tokenize(x) + ['[SEP]'])  

[101,
 2256,
 15616,
 2024,
 1996,
 3114,
 1997,
 2023,
 1001,
 8372,
 2089,
 16455,
 9641,
 2149,
 2035,
 102]

In [12]:
def to_lower(df):
    '''
    '''
    df[text] = df[text].apply(lambda x: x.casefold())
    
    return SUCCESS


def hash_handling(df):
    '''
    '''
    reg_hash_full = re.compile("(#)\w+")
    reg_hash = re.compile("(#)")
    
    f = lambda x: [y.group() for y in reg_hash_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[hashtag] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_hash.sub(' ', x))
    
    return SUCCESS


def at_handling(df):
    '''
    '''
    reg_at = re.compile("(@)")
    reg_at_full = re.compile("(@)\w+")
    
    f = lambda x: [y.group() for y in reg_at_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[at] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_at_full.sub(' '+AT+' ', x))
    
    return SUCCESS


def count_at(df):
    """
    """
    df[at] = df[at].apply(lambda x: len(x.split()))
    
    return SUCCESS


def href_handling(df):
    '''
    '''
    reg_href_full = re.compile("(htt)\S+")
    
    f = lambda x: len(list(reg_href_full.finditer(x)))
    
    df[href] = df[text].apply(f)
    df[text] = df[text].apply(lambda x: reg_href_full.sub(' http ', x))
    
    return SUCCESS


def html_special_handling(df):
    '''
    '''
    reg_html = re.compile("(&)\w+(;)")
    df[text] = df[text].apply(lambda x: reg_html.sub(' html ', x))
    
    return SUCCESS
    
    
def xc2x89_byte_handling(df):
    '''
    '''
    reg_x89 = re.compile(b"\xc2\x89".decode('utf-8')+"\S+")
    df[text] = df[text].apply(lambda x: reg_x89.sub(' ', x))
    
    return SUCCESS
    
    
def special_char_handling(df):
    '''
    '''
    reg_special = re.compile("[^\w\s@]")
    df[text] = df[text].apply(lambda x: reg_special.sub(' ', x))
    df[text] = df[text].apply(lambda x: re.sub('_', ' ', x)) 
    
    return SUCCESS


def contraction_handling(df):
    '''
    '''
    reg_contract = re.compile("\s(s|m|t|(nt)|(ve)|w)\s")
    df[text] = df[text].apply(lambda x: reg_contract.sub(' ', x))
    
    return SUCCESS


def encode_numerals(df):
    '''
    '''
    reg_numerals = re.compile("\d+[\s\d]*")
    df[text] = df[text].apply(lambda x: reg_numerals.sub(' '+NUM+' ', x))
    
    return SUCCESS
    
    
def remove_stopwords(df):
    """
    """
    f = (lambda x: 
        ' '.join([y for y in x.strip().split() if y not in stopwords])
    )
    df[text] = df[text].apply(f)
    
    return SUCCESS   


def has_location(df):
    """
    """
    df[location] = df[location].apply(lambda x: 1 if pd.isnull(x) else 0)
    
    return SUCCESS


def preprocess(df):
    """
    """
    df[text] = df[old_text]
    df[keyword].fillna('', inplace=True)
    to_lower(df)
    hash_handling(df)
    at_handling(df)
    count_at(df)
    href_handling(df)
    html_special_handling(df)
    xc2x89_byte_handling(df)
    special_char_handling(df)
    contraction_handling(df)
    remove_stopwords(df)
    encode_numerals(df)
    has_location(df)
    #df[text] = '[CLS] ' + df[text] + ' [SEP]'
    
    return SUCCESS

In [13]:
_ = preprocess(df_train)

In [14]:
def tokenize_dataframe(df, col, max_len=20):
    """
    """
    df_tmp = pd.DataFrame(df[col].apply(lambda x: reversed(x.split())).tolist())
    orig_len = len(df_tmp.columns)
    df_tmp = df_tmp.rename(
        lambda x: col+"_{:02d}".format(max_len-1-x), 
        axis=1
    )
    
    enum_cols = [col+"_{:02d}".format(i) for i in range(max_len)]
    if orig_len < max_len:
        compl_cols = [x for x in enum_cols if x not in df_tmp.columns]
        df_tmp[compl_cols] = np.nan

    df_merged = df.merge(
        df_tmp[enum_cols],
        how="outer",
        left_index=True,
        right_index=True
    )
    
    return df_merged, enum_cols


def filter_infrequent(df, cols, cutoff=5):
    """
    """
    unique_words, word_counts = (
        np.unique(df[cols].values.flatten(), return_counts=True)
    )
    infreq_dict = {
        x: (x if word_counts[i] >= cutoff else UNK)
            for i, x in np.ndenumerate(unique_words)
    }

    f = lambda x: infreq_dict[x]
    df[cols] = df[cols].applymap(f)
    
    return SUCCESS


def transform_data(df):
    """
    """
    _ = preprocess(df)
    df, text_cols = tokenize_dataframe(df, text, max_len=25)
    
    df[text_cols] = df[text_cols].fillna('')
    
    lemmatizer = WordNetLemmatizer() 
    ps = PorterStemmer()

    df[text_cols] = df[text_cols].applymap(lambda x: ps.stem(x))
    df[text_cols] = df[text_cols].applymap(lambda x: lemmatizer.lemmatize(x))

    _ = filter_infrequent(df, text_cols, cutoff=10)
        
    df, hash_cols = tokenize_dataframe(df, hashtag, max_len=3)
    df[hash_cols] = df[hash_cols].fillna('')

    _ = filter_infrequent(df, hash_cols, cutoff=5)
    
    return df, text_cols, hash_cols

In [49]:
num_unique_words = len(tokenizer.vocab)
#tokenizer = Tokenizer(num_words=num_unique_words)
#tokenizer.fit_on_texts(df_train[text].values)
#tmp = tokenizer.texts_to_sequences(df_train[text].values)

pre_words_ids = df_train[text].apply(bert_token)
pre_masks = df_train[text].apply(lambda x: [1]*(len(tokenizer.tokenize(x))+2))

words_ids = pad_sequences(pre_words_ids)
masks = pad_sequences(pre_masks)
type_ids = np.zeros(words_ids.shape, dtype=np.int)

text_cols = [text+"_{:02d}".format(i) for i in range(words_ids.shape[1])]
mask_cols = ["mask"+"_{:02d}".format(i) for i in range(words_ids.shape[1])]
type_cols = ["type"+"_{:02d}".format(i) for i in range(words_ids.shape[1])]

df_train[text_cols] = words_ids
df_train[mask_cols] = masks
df_train[type_cols] = type_ids

df_train, text_cols = transform_data(df_train)

wc_size = (12, 12)

tdf = df_train[df_train[target]==1]

unique_words, word_counts = (
    np.unique(tdf[text_cols].values.flatten(), return_counts=True)
)
sm = np.sum(word_counts)
frequency_dict = {
    x: word_counts[i]/sm 
        for i, x in np.ndenumerate(unique_words)
}
try:
    frequency_dict.pop(NUM)
except:
    pass
try:
    frequency_dict.pop(UNK)
except:
    pass

wordcloud = WordCloud(
    width=1000, height=1000, 
    background_color='white',
    min_font_size=10
).generate_from_frequencies(frequency_dict)
fig = plt.figure(figsize=wc_size, facecolor=None)
ax = fig.add_subplot()
a = ax.imshow(wordcloud) 

tdf = df_train[df_train[target]==0]
unique_words, word_counts = (
    np.unique(tdf[text_cols].values.flatten(), return_counts=True)
)
sm = np.sum(word_counts)
frequency_dict = {
    x: word_counts[i]/sm
    for i, x in np.ndenumerate(unique_words)
}
try:
    frequency_dict.pop(NUM)
except:
    pass
try:
    frequency_dict.pop(UNK)
except:
    pass

wordcloud = WordCloud(
    width=1000, height=1000, 
    background_color='white',
    min_font_size=10
).generate_from_frequencies(frequency_dict)
fig = plt.figure(figsize=wc_size, facecolor=None) 
ax = fig.add_subplot()
ret = ax.imshow(wordcloud) 

enc = LabelEncoderExt()
df_train[text_cols] = (enc
    .fit_transform(df_train[text_cols].values.flatten())
    .reshape(df_train[text_cols].shape)
)
num_unique_words = enc.classes_.shape[0]

log.info(f"Number of unique words: {num_unique_words}")

In [17]:
log.info(f"Number of unique words: {num_unique_words}")

2020-11-09 19:26:45,157 - __main__ - INFO - Number of unique words: 30522


hash_enc = LabelEncoderExt()
df_train[hash_cols] = (hash_enc
    .fit_transform(df_train[hash_cols].values.flatten())
    .reshape(df_train[hash_cols].shape)
)
num_unique_hash = hash_enc.classes_.shape[0]

log.info(f"Number of unique hashtags: {num_unique_hash}")

key_enc = LabelEncoderExt()
df_train[keyword] = (key_enc
    .fit_transform(df_train[keyword])
)
num_unique_keywords = key_enc.classes_.shape[0]

log.info(f"Number of unique keywords: {num_unique_keywords}")

v0 = most_freq_bigrams(
    df_train[df_train[target]==0],
    enc, text_cols, top_n=50
)
v1 = most_freq_bigrams(
    df_train[df_train[target]==1],
    enc, text_cols, top_n=50
)

bigrams0 = np.array(['_'.join(x.tolist()) for x in v0[2]])
bigrams1 = np.array(['_'.join(x.tolist()) for x in v1[2]])

bigr_cnt0 = np.vstack([bigrams0, v0[0].values])
bigr_cnt1 = np.vstack([bigrams1, v1[0].values])

fig = go.Figure()
bar0 = go.Bar(name="Not disaster", x=bigr_cnt0[0], y=bigr_cnt0[1])
bar1 = go.Bar(name="Disaster", x=bigr_cnt1[0], y=bigr_cnt1[1])

fig.add_trace(bar0)
fig.add_trace(bar1)

fig.update_layout(barmode='group')

iplot(fig)

df_test, _, _ = transform_data(df_test)

df_test[text_cols] = (enc
    .transform(df_test[text_cols].values.flatten())
    .reshape(df_test[text_cols].shape)
)
df_test[hash_cols] = (hash_enc
    .transform(df_test[hash_cols].values.flatten())
    .reshape(df_test[hash_cols].shape)
)
df_test[keyword] = (key_enc
    .transform(df_test[keyword])
)

In [18]:
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)

In [19]:
class TwolayerModel(tf.keras.Model):
    """
    """
    def __init__(self,
            batch_size=32,
            units=40,
            embed_dim=100,
    ):
        """
        """
        self.inps = [
            (None, len(text_cols)),
        ]
        self.bs = batch_size
        out_dim = 2
        
        super(TwolayerModel, self).__init__()
        
        self._embed1 = tf.keras.layers.Embedding(
            num_unique_words,
            embed_dim,
            input_length=self.inps[0][1],
            name="word_embedding",
            #trainable=False,
        )
        self._lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(
                units,
                name="lstm1",
                return_sequences=True,
            )
        )
        
        self._lstm2 = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(
                units,
                name="lstm2",
            )
        )

        self._dense2 = tf.keras.layers.Dense(
            out_dim,
            activation=tf.nn.softmax,
            name="final",
        )
        
        self._optimizer = tf.keras.optimizers.Adam(
                            learning_rate=0.0001
        )
        self._metrics = [tf.keras.metrics.BinaryAccuracy()]
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        #self._loss = tf.keras.losses.KLDivergence()
        
        self.compile(
            optimizer=self._optimizer,
            loss=self._loss,
            metrics=self._metrics,
        )

        self.build(self.inps[0])
        

    #@tf.function
    def call(self, inputs):
        inp1 = inputs
        
        x1 = self._embed1(inp1)
        y1 = self._lstm1(x1)
        y1 = self._lstm2(y1)
        out = self._dense2(y1)
        
        return out
    

class OnelayerModel(tf.keras.Model):
    """
    """
    def __init__(self,
            batch_size=32,
            units=40,
            embed_dim=100,
    ):
        """
        """
        self.inps = [
            (None, len(text_cols)),
        ]
        self.bs = batch_size
        out_dim = 2
        
        super(OnelayerModel, self).__init__()
        
        self._embed1 = tf.keras.layers.Embedding(
            num_unique_words,
            embed_dim,
            input_length=self.inps[0][1],
            name="word_embedding",
            #trainable=False,
        )
        self._lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.GRU(
                units,
                name="lstm1",
            )
        )
        
        self._dense2 = tf.keras.layers.Dense(
            out_dim,
            activation=tf.nn.softmax,
            name="final",
        )
        
        self._optimizer = tf.keras.optimizers.Adam(
                            learning_rate=0.0001
        )
        self._metrics = [tf.keras.metrics.BinaryAccuracy()]
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        #self._loss = tf.keras.losses.KLDivergence()
        
        self.compile(
            optimizer=self._optimizer,
            loss=self._loss,
            metrics=self._metrics,
        )

        self.build(self.inps[0])
        

    #@tf.function
    def call(self, inputs):
        inp1 = inputs
        
        x1 = self._embed1(inp1)
        y1 = self._lstm1(x1)
        out = self._dense2(y1)
        
        return out
    
    
class ConvModel(tf.keras.Model):
    """
    """
    def __init__(self,
            batch_size=32,
            units=40,
            embed_dim=100,
    ):
        """
        """
        self.inps = [
            (None, len(text_cols)),
        ]
        self.bs = batch_size
        out_dim = 2
        
        super(ConvModel, self).__init__()
        
        self.bert_layer = bert_layer
        
        self._embed1 = tf.keras.layers.Embedding(
            num_unique_words,
            embed_dim,
            input_length=self.inps[0][1],
            name="word_embedding",
        )
            
        filters = 100
        window = 5
        
        self._conv1 = tf.keras.layers.Conv1D(
            filters,
            window
        )
        
        self._flatten = tf.keras.layers.Flatten()
        
        self._dense1 = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
            name="dense",
        )
        
        self._dense2 = tf.keras.layers.Dense(
            out_dim,
            activation=tf.nn.softmax,
            name="final",
        )
        
        self._optimizer = tf.keras.optimizers.Adam(
                            learning_rate=0.0001
        )
        self._metrics = [tf.keras.metrics.BinaryAccuracy()]
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        #self._loss = tf.keras.losses.KLDivergence()
        
        self.compile(
            optimizer=self._optimizer,
            loss=self._loss,
            metrics=self._metrics,
        )

        self.build(self.inps[0])
        

    def call(self, inputs):
        inp1 = inputs
        
        x1 = self._embed1(inp1)
        x1 = self._conv1(x1)
        y1 = self._flatten(x1)
        z = self._dense1(y1)
        out = self._dense2(z)
        
        return out
    
class BERTModel(tf.keras.Model):
    """
    """
    def __init__(self,
            batch_size=32,
            units=40,
            embed_dim=100,
    ):
        """
        """
        self.inps = [
            (None, len(text_cols)),
        ]
        self.bs = batch_size
        out_dim = 2
        
        super(ConvModel, self).__init__()
        
        self._embed1 = tf.keras.layers.Embedding(
            num_unique_words,
            embed_dim,
            input_length=self.inps[0][1],
            name="word_embedding",
        )
            
        filters = 100
        window = 5
        
        self._conv1 = tf.keras.layers.Conv1D(
            filters,
            window
        )
        
        self._flatten = tf.keras.layers.Flatten()
        
        self._dense1 = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
            name="dense",
        )
        
        self._dense2 = tf.keras.layers.Dense(
            out_dim,
            activation=tf.nn.softmax,
            name="final",
        )
        
        self._optimizer = tf.keras.optimizers.Adam(
                            learning_rate=0.0001
        )
        self._metrics = [tf.keras.metrics.BinaryAccuracy()]
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        #self._loss = tf.keras.losses.KLDivergence()
        
        self.compile(
            optimizer=self._optimizer,
            loss=self._loss,
            metrics=self._metrics,
        )

        self.build(self.inps[0])
        

    def call(self, inputs):
        inp1 = inputs
        
        x1 = self._embed1(inp1)
        x1 = self._conv1(x1)
        y1 = self._flatten(x1)
        z = self._dense1(y1)
        out = self._dense2(z)
        
        return out

In [20]:
model = TwolayerModel(batch_size=32, units=20, embed_dim=200)
#model = ConvModel(batch_size=512, units=30, embed_dim=200)
#model = OnelayerModel(batch_size=256, units=50, embed_dim=200)

model.summary()

Model: "twolayer_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
word_embedding (Embedding)   multiple                  6104400   
_________________________________________________________________
bidirectional (Bidirectional multiple                  26640     
_________________________________________________________________
bidirectional_1 (Bidirection multiple                  7440      
_________________________________________________________________
final (Dense)                multiple                  82        
_________________________________________________________________
binary_accuracy (BinaryAccur multiple                  2         
Total params: 6,138,564
Trainable params: 6,138,562
Non-trainable params: 2
_________________________________________________________________


In [21]:
tfboard_dir = "logs"
if not os.path.exists(tfboard_dir):
    os.mkdir(tfboard_dir)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tfboard_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    min_delta=1e-5,
    patience=10,
    baseline=0.5,
    restore_best_weights=True,
)

In [22]:
df_test = df_train.iloc[train_pts:]
df_train = df_train.iloc[:train_pts]

log.info(f"Dataset size: {df_train.shape[0]}")

remainder = df_train.shape[0] % model.bs
pad_size = model.bs - remainder if remainder !=0 else 0
log.info(f"Remainder from batch size: {remainder}\n"
         f"Padding {pad_size} elements."
)

X1 = df_train[text_cols].values
if pad_size > 0:
    X1 = np.vstack([X1, np.zeros((pad_size, X1.shape[1]))])


df_train[y_cols] = pd.get_dummies(df_train[target], prefix=target)
Y = df_train[y_cols].values
Y_add = np.zeros((pad_size, 2))
Y_add[:, 0] = 1.0
if pad_size > 0:
    Y = np.vstack([Y, Y_add])

X_val = X1

X = (
    tf.data.Dataset.from_tensor_slices((X_val, Y))
        .batch(model.bs, drop_remainder=True)
)

X = X.shuffle(buffer_size=10000)

q = 5
p = 1

select = lambda x, y: (x % q <= p)
nselect = lambda x, y: ~(x % q <= p)
take = lambda x, y: y

X_train = X
#X_train = X.enumerate().filter(nselect).map(take)
#X_valid = X.enumerate().filter(select).map(take)

2020-11-09 19:26:50,604 - __main__ - INFO - Dataset size: 7613
INFO:__main__:Dataset size: 7613
2020-11-09 19:26:50,605 - __main__ - INFO - Remainder from batch size: 29
Padding 3 elements.
INFO:__main__:Remainder from batch size: 29
Padding 3 elements.


In [23]:
log.info(f"Test dataset size: {df_test.shape}")

Z1 = df_test[text_cols].values

X_test = Z1

2020-11-09 19:26:50,634 - __main__ - INFO - Test dataset size: (3263, 65)
INFO:__main__:Test dataset size: (3263, 65)


In [24]:
df_test[y_cols] = pd.get_dummies(df_test[target], prefix=target)
Y_test = df_test[y_cols].values

In [25]:
X_valid = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(model.bs, drop_remainder=True)

In [26]:
hist = model.fit(
    X_train, 
    epochs=1000,
    validation_data=X_valid,
    callbacks=[
        #tensorboard_callback, 
        early_stopping
    ],
)

model.save(os.path.join(tfboard_dir, "model"))

Epoch 1/1000


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.





To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: logs/model/assets


INFO:tensorflow:Assets written to: logs/model/assets


In [27]:
Y_test = model.predict(X_test)
Y_pred = model.predict(X_val)

In [28]:
df_result = pd.DataFrame(Y_pred, columns=y_cols)
df_result = df_result.apply(np.round).astype({x: int for x in y_cols})
df_result[target] = df_result["target_1"]
df_result.drop(y_cols, inplace=True, axis=1)
df_result.drop(list(df_result.index[df_train.shape[0]:]), inplace=True, axis=0)
                           
df_pred = pd.DataFrame(Y_test, columns=y_cols)
df_pred = df_pred.apply(np.round).astype({x: int for x in y_cols})
df_pred[target] = df_pred["target_1"]
df_pred.drop(y_cols, inplace=True, axis=1)
df_pred["id"] = df_test["id"].values
df_pred = df_pred[["id", target]]

In [29]:
from sklearn.metrics import classification_report

log.info("\n" +
    classification_report(
        df_train[target],
        df_result[target],
        target_names=["Not disaster", "Disaster"]
    )
)

2020-11-09 19:31:35,640 - __main__ - INFO - 
              precision    recall  f1-score   support

Not disaster       0.87      0.94      0.91      4342
    Disaster       0.91      0.82      0.86      3271

    accuracy                           0.89      7613
   macro avg       0.89      0.88      0.88      7613
weighted avg       0.89      0.89      0.89      7613

INFO:__main__:
              precision    recall  f1-score   support

Not disaster       0.87      0.94      0.91      4342
    Disaster       0.91      0.82      0.86      3271

    accuracy                           0.89      7613
   macro avg       0.89      0.88      0.88      7613
weighted avg       0.89      0.89      0.89      7613



In [30]:
log.info("\n" +
    classification_report(
        df_test[target],
        df_pred[target],
        target_names=["Not disaster", "Disaster"]
    )
)

2020-11-09 19:31:35,652 - __main__ - INFO - 
              precision    recall  f1-score   support

Not disaster       0.78      0.87      0.83      1861
    Disaster       0.80      0.68      0.74      1402

    accuracy                           0.79      3263
   macro avg       0.79      0.78      0.78      3263
weighted avg       0.79      0.79      0.79      3263

INFO:__main__:
              precision    recall  f1-score   support

Not disaster       0.78      0.87      0.83      1861
    Disaster       0.80      0.68      0.74      1402

    accuracy                           0.79      3263
   macro avg       0.79      0.78      0.78      3263
weighted avg       0.79      0.79      0.79      3263



In [31]:
accuracy_score(df_train[target], df_result[target])

0.8879548141337187

In [32]:
accuracy_score(df_test[target], df_pred[target])

0.7897640208397181

In [33]:
#df_train[[target]].join(df_result[[target]], lsuffix="true", rsuffix="pred").head(500)
#df_pred.to_csv(os.path.join(data_dir, "results.csv"), index=False)

%load_ext tensorboard

%tensorboard --logdir logs

In [34]:
#!kill 3444