In [1]:
import os
import sys
import logging
import pandas as pd
import numpy as np
import re
import sklearn
import tensorflow as tf
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing

import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer

import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import wordcloud
from wordcloud import WordCloud

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

init_notebook_mode(connected=True)

log = logging.getLogger(name=__name__)
log.setLevel(logging.INFO)
logging.captureWarnings(True)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)

ch.setFormatter(formatter)
log.addHandler(ch)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 160

SEED = 1
tf.random.set_seed(
    SEED
)
#tf.keras.backend.set_floatx('float64')

log.info(f"Python version: {sys.version}")
log.info(f"Numpy version: {np.__version__}")
log.info(f"Pandas version: {pd.__version__}")
log.info(f"Scikit-learn version: {sklearn.__version__}")
log.info(f"TensorFlow version: {tf.__version__}")
log.info(f"Plotly version: {plotly.__version__}")
log.info(f"WordCloud version: {wordcloud.__version__}")
log.info(f"tensorflow.random seed: {SEED}")

UNK = "UNK"
NUM = "num"
SUCCESS = 0
stopwords = (nltk.corpus.stopwords.words("english") 
    #+ ["u", "im", "us", "th", "st", "nd", "r", "rt", "f", "v", "x"]
)

old_text = "text"
text = "t"
hashtag = "hashtag"
at = "at"
href = "href"
target = "target"
keyword = "keyword"
location = "location"

y_cols = [target+"_0", target+"_1"]

2020-11-09 14:38:16,034 - __main__ - INFO - Python version: 3.8.0 (default, Oct 28 2019, 16:14:01) 
[GCC 8.3.0]
2020-11-09 14:38:16,034 - __main__ - INFO - Numpy version: 1.18.5
2020-11-09 14:38:16,034 - __main__ - INFO - Pandas version: 1.1.4
2020-11-09 14:38:16,035 - __main__ - INFO - Scikit-learn version: 0.23.2
2020-11-09 14:38:16,035 - __main__ - INFO - TensorFlow version: 2.3.0
2020-11-09 14:38:16,036 - __main__ - INFO - Plotly version: 4.12.0
2020-11-09 14:38:16,037 - __main__ - INFO - WordCloud version: 1.8.0
2020-11-09 14:38:16,037 - __main__ - INFO - tensorflow.random seed: 1


import tensorflow_hub as hub

module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [2]:
class LabelEncoderExt(preprocessing.LabelEncoder):
    """
    """
    def __init__(self):

        super().__init__()

    def fit(self, y):

        if not isinstance(y, np.ndarray):
            y = np.array(y)
        assert (len(y.shape) == 1), "Require 1D array"
        y = np.concatenate((y, np.array([UNK])))
        super().fit(y)

    def transform(self, y):

        y[~np.isin(y, self.classes_, assume_unique=True)] = UNK
        return super().transform(y)

    def fit_transform(self, y):

        self.fit(y)
        return self.transform(y)

In [3]:
data_bn = "data"
data_dir = os.path.abspath(
    os.path.join(__name__, os.pardir, os.pardir, data_bn)
)

log.info(f"Data directory: {data_dir}")

2020-11-09 14:38:16,088 - __main__ - INFO - Data directory: /home/jimmy/github/kaggle/nlp_disaster_tweets/data


In [4]:
train_bn = "train.csv"
test_bn = "test.csv"
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

In [5]:
df_train = pd.read_csv(train_fn)
df_test = pd.read_csv(test_fn)

log.info(f"Training data shape: {df_train.shape}")
log.info(f"Test data shape: {df_test.shape}")

2020-11-09 14:38:16,166 - __main__ - INFO - Training data shape: (7613, 5)
2020-11-09 14:38:16,166 - __main__ - INFO - Test data shape: (3263, 4)


In [6]:
solution_fn = os.path.join(data_dir, "socialmedia-disaster-tweets-DFE.csv")
df_X = pd.read_csv(solution_fn, sep=',', header=0, encoding = "ISO-8859-1")
df_X = df_X.rename({"tweetid": "id"}, axis=1).astype({"id": int})
df_X[target] = df_X["choose_one"].apply(lambda x: 1 if x=="Relevant" else 0)

In [7]:
df_test = df_test.merge(df_X[["id", "target"]], how="inner", left_on="id", right_on=df_X.index).rename({"target_y": target}, axis=1)

In [8]:
train_pts = df_train.shape[0]

In [9]:
df_train = pd.concat([df_train, df_test], ignore_index=True)
df_train = df_train.drop(["id_x", "id_y"], axis=1)

In [10]:
df_train.head(50)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
5,8,,,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1
6,10,,,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1
7,13,,,I'm on top of the hill and I can see a fire in the woods...,1
8,14,,,There's an emergency evacuation happening now in the building across the street,1
9,15,,,I'm afraid that the tornado is coming to our area...,1


df_test.head(40)

In [11]:
def to_lower(df):
    '''
    '''
    df[text] = df[text].apply(lambda x: x.casefold())
    
    return SUCCESS


def hash_handling(df):
    '''
    '''
    reg_hash_full = re.compile("(#)\w+")
    reg_hash = re.compile("(#)")
    
    f = lambda x: [y.group() for y in reg_hash_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[hashtag] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_hash.sub(' ', x))
    
    return SUCCESS


def at_handling(df):
    '''
    '''
    reg_at = re.compile("(@)")
    reg_at_full = re.compile("(@)\w+")
    
    f = lambda x: [y.group() for y in reg_at_full.finditer(x)]
    g = lambda x: ' '.join(x)
    
    df[at] = df[text].apply(f).apply(g)
    df[text] = df[text].apply(lambda x: reg_at_full.sub(' @ ', x))
    
    return SUCCESS


def count_at(df):
    """
    """
    df[at] = df[at].apply(lambda x: len(x.split()))
    
    return SUCCESS


def href_handling(df):
    '''
    '''
    reg_href_full = re.compile("(htt)\S+")
    
    f = lambda x: len(list(reg_href_full.finditer(x)))
    
    df[href] = df[text].apply(f)
    df[text] = df[text].apply(lambda x: reg_href_full.sub(' http ', x))
    
    return SUCCESS


def html_special_handling(df):
    '''
    '''
    reg_html = re.compile("(&)\w+(;)")
    df[text] = df[text].apply(lambda x: reg_html.sub(' html ', x))
    
    return SUCCESS
    
    
def xc2x89_byte_handling(df):
    '''
    '''
    reg_x89 = re.compile(b"\xc2\x89".decode('utf-8')+"\S+")
    df[text] = df[text].apply(lambda x: reg_x89.sub(' ', x))
    
    return SUCCESS
    
    
def special_char_handling(df):
    '''
    '''
    reg_special = re.compile("[^\w\s@]")
    df[text] = df[text].apply(lambda x: reg_special.sub(' ', x))
    df[text] = df[text].apply(lambda x: re.sub('_', ' ', x)) 
    
    return SUCCESS


def contraction_handling(df):
    '''
    '''
    reg_contract = re.compile("\s(s|m|t|(nt)|(ve)|w)\s")
    df[text] = df[text].apply(lambda x: reg_contract.sub(' ', x))
    
    return SUCCESS


def encode_numerals(df):
    '''
    '''
    reg_numerals = re.compile("\d+[\s\d]*")
    df[text] = df[text].apply(lambda x: reg_numerals.sub(' '+NUM+' ', x))
    
    return SUCCESS
    
    
def remove_stopwords(df):
    """
    """
    f = (lambda x: 
        ' '.join([y for y in x.strip().split() if y not in stopwords])
    )
    df[text] = df[text].apply(f)
    
    return SUCCESS   


def has_location(df):
    """
    """
    df[location] = df[location].apply(lambda x: 1 if pd.isnull(x) else 0)
    
    return SUCCESS


def preprocess(df):
    """
    """
    df[text] = df[old_text]
    df[keyword].fillna('', inplace=True)
    #df[text] = df[text] + " " + df[keyword]
    to_lower(df)
    hash_handling(df)
    at_handling(df)
    count_at(df)
    href_handling(df)
    html_special_handling(df)
    xc2x89_byte_handling(df)
    special_char_handling(df)
    contraction_handling(df)
    remove_stopwords(df)
    encode_numerals(df)
    has_location(df)
    
    return SUCCESS

In [12]:
_ = preprocess(df_train)

In [13]:
df_train.head(100)

Unnamed: 0,id,keyword,location,text,target,t,hashtag,at,href
0,1,,1,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1,deeds reason earthquake may allah forgive us,#earthquake,0,0
1,4,,1,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,,0,0
2,5,,1,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1,residents asked shelter place notified officers evacuation shelter place orders expected,,0,0
3,6,,1,"13,000 people receive #wildfires evacuation orders in California",1,num people receive wildfires evacuation orders california,#wildfires,0,0
4,7,,1,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1,got sent photo ruby alaska smoke wildfires pours school,#alaska #wildfires,0,0
5,8,,1,#RockyFire Update => California Hwy. 20 closed in both directions due to Lake County fire - #CAfire #wildfires,1,rockyfire update california hwy num closed directions due lake county fire cafire wildfires,#rockyfire #cafire #wildfires,0,0
6,10,,1,"#flood #disaster Heavy rain causes flash flooding of streets in Manitou, Colorado Springs areas",1,flood disaster heavy rain causes flash flooding streets manitou colorado springs areas,#flood #disaster,0,0
7,13,,1,I'm on top of the hill and I can see a fire in the woods...,1,top hill see fire woods,,0,0
8,14,,1,There's an emergency evacuation happening now in the building across the street,1,emergency evacuation happening building across street,,0,0
9,15,,1,I'm afraid that the tornado is coming to our area...,1,afraid tornado coming area,,0,0


In [14]:
def tokenize_dataframe(df, col, max_len=20):
    """
    """
    df_tmp = pd.DataFrame(df[col].apply(lambda x: reversed(x.split())).tolist())
    orig_len = len(df_tmp.columns)
    df_tmp = df_tmp.rename(
        lambda x: col+"_{:02d}".format(max_len-1-x), 
        axis=1
    )
    
    enum_cols = [col+"_{:02d}".format(i) for i in range(max_len)]
    if orig_len < max_len:
        compl_cols = [x for x in enum_cols if x not in df_tmp.columns]
        df_tmp[compl_cols] = np.nan

    df_merged = df.merge(
        df_tmp[enum_cols],
        how="outer",
        left_index=True,
        right_index=True
    )
    
    return df_merged, enum_cols


def filter_infrequent(df, cols, cutoff=5):
    """
    """
    unique_words, word_counts = (
        np.unique(df[cols].values.flatten(), return_counts=True)
    )
    infreq_dict = {
        x: (x if word_counts[i] >= cutoff else UNK)
            for i, x in np.ndenumerate(unique_words)
    }

    f = lambda x: infreq_dict[x]
    df[cols] = df[cols].applymap(f)
    
    return SUCCESS


def transform_data(df):
    """
    """
    _ = preprocess(df)
    df, text_cols = tokenize_dataframe(df, text, max_len=25)
    
    df[text_cols] = df[text_cols].fillna('')
    
    lemmatizer = WordNetLemmatizer() 
    ps = PorterStemmer()

    df[text_cols] = df[text_cols].applymap(lambda x: ps.stem(x))
    df[text_cols] = df[text_cols].applymap(lambda x: lemmatizer.lemmatize(x))

    _ = filter_infrequent(df, text_cols, cutoff=10)
        
    df, hash_cols = tokenize_dataframe(df, hashtag, max_len=3)
    df[hash_cols] = df[hash_cols].fillna('')

    _ = filter_infrequent(df, hash_cols, cutoff=5)
    
    return df, text_cols, hash_cols

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df[text].values)
tmp = tokenizer.texts_to_sequences(df[text].values)
tmp = pad_sequences(tmp)
text_cols = ["text_{:02d}".format(i) for i in range(tmp.shape[1])]
df[text_cols] = tmp


In [15]:
df_train, text_cols = transform_data(df_train)

ValueError: too many values to unpack (expected 2)

In [None]:
df_train.head(50)

wc_size = (12, 12)

tdf = df_train[df_train[target]==1]

unique_words, word_counts = (
    np.unique(tdf[text_cols].values.flatten(), return_counts=True)
)
sm = np.sum(word_counts)
frequency_dict = {
    x: word_counts[i]/sm 
        for i, x in np.ndenumerate(unique_words)
}
try:
    frequency_dict.pop(NUM)
except:
    pass
try:
    frequency_dict.pop(UNK)
except:
    pass

wordcloud = WordCloud(
    width=1000, height=1000, 
    background_color='white',
    min_font_size=10
).generate_from_frequencies(frequency_dict)
fig = plt.figure(figsize=wc_size, facecolor=None)
ax = fig.add_subplot()
a = ax.imshow(wordcloud) 

tdf = df_train[df_train[target]==0]
unique_words, word_counts = (
    np.unique(tdf[text_cols].values.flatten(), return_counts=True)
)
sm = np.sum(word_counts)
frequency_dict = {
    x: word_counts[i]/sm
    for i, x in np.ndenumerate(unique_words)
}
try:
    frequency_dict.pop(NUM)
except:
    pass
try:
    frequency_dict.pop(UNK)
except:
    pass

wordcloud = WordCloud(
    width=1000, height=1000, 
    background_color='white',
    min_font_size=10
).generate_from_frequencies(frequency_dict)
fig = plt.figure(figsize=wc_size, facecolor=None) 
ax = fig.add_subplot()
ret = ax.imshow(wordcloud) 

In [None]:
enc = LabelEncoderExt()
df_train[text_cols] = (enc
    .fit_transform(df_train[text_cols].values.flatten())
    .reshape(df_train[text_cols].shape)
)
num_unique_words = enc.classes_.shape[0]

log.info(f"Number of unique words: {num_unique_words}")

In [None]:
hash_enc = LabelEncoderExt()
df_train[hash_cols] = (hash_enc
    .fit_transform(df_train[hash_cols].values.flatten())
    .reshape(df_train[hash_cols].shape)
)
num_unique_hash = hash_enc.classes_.shape[0]

log.info(f"Number of unique hashtags: {num_unique_hash}")

In [None]:
key_enc = LabelEncoderExt()
df_train[keyword] = (key_enc
    .fit_transform(df_train[keyword])
)
num_unique_keywords = key_enc.classes_.shape[0]

log.info(f"Number of unique keywords: {num_unique_keywords}")

In [None]:
def most_freq_bigrams(df, enc, text_cols, top_n=10):
    """
    """
    mat = df[text_cols].values
    mat_stack = np.array([mat[:, :-1].flatten(), mat[:, 1:].flatten()])
    uniq_pairs, counts = np.unique(mat_stack, return_counts=True, axis=1)
    
    one = enc.transform(np.array([UNK]))[0]
    zero = enc.transform(np.array(['']))[0]
    
    a1 = np.where(~np.isin(uniq_pairs[0], [zero,one]))[0]
    a2 = np.where(~np.isin(uniq_pairs[1], [zero,one]))[0]
    slc = a1[np.where(np.isin(a1, a2))[0]]
    
    top_counts = pd.Series(counts[slc]).nlargest(top_n)
    top_pairs = np.flip(
        np.transpose(uniq_pairs[:, slc][:, top_counts.index]), axis=1
    )
    str_top_pairs = (
        enc.inverse_transform(top_pairs.flatten())
            .reshape(top_pairs.shape)
    )
    
    return top_counts, top_pairs, str_top_pairs

v0 = most_freq_bigrams(
    df_train[df_train[target]==0],
    enc, text_cols, top_n=50
)
v1 = most_freq_bigrams(
    df_train[df_train[target]==1],
    enc, text_cols, top_n=50
)

bigrams0 = np.array(['_'.join(x.tolist()) for x in v0[2]])
bigrams1 = np.array(['_'.join(x.tolist()) for x in v1[2]])

bigr_cnt0 = np.vstack([bigrams0, v0[0].values])
bigr_cnt1 = np.vstack([bigrams1, v1[0].values])

fig = go.Figure()
bar0 = go.Bar(name="Not disaster", x=bigr_cnt0[0], y=bigr_cnt0[1])
bar1 = go.Bar(name="Disaster", x=bigr_cnt1[0], y=bigr_cnt1[1])

fig.add_trace(bar0)
fig.add_trace(bar1)

fig.update_layout(barmode='group')

iplot(fig)

df_test, _, _ = transform_data(df_test)

df_test.head(5)

df_test[text_cols] = (enc
    .transform(df_test[text_cols].values.flatten())
    .reshape(df_test[text_cols].shape)
)
df_test[hash_cols] = (hash_enc
    .transform(df_test[hash_cols].values.flatten())
    .reshape(df_test[hash_cols].shape)
)
df_test[keyword] = (key_enc
    .transform(df_test[keyword])
)

df_test.head(10)

In [None]:
class MyModel(tf.keras.Model):
    """
    """
    def __init__(self,
            batch_size=32,
    ):
        """
        """
        self.inps = [
            (None, len(text_cols)),
            (None, len(hash_cols)),
            (None, 3), 
            (None, 1),
        ]
        self.bs = batch_size
        units = 30
        out_dim = 2
        
        super(MyModel, self).__init__()
        
        self._embed1 = tf.keras.layers.Embedding(
            num_unique_words,
            num_unique_words//200,
            input_length=self.inps[0][1],
            name="word_embedding",
            #trainable=False,
        )
            
        filters = 150
        window = 5
        
        self._conv1 = tf.keras.layers.Conv1D(
            filters,
            window
        )
        
        self._lstm1 = tf.keras.layers.LSTM(
            units,
            name="lstm1",
        )

        
        """
        self._lstm1 = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(
                units,
                name="lstm1",
            )
        )
        """
        
        '''
        self._embed2 = tf.keras.layers.Embedding(
            num_unique_hash,
            num_unique_hash//5,
            input_length=self.inps[1][1],
            name="hashtag_embedding",
        )        
        
        self._lstm2 = tf.keras.layers.GRU(
            units,
            name="lstm2"
        )
        
        self._embed3 = tf.keras.layers.Embedding(
            num_unique_keywords,
            num_unique_keywords//2,
            input_length=self.inps[2][1],
            name="keyword_embedding",
        )
        
        self._dense_red = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
        )
        
        self._dense_at = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
        )
        '''
        
        self._flatten = tf.keras.layers.Flatten()

        #self._cat = tf.keras.layers.Concatenate(axis=-1, name="Cat")
        
        '''
        self._dense1 = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
            name="dense",
        )
        '''
        
        self._dense2 = tf.keras.layers.Dense(
            out_dim,
            activation=tf.nn.softmax,
            name="final",
        )
        
        self._optimizer = tf.keras.optimizers.Adam(
                            learning_rate=0.0001
        )
        self._metrics = [tf.keras.metrics.BinaryAccuracy()]
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
        #self._loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
        #self._loss = tf.keras.losses.CategoricalHinge()
        self._loss = tf.keras.losses.KLDivergence()
        
        self.compile(
            optimizer=self._optimizer,
            loss=self._loss,
            metrics=self._metrics,
        )

        self.build(self.inps[0])
        

    #@tf.function
    def call(self, inputs):
        inp1 = inputs
        
        x1 = self._embed1(inp1)
        x1 = self._conv1(x1)
        y1 = self._lstm1(x1)
        #y1 = self._flatten(x1)
        
        #inp2 = inputs[1]
        #x2 = self._embed2(inp2)
        #y2 = self._lstm2(x2)
        
        #inp3 = inputs[2]
        #x4 = self._dense_at(inp3)
        
        #inp4 = inputs[3]
        #x3 = self._embed3(inp4)
        #x3 = self._flatten(x3)
        #y3 = self._dense_red(x3)
        
        #z = self._cat([y1, y2, y3, x4])
        #z = self._dense1(y1)
        out = self._dense2(y1)
        
        return out

In [None]:
model = MyModel()

model.summary()

In [None]:
tfboard_dir = "logs"
if not os.path.exists(tfboard_dir):
    os.mkdir(tfboard_dir)

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tfboard_dir,
    histogram_freq=1,
    write_graph=True,
    write_images=True,
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_binary_accuracy",
    min_delta=1e-5,
    patience=20,
    baseline=0.5,
    restore_best_weights=True,
)

In [None]:
df_test = df_train.iloc[train_pts:]
df_train = df_train.iloc[:train_pts]

log.info(f"Dataset size: {df_train.shape[0]}")

remainder = df_train.shape[0] % model.bs
pad_size = model.bs - remainder if remainder !=0 else 0
log.info(f"Remainder from batch size: {remainder}\n"
         f"Padding {pad_size} elements."
)

X1 = df_train[text_cols].values
X2 = df_train[hash_cols].values
X3 = df_train[[at, href, location]].values.astype(np.float64)
X4 = df_train[keyword].values.reshape((-1, 1))
if pad_size > 0:
    X1 = np.vstack([X1, np.zeros((pad_size, X1.shape[1]))])
    X2 = np.vstack([X2, np.zeros((pad_size, X2.shape[1]))])
    X3 = np.vstack([X3, np.zeros((pad_size, X3.shape[1]))])
    X4 = np.vstack([X4, np.zeros((pad_size, X4.shape[1]))])


df_train[y_cols] = pd.get_dummies(df_train[target], prefix=target)
Y = df_train[y_cols].values
Y_add = np.zeros((pad_size, 2))
Y_add[:, 0] = 1.0
if pad_size > 0:
    Y = np.vstack([Y, Y_add])

#X_val = (X1, X2, X3, X4)
X_val = X1

X = (
    tf.data.Dataset.from_tensor_slices((X_val, Y))
        .batch(model.bs, drop_remainder=True)
)

X = X.shuffle(buffer_size=10000)

q = 5
p = 1

select = lambda x, y: (x % q <= p)
nselect = lambda x, y: ~(x % q <= p)
take = lambda x, y: y

X_train = X
#X_train = X.enumerate().filter(nselect).map(take)
#X_valid = X.enumerate().filter(select).map(take)

In [None]:
log.info(f"Test dataset size: {df_test.shape}")

Z1 = df_test[text_cols].values
Z2 = df_test[hash_cols].values
Z3 = df_test[[at, href, location]].values.astype(np.float64)
Z4 = df_test[keyword].values.reshape((-1, 1))

X_test = (Z1, Z2, Z3, Z4)
X_test = Z1

In [None]:
df_test[y_cols] = pd.get_dummies(df_test[target], prefix=target)
Y_test = df_test[y_cols].values

In [None]:
X_valid = tf.data.Dataset.from_tensor_slices((X_test, Y_test)).batch(model.bs, drop_remainder=True)

In [None]:
hist = model.fit(
    X_train, 
    epochs=1000,
    validation_data=X_valid,
    callbacks=[
        #tensorboard_callback, 
        early_stopping
    ],
)

model.save(os.path.join(tfboard_dir, "model"))

In [None]:
Y_test = model.predict(X_test)
Y_pred = model.predict(X_val)

In [None]:
df_result = pd.DataFrame(Y_pred, columns=y_cols)
df_result = df_result.apply(np.round).astype({x: int for x in y_cols})
df_result[target] = df_result["target_1"]
df_result.drop(y_cols, inplace=True, axis=1)
df_result.drop(list(df_result.index[df_train.shape[0]:]), inplace=True, axis=0)
                           
df_pred = pd.DataFrame(Y_test, columns=y_cols)
df_pred = df_pred.apply(np.round).astype({x: int for x in y_cols})
df_pred[target] = df_pred["target_1"]
df_pred.drop(y_cols, inplace=True, axis=1)
df_pred["id"] = df_test["id"].values
df_pred = df_pred[["id", target]]

In [None]:
from sklearn.metrics import classification_report

log.info("\n" +
    classification_report(
        df_train[target],
        df_result[target],
        target_names=["Not disaster", "Disaster"]
    )
)

In [None]:
log.info("\n" +
    classification_report(
        df_test[target],
        df_pred[target],
        target_names=["Not disaster", "Disaster"]
    )
)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(df_train[target], df_result[target])

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(df_test[target], df_pred[target])

In [None]:
#df_train[[target]].join(df_result[[target]], lsuffix="true", rsuffix="pred").head(500)
#df_pred.to_csv(os.path.join(data_dir, "results.csv"), index=False)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir logs

In [None]:
#!kill 3444