In [2]:
import os, re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Raise the TensorFlow logger threshold so only ERROR-level records are emitted.
tf.get_logger().setLevel('ERROR')
# NOTE(review): presumably makes tensorflow_hub read models uncompressed from
# remote storage instead of downloading an archive — confirm against the
# TF Hub caching documentation.
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

In [35]:
# Read the training and test CSV files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

In [4]:
# Display the column labels of the training frame.
train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [36]:
# Class balance: how many rows carry each `target` value.
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [37]:
# Balance the classes by drawing 3000 rows for each `target` value.
# A fixed random_state makes the draw repeatable, so Restart & Run All
# selects the same rows (and therefore the same downstream split) every time.
train = train.groupby('target').sample(n = 3000, random_state = 42)

In [114]:
# Concatenate keyword, location and tweet text into a single model input.
# Any missing keyword/location would make the whole concatenation NaN
# (NaN + str is NaN), and casting that with str() yields the literal "nan",
# discarding the tweet text. Filling missing pieces with '' first keeps the
# text of every row.
train['full_text'] = (
    train.keyword.fillna('') + " " + train.location.fillna('') + " " + train.text.fillna('')
).str.strip().astype(str)

In [115]:
# Shuffle once with a fixed seed, then cut 80% / 20% into train / validation.
# Plain .iloc slicing replaces np.split, which on a DataFrame goes through a
# deprecated pandas code path; .copy() gives each partition its own data so
# later column assignments cannot raise SettingWithCopyWarning.
shuffled_train = train.sample(frac = 1, random_state = 42)
split_at = int(0.8 * len(shuffled_train))
train_df = shuffled_train.iloc[:split_at].copy()
val_df = shuffled_train.iloc[split_at:].copy()

In [116]:
def clean_text(dataframe, column = 'full_text'):
    """Return a copy of `dataframe` with the text in `column` normalised.

    Steps, in order: lower-case, drop URLs (``http\\S+``), replace every run
    of non-word characters with one space, drop digits, then collapse
    repeated whitespace and strip the ends.

    The previous version called ``Series.apply`` for each step but threw the
    results away, so it returned the frame unchanged. This version stores
    the cleaned text in the returned frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text column. It is not modified.
    column : str, default 'full_text'
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        New frame, identical to the input except for the cleaned `column`.

    Raises
    ------
    KeyError
        If `column` is not present in `dataframe`.
    """
    cleaned = (
        dataframe[column]
        .fillna('')            # missing text becomes empty instead of crashing
        .astype(str)
        .str.lower()           # lower-case first so 'HTTP://...' is caught below
        .str.replace(r'http\S+', '', regex = True)
        .str.replace(r'\W+', ' ', regex = True)
        .str.replace(r'\d+', '', regex = True)
        .str.replace(r'\s+', ' ', regex = True)   # digit removal can leave double spaces
        .str.strip()
    )
    return dataframe.assign(**{column: cleaned})

In [117]:
# Run the same text normalisation over the training and validation partitions.
cleaned_train, cleaned_val = (clean_text(partition) for partition in (train_df, val_df))

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [148]:
# Vocabulary cap handed to the tokenizer, and the embedding width used by the model.
vocab_size, embedding_dim = 1000, 100

# The tokenizer is fitted on the training text only; unseen words map to "<OOV>".
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<OOV>")
tokenizer.fit_on_texts(cleaned_train.full_text)

# Encode both partitions as lists of integer word indices.
train_sequence, val_sequence = (
    tokenizer.texts_to_sequences(partition.full_text)
    for partition in (cleaned_train, cleaned_val)
)

# Pad / truncate at the end so every row matches the longest training sequence.
maxlen = max(map(len, train_sequence))
train_padded = pad_sequences(train_sequence, maxlen = maxlen, padding = 'post', truncating = 'post')
val_padded = pad_sequences(val_sequence, maxlen = maxlen, padding = 'post', truncating = 'post')

In [149]:
# Clear Keras' global session state before the model is (re)built in the next cell.
tf.keras.backend.clear_session()

In [150]:
# Binary text classifier: embedding -> BiLSTM -> small dense head -> sigmoid.
# The inputs are post-padded with index 0, so mask_zero=True is set on the
# embedding; the mask is propagated to the LSTM, which then skips padded
# timesteps instead of reading them as real tokens.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, mask_zero = True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)),
    tf.keras.layers.Dense(25, activation = 'relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation = 'sigmoid'),  # single probability output
])

2023-07-16 00:19:12.425561: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-16 00:19:12.429599: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-16 00:19:12.432142: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [151]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked per epoch.
model.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
    metrics = ['accuracy'],
)

In [152]:
# Stop once val_loss has not improved for 3 consecutive epochs.
# restore_best_weights=True rolls the model back to the epoch with the lowest
# val_loss; without it the model keeps the weights of the last epoch run,
# i.e. three epochs past its best validation score.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                                  patience = 3,
                                                  restore_best_weights = True)

In [153]:
# Train for at most 10 epochs, scoring the held-out validation split after
# each one; the early-stopping callback may end training sooner.
model.fit(x = train_padded,
          y = train_df.target,
          epochs = 10,
          callbacks = [early_stopping],
          validation_data = (val_padded, val_df.target))

Epoch 1/10


2023-07-16 00:19:23.905596: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-16 00:19:23.909236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-16 00:19:23.911684: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2023-07-16 00:19:35.000069: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-07-16 00:19:35.003603: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-07-16 00:19:35.005765: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


<keras.callbacks.History at 0x137fbb490>

In [79]:
# Build the test features exactly like the training features: the model was
# fitted on keyword + location + text, so the test set needs the same
# `full_text` column. Without it clean_text() fails on a fresh kernel, and
# tokenising `text` alone would feed the model a different input than it
# was trained on. assign() leaves the raw `test` frame untouched.
test_features = test.assign(
    full_text = (
        test.keyword.fillna('') + " " + test.location.fillna('') + " " + test.text.fillna('')
    ).str.strip().astype(str)
)
cleaned_test = clean_text(test_features)
test_sequence = tokenizer.texts_to_sequences(cleaned_test.full_text)
# Same length and padding scheme as the training inputs.
test_padded = pad_sequences(test_sequence, maxlen, padding = 'post', truncating = 'post')

In [91]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 labels.
test_predictions = np.greater(model.predict(test_padded), 0.5).astype(int)



In [95]:
# Take the first (only) column of each prediction row as a flat list of labels.
test_labels = list(test_predictions[:, 0])

In [99]:
# Pair every test id with its predicted label, one (id, target) row per tweet.
submission_rows = list(zip(cleaned_test.id, test_labels))
submission_df = pd.DataFrame(submission_rows, columns = ['id', 'target'])

In [101]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf = 'submission.csv', index = False)