In [1]:
import numpy as np
import pandas as pd

import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from keras import models, layers

import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
# fetch the competition archive, unpack it, and load the training split
import zipfile

# download the archive; get_file hands back the local path of the downloaded file
file_url = "https://www.dropbox.com/s/cbmrl3giec0q98l/nlp-getting-started.zip?dl=1"
file_location = keras.utils.get_file(fname='data.zip', origin=file_url)

# unpack every member of the archive into the current working directory
with zipfile.ZipFile(file_location) as archive:
    archive.extractall()

# load the training CSV (expected to come out of the archive) into a DataFrame
train_data = pd.read_csv('train.csv')

Downloading data from https://www.dropbox.com/s/cbmrl3giec0q98l/nlp-getting-started.zip?dl=1


In [0]:
# load the tutorial (IMDB reviews) dataset -- currently disabled, kept for reference only
# tut_dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
# tut_train_dataset, tut_test_dataset = tut_dataset['train'], tut_dataset['test']

In [0]:
# build a subword encoder whose vocabulary is learned from the training tweets
target_vocab_size = 31000  # requested (approximate) vocabulary size
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    corpus_generator=train_data["text"],
    target_vocab_size=target_vocab_size,
)

In [0]:
# run every training text through the encoder built above (one list of ids per text)
encodings = [encoder.encode(text) for text in train_data["text"]]

# pack the per-text id lists into a ragged tensor, which allows rows of differing length
encodings_tensor = tf.ragged.constant(encodings)
# encodings_tensor = encodings_tensor.to_tensor() # convert encodings tensor from ragged to dense

In [0]:
# create dataset from encoded text and 'target' column
dataset = tf.data.Dataset.from_tensor_slices((encodings_tensor, train_data["target"]))

# Shuffle ONCE, in a fixed order.
# By default `shuffle` produces a new permutation every time the dataset is iterated,
# so the `take`/`skip` split below would pick different examples on every epoch and the
# 'training' and 'test' sets would overlap (test examples leak into training).
# `reshuffle_each_iteration=False` plus a fixed seed keeps the permutation stable, which
# makes the two splits disjoint and reproducible.
SPLIT_SEED = 42
dataset = dataset.shuffle(8000, seed=SPLIT_SEED, reshuffle_each_iteration=False)

# split into 'training' (first 5000 shuffled examples) and 'test' (the remainder) sets
train_dataset, test_dataset = dataset.take(5000), dataset.skip(5000)

In [0]:
BUFFER_SIZE = 8000
BATCH_SIZE = 64

# training split: shuffle, then group into batches of BATCH_SIZE
# (plain `batch` is used here; no explicit padding step is applied)
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# test split: batching only, no additional shuffle
test_dataset = test_dataset.batch(BATCH_SIZE)

In [0]:
# create model: token embedding -> bidirectional LSTM -> dense head.
# The final Dense(1) has no activation, so the model outputs a raw logit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

# Because the output is a logit, the loss is told so via from_logits=True, and the
# accuracy metric must threshold at 0.0 (logit 0 == probability 0.5). The plain
# 'accuracy' string would threshold the logits at 0.5 and under-count positives.
# The metric is still named 'accuracy' so the keys in the training history are unchanged.
model.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [0]:
# fit the model on the training batches; validation reads from test_dataset and is
# limited to 30 steps
history = model.fit(
    x=train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30,
)

In [0]:
# load the rows to predict on
test_data = pd.read_csv("test.csv")

# collect one (id, predicted label) pair per row of the test file
ids = []
targets = []
for tweet_id, tweet_text in zip(test_data["id"], test_data["text"]):
    # encode the text and run the model on this single example
    logit = model.predict([encoder.encode(tweet_text)])[0][0]

    # record the id together with the label: 0 when the output is <= 0, otherwise 1
    ids.append(tweet_id)
    targets.append(0 if logit <= 0 else 1)

In [0]:
# assemble the collected ids and predicted labels into a two-column frame
id_column = pd.Series(ids)
target_column = pd.Series(targets)
final_dataframe = pd.DataFrame({"id": id_column, "target": target_column})

In [0]:
# write the two submission columns to disk, leaving out the DataFrame index
submission_columns = ["id", "target"]
final_dataframe.to_csv("submission.csv", index=False, columns=submission_columns)

In [0]:
# read the written file back in so its contents can be inspected
submission = pd.read_csv(filepath_or_buffer="submission.csv")

In [72]:
# print the layer-by-layer summary (output shapes and parameter counts) of `model`
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 64)          1977088   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 2,051,457
Trainable params: 2,051,457
Non-trainable params: 0
_________________________________________________________________
