<a href="https://colab.research.google.com/github/kp425/nlp_lab/blob/master/Creating_kaggle_ds_from_tfrecords_imdb_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the nlp_lab helper repository so that `nlp_lab.utils` can be imported in the next cell.
!git clone https://github.com/kp425/nlp_lab.git

Cloning into 'nlp_lab'...
remote: Enumerating objects: 76, done.[K
remote: Counting objects: 100% (76/76), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 76 (delta 25), reused 41 (delta 9), pack-reused 0[K
Unpacking objects: 100% (76/76), done.


In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from nlp_lab.utils import WordTokenizer 
import numpy as np
import os
import re
import shutil
import string
import glob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Shorthand used for the num_parallel_reads / num_parallel_calls arguments of the tf.data pipelines.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
# Download the Stanford IMDB review archive into the current directory and unpack it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# Path of the 'aclImdb' folder located next to the downloaded file.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
def getfiles(folder):
    """Return the paths of every ``*.txt`` file directly inside ``folder``.

    The order is whatever ``glob.glob`` yields; no sorting is applied.
    """
    pattern = "{}/*.txt".format(folder)
    return glob.glob(pattern)

# Training split: one text line per element, labelled 1.0 (positive) or 0.0 (negative).
pos_path = '/content/aclImdb/train/pos'
neg_path = '/content/aclImdb/train/neg'

pos_files = getfiles(pos_path)
neg_files = getfiles(neg_path)

pos_ds = tf.data.TextLineDataset(
    pos_files, num_parallel_reads=AUTOTUNE
).map(lambda line: (line, 1.0), num_parallel_calls=AUTOTUNE)

neg_ds = tf.data.TextLineDataset(
    neg_files, num_parallel_reads=AUTOTUNE
).map(lambda line: (line, 0.0), num_parallel_calls=AUTOTUNE)

# All positive examples come first, followed by all negative ones.
ds = pos_ds.concatenate(neg_ds)

In [None]:
# Test split, built the same way as the training split (note: reuses the pos_*/neg_* names).
pos_path = '/content/aclImdb/test/pos'
neg_path = '/content/aclImdb/test/neg'

pos_files = getfiles(pos_path)
neg_files = getfiles(neg_path)

pos_ds = tf.data.TextLineDataset(
    pos_files, num_parallel_reads=AUTOTUNE
).map(lambda line: (line, 1.0), num_parallel_calls=AUTOTUNE)

neg_ds = tf.data.TextLineDataset(
    neg_files, num_parallel_reads=AUTOTUNE
).map(lambda line: (line, 0.0), num_parallel_calls=AUTOTUNE)

# All positive examples come first, followed by all negative ones.
test_ds = pos_ds.concatenate(neg_ds)

In [None]:
# Passed to TextVectorization below as max_tokens / output_sequence_length.
# None leaves both unset here; padding is applied later with padded_batch.
max_tokens = None
sequence_length = None

def custom_standardization(input_data):
    """Lowercase the text, turn '<br />' tags into spaces and delete punctuation."""
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_pattern, '')
def vectorize_text(seq, label):
    """Encode one raw string with the module-level ``vectorize_layer``.

    Returns ``(seq, seq_len, enc_seq, label)`` where ``enc_seq`` is the layer
    output for the string expanded on its last axis and ``seq_len`` is the
    size of ``enc_seq``'s last dimension.
    """
    enc_seq = vectorize_layer(tf.expand_dims(seq, -1))
    return seq, tf.shape(enc_seq)[-1], enc_seq, label

# Integer-encoding layer that uses the custom standardization defined above.
vectorize_layer = TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=max_tokens,
    standardize=custom_standardization)

# adapt() builds the vocabulary from the training text only, so labels are dropped first.
full_text = ds.map(lambda review, _label: review)
vectorize_layer.adapt(full_text)

# NOTE: `ds` / `test_ds` are rebound here to 4-tuples (seq, seq_len, enc_seq, label);
# re-running this cell on its own would feed those 4-tuples to the 2-argument lambda above.
ds = ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)


# Pad the encoded sequences: pull out enc_seq (dropping its leading axis), pad inside
# one batch of up to 25000 elements, then unbatch back to individual padded sequences.

pad_ds = (
    ds.map(lambda x, y, z, w: tf.squeeze(z, axis=0))   # keep only the encoded sequence
    .padded_batch(25000)                                # pad within the batch
    .unbatch()                                          # back to single elements, padding kept
    .cache()
)

pad_test_ds = (
    test_ds.map(lambda x, y, z, w: tf.squeeze(z, axis=0))
    .padded_batch(25000)
    .unbatch()
    .cache()
)



In [None]:
# Swap the unpadded encoded sequences for their padded counterparts.

# 1) Drop the unpadded enc_seq, keeping (seq, seq_len, label).
_ds = ds.map(lambda seq, seq_len, enc_seq, label: (seq, seq_len, label))
_test_ds = test_ds.map(lambda seq, seq_len, enc_seq, label: (seq, seq_len, label))

# 2) Pair every element with the padded sequence at the same position.
_ds = tf.data.Dataset.zip((_ds, pad_ds))
_test_ds = tf.data.Dataset.zip((_test_ds, pad_test_ds))

# 3) Restore the (seq, seq_len, enc_seq, label) field order.
_ds = _ds.map(lambda meta, padded: (meta[0], meta[1], padded, meta[2])).cache()
_test_ds = _test_ds.map(lambda meta, padded: (meta[0], meta[1], padded, meta[2])).cache()

In [None]:
# Split the 25000 training examples into train / validation parts.
train_val_split = 0.9
ds_size = 25000

train_split = int(train_val_split * ds_size)
val_split = ds_size - train_split

# reshuffle_each_iteration=False is required here: by default shuffle() produces a new
# permutation every time the dataset is iterated, so take() and skip() would each see a
# different ordering and the train and validation sets could share examples.
_ds = _ds.shuffle(100000, seed=101, reshuffle_each_iteration=False)
train_ds = _ds.take(train_split)
val_ds = _ds.skip(train_split).take(val_split)



In [None]:
# Sanity check: number of elements in each part of the split.
print(sum(1 for _ in train_ds))
print(sum(1 for _ in val_ds))

22500
2500


In [None]:
# Sanity check: number of elements in the test dataset.
print(sum(1 for _ in test_ds))

25000


In [None]:
def _bytes_feature(value):
    """Wrap ``value`` in a single-element ``bytes_list`` ``tf.train.Feature``.

    If ``value`` is an eager tensor it is first converted with ``.numpy()``.
    """
    eager_tensor_cls = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, eager_tensor_cls) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
    """Wrap ``value`` in a single-element ``float_list`` ``tf.train.Feature``."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int_feature(value):
    """Wrap ``value`` in a single-element ``int64_list`` ``tf.train.Feature``."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)


def serialize_to_tfr(seq, seqlen, enc_seq, label):
    """Serialize one ``(seq, seqlen, enc_seq, label)`` element to a scalar ``tf.string``.

    The string holds a ``tf.train.Example`` with the features 'seq' and
    'enc_seq' (tensors serialized with ``tf.io.serialize_tensor``), 'seq_len'
    (int64) and 'label' (float). The proto is built inside ``tf.py_function``
    and the result is reshaped to a scalar.
    """
    def _build_example(text, length, encoded, target):
        features = tf.train.Features(feature={
            'seq': _bytes_feature(tf.io.serialize_tensor(text)),
            'seq_len': _int_feature(length),
            'enc_seq': _bytes_feature(tf.io.serialize_tensor(encoded)),
            'label': _float_feature(target),
        })
        return tf.train.Example(features=features).SerializeToString()

    serialized = tf.py_function(_build_example,
                                (seq, seqlen, enc_seq, label),
                                tf.string)
    return tf.reshape(serialized, ())



# Turn every (seq, seq_len, enc_seq, label) element into a serialized Example string.
# NOTE: test_ds is rebound from the padded `_test_ds`, not from the earlier `test_ds`.
train_ds = train_ds.map(serialize_to_tfr)
val_ds = val_ds.map(serialize_to_tfr)
test_ds = _test_ds.map(serialize_to_tfr)



# Output directory and file names for the three TFRecord files.
folder = "/content/imdb/"
os.makedirs(folder, exist_ok=True)

train_name = "train.tfrecord"
val_name = "val.tfrecord"
test_name = "test.tfrecord"

# Write train, val and test in that order, printing a step marker before each write.
for step_label, record_name, record_ds in (
        ('1', train_name, train_ds),
        ('2', val_name, val_ds),
        ('3', test_name, test_ds)):
    print(step_label)
    writer = tf.data.experimental.TFRecordWriter(os.path.join(folder, record_name))
    writer.write(record_ds)


1
2
3


In [None]:
import json

# Save the token -> integer id mapping (position in the layer's vocabulary) as JSON.
vocab = vectorize_layer.get_vocabulary()
tokens = dict(zip(vocab, range(len(vocab))))

with open(folder + 'tokens.json', 'w') as token_file:
    json.dump(tokens, token_file)



In [None]:
def parse_from_tfr(element):
    """Decode one serialized Example into ``(seq, seq_len, enc_seq, label)``.

    'seq' is parsed back as a string tensor and 'enc_seq' as an int64 tensor
    that is then cast to int32; 'seq_len' and 'label' are returned as parsed.
    """
    feature_description = {
        'seq': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'seq_len': tf.io.FixedLenFeature([], tf.int64, default_value=0),
        'enc_seq': tf.io.FixedLenFeature([], tf.string, default_value=''),
        'label': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
    }

    parsed = tf.io.parse_example(element, feature_description)
    seq = tf.io.parse_tensor(parsed['seq'], out_type=tf.string)
    enc_seq = tf.cast(
        tf.io.parse_tensor(parsed['enc_seq'], out_type=tf.int64), tf.int32)

    return seq, parsed['seq_len'], enc_seq, parsed['label']


# Read each TFRecord file back and count its records as a round-trip check.
te = tf.data.TFRecordDataset(['/content/imdb/test.tfrecord']).map(parse_from_tfr)
print(sum(1 for _ in te))


tr = tf.data.TFRecordDataset(['/content/imdb/train.tfrecord']).map(parse_from_tfr)
print(sum(1 for _ in tr))

v = tf.data.TFRecordDataset(['/content/imdb/val.tfrecord']).map(parse_from_tfr)
print(sum(1 for _ in v))

25000
22500
2500


In [None]:
# Show the four fields of one test record, then of two training records.
for example in te.take(1):
    for field in example:
        print(field)
print("\n")
for example in tr.take(2):
    for field in example:
        print(field)

tf.Tensor(b'Late one night on Tom Snyder\'s "Tomorrow" Show, I watched Tom ask his guest Henry Morgan what he considered to be \'perfect.\' Morgan responded, "Anything with Glenda Jackson." And although I wouldn\'t consider this film to be perfect, it does bear out that notion very well. I was about to use the clich\xc3\xa9\' about Hollywood not making pictures like this anymore, but then I just saw, "Up in the Air," another intelligent film about 2 people over the age of 35 who fall in love. That\'s where the similarities end, though. "House Calls" is just sheer fun watching 2 pros like Matthau and Jackson hit it off and seem completely natural while they\'re at it. I saw this film in the theater in 1978 (at the ripe old age of 18) and it took me another 20 years to get all of the jokes. Any film that can make punch lines out of 1920\'s tennis great Bill Tilden, and British Prime Minister Neville Chamberlain wouldn\'t play too well at the megaplex these days. One other thought: the or

In [None]:
# Decode one encoded sequence back to text to check it matches the raw review.
id2vocab = {token_id: token for token, token_id in tokens.items()}

sen = []

# The original loop iterated over `t`, which is never defined in this notebook and
# raises NameError on a fresh kernel. `te` (the parsed test records) is used instead.
for i in te.skip(12499).take(1):
    print(i[0])
    # i[2] is the padded encoded sequence; map each id back to its token.
    sen.extend(id2vocab[int(j)] for j in i[2].numpy())

print(' '.join(sen))

tf.Tensor(b"I thought this movie was fun. I have never really watched old movies before and this one was a really great first date film. It had warmth and heart and spirit. Was kind of cheesy but in today's film industry, cheesy is cute. I gave it a ten and I highly suggest renting, buying or seeing the movie anyway you can. Gene Kelly was very dreamy and a little bit sarcastic and you knew the character thought that he was gonna have it all. The female lead was cast perfect because their two personalities had spark and you wanted to hold on and see what would happen. The grandma in the movie was priceless. The perfect addition to a great old movie. I love the fact it was black and white and Gene Kelly is so sweet with all the kids in the movie that you can't help liking him. See It.", shape=(), dtype=string)
i thought this movie was fun i have never really watched old movies before and this one was a really great first date film it had warmth and heart and spirit was kind of cheesy bu

# Now create a Kaggle dataset from the TFRecords

In [None]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [None]:
# Switch to the output folder and generate the dataset-metadata.json template there.
# NOTE: %cd changes the working directory for all later cells.
%cd /content/imdb
! kaggle datasets init

/content/imdb
Data package template written to: /content/imdb/dataset-metadata.json


In [None]:
# Upload everything in /content/imdb as a new Kaggle dataset.
! kaggle datasets create -p /content/imdb

Starting upload for file val.tfrecord
100% 50.3M/50.3M [00:09<00:00, 5.44MB/s]
Upload successful: val.tfrecord (50MB)
Starting upload for file test.tfrecord
100% 458M/458M [00:29<00:00, 16.3MB/s]
Upload successful: test.tfrecord (458MB)
Starting upload for file tokens.json
100% 2.10M/2.10M [00:08<00:00, 254kB/s]
Upload successful: tokens.json (2MB)
Starting upload for file train.tfrecord
100% 453M/453M [00:29<00:00, 16.2MB/s]
Upload successful: train.tfrecord (453MB)
Your private Dataset is being created. Please check progress at https://www.kaggle.com/loveyoutoo/imdbsentiment
