In [171]:
 pip install tensorflow-datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [172]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter

import re
# import string

[nltk_data] Downloading package stopwords to /Users/kevin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [173]:
# set display settings
# pd.set_option("max_columns", None)
# pd.set_option('max_colwidth', None)
# pd.set_option("expand_frame_repr", False)

In [174]:
# HELPER FUNCTIONS
tokenizer = tfds.deprecated.text.Tokenizer()

# clean the text
def pre_processor(text):
    # no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[\W]+', ' ', text.lower())

def split_X_y(df):
  y = df.pop("vocabulary")
  # y['text_id'] = df['text_id']
  X = df.drop(['cohesion', 'syntax', 'phraseology', 'grammar', 'conventions'], axis=1)
  return X, y


In [175]:
# LOAD DATA FROM CSV FILES
def get_data(train_path, random_state):
  tf.random.set_seed(random_state)

  # prepare the training and testing and data
  df_raw = pd.read_csv(train_path)
  X, y = split_X_y(df_raw)

  # clean the data
  X = clean_data(X)

  # store the dataframe as a tensorflow dataset
  data_tf = tf.data.Dataset.from_tensor_slices(
      (X['full_text'].values, y.values)
  )

  # separate the data into train, test, and validation groups
  splits=[0.6, 0.2, 0.2]
  data_tf_test = data_tf.take(int(df_raw.shape[0]*splits[2]))
  data_tf_train_valid = data_tf.skip(int(df_raw.shape[0]*splits[2]))
  data_tf_train = data_tf_train_valid.take(int(df_raw.shape[0]*splits[0]))
  data_tf_valid = data_tf_train_valid.skip(int(df_raw.shape[0]*splits[0]))

  # return X_train, X_test, y_train, y_test
  return data_tf_test, data_tf_train, data_tf_valid

In [176]:
# CLEAN DATA
def clean_data(X):
    X['full_text'] = X['full_text'].apply(pre_processor)

    # count words
    X['words'] = X['full_text'].apply(lambda x: [word for word in x.split()])
    # df['num_full_words'] = df['full_words'].apply(lambda x: len(x))
    # df['num_cleaned_words'] = df['cleaned_text'].apply(lambda x: len(x))
    # remove stop words
    stop_words = set(stopwords.words('english'))
    X['words'] = X['words'].apply(lambda x: [word for word in x if word not in stop_words])
    X['num_words'] = X['words'].apply(lambda x: len(x))
    # df.head()
    # df.hist(column='num_words')
    X['full_text'] = X['words'].apply(lambda x: " ".join(x))
    return X

In [186]:
# CREATE VOCABULARY BY COUNTING WORD OCCURRENCES
def get_vocabulary(data_tf_train):
  # try and except the TF tokenizer
  # try:
  #     tokenizer = tfds.features.text.Tokenizer()
  # except AttributeError:
  tokenizer = tfds.deprecated.text.Tokenizer()

  # create an instance of the Counter class
  token_counts = Counter()

  for example in data_tf_train:
      tokens = tokenizer.tokenize(example[0].numpy())
      token_counts.update(tokens)
      
  print('Size of training vocabulary:', len(token_counts))
  return token_counts
  # display(token_counts)

In [190]:
# CREATE EMBEDDING BY ENCODING TEXT
def get_encoding(data_tf_train, data_tf_valid, data_tf_test, token_counts):
  # define function for token encoder
  def encode(text_tensor, label):
      text = text_tensor.numpy()[0]
      encoded_text = encoder.encode(text)
      return encoded_text, label

  #  wrap the encode function to a TF Operator
  def encode_map_fn(text, label):
      return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

  # # create an instance of the TF encoder class
  # try:
  #     # token_counts contains our training vocabulary
  #     encoder = tfds.features.text.TokenTextEncoder(token_counts)
  # except AttributeError:
  #     encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)
  encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

  data_tf_train = data_tf_train.map(encode_map_fn, encoder)
  data_tf_valid = data_tf_valid.map(encode_map_fn, encoder)
  data_tf_test = data_tf_test.map(encode_map_fn, encoder)
  return data_tf_train, data_tf_valid, data_tf_test

In [191]:
# # BATCH DATA
# def get_batches(batch_size, X_train, X_test, y_train):
#   data_tf_train = data_tf_train.map(encode_map_fn)
#
#   data_tf = tf.data.Dataset.from_tensor_slices(
#       (X_train[['encoding']].values, y_train.values)
#   )
#
#   # # divide the data into mini-batches of size 3
#   # data_tf_batched = X_train.padded_batch(3, padded_shapes=([-1], []))
#
#   # for batch in data_tf_batched:
#   #     print('Batch dimension:', batch[0].shape)
#
#   # for example in data_tf_batched.take(1):
#   #     print(example[0].numpy())
#   # return train_data, test_data
#   return 0, 0


In [192]:
def main():
  # initialize paths to csv files
  train_path = 'train.csv'

  # ensure the state is repeatable
  random_state=16

  # define batch size
  batch_size = 32

  # get the dataframes
  data_tf_test, data_tf_train, data_tf_valid = get_data(train_path, random_state)

  # get the unique words used
  token_counts = get_vocabulary(data_tf_train)
  # display(token_counts)

  data_tf_train, data_tf_valid, data_tf_test = get_encoding(data_tf_train, data_tf_valid, data_tf_test, token_counts)

  for example in data_tf_train.take(2):
    print ('----------------------------')
    print('Sequence length:', example[0].shape)
    print('Integer sequence:\n', example[0].numpy())


  # # inspection of the first 5 examples
  # for example in data_tf_test.take(5):
  #     # print review (first 40 characters) and sentiment (label)
  #     tf.print(example[0], example[1])
  #
  # # inspection of the first 5 examples
  # for example in data_tf_train.take(5):
  #     # print review (first 40 characters) and sentiment (label)
  #     tf.print(example[0], example[1])
  #
  # # inspection of the first 5 examples
  # for example in data_tf_valid.take(5):
  #     # print review (first 40 characters) and sentiment (label)
  #     tf.print(example[0], example[1])


main()

Size of training vocabulary: 16020


ValueError: Attempt to convert a value (<TokenTextEncoder vocab_size=16022>) with an unsupported type (<class 'tensorflow_datasets.core.deprecated.text.text_encoder.TokenTextEncoder'>) to a Tensor.

In [None]:
# divide the data into mini-batches of size 3

data_tf_batched = data_tf_train_subset.padded_batch(3, padded_shapes=([-1], []))

for batch in data_tf_batched:
    print('Batch dimension:', batch[0].shape)

for example in data_tf_batched.take(1):
    print(example[0].numpy())