In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
import tensorflow_datasets as tfds
from collections import Counter

import re
# import string

In [43]:
# set display settings
# pd.set_option("max_columns", None)
# pd.set_option('max_colwidth', None)
# pd.set_option("expand_frame_repr", False)

In [37]:
# HELPER FUNCTIONS

# define function for token encoder
def encode(text_tensor, label):
    text = text_tensor.numpy()[0]
    encoded_text = encoder.encode(text)
    return encoded_text, label

#  wrap the encode function to a TF Operator
def encode_map_fn(text, label):
    return tf.py_function(encode, inp=[text, label], Tout=(tf.int64, tf.int64))

# clean the text
def pre_processor(text):
    # no_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub('[\W]+', ' ', text.lower())

def split_X_y(df):
  y = df.pop("vocabulary")
  # y['text_id'] = df['text_id']
  X = df.drop(['cohesion', 'syntax', 'phraseology', 'grammar', 'conventions'], axis=1)
  return X, y


In [32]:
# LOAD DATA FROM CSV FILES
def get_data(train_path, random_state):
  tf.random.set_seed(random_state)

  # prepare the training and testing and data
  df_train_raw = pd.read_csv(train_path)
  X, y = split_X_y(df_train_raw)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

  return X_train, X_test, y_train, y_test

In [25]:
# CLEAN DATA
def clean_data(X):
    X['full_text'] = X['full_text'].apply(pre_processor)
    return X

In [26]:
# CREATE VOCABULARY BY COUNTING WORD OCCURRENCES
def get_vocabulary(X_train):
  # count words
  X_train['words'] = X_train['full_text'].apply(lambda x: [word for word in x.split()])
  # df['num_full_words'] = df['full_words'].apply(lambda x: len(x))
  # df['num_cleaned_words'] = df['cleaned_text'].apply(lambda x: len(x))
  # remove stop words
  stop_words = set(stopwords.words('english'))
  X_train['words'] = X_train['words'].apply(lambda x: [word for word in x if word not in stop_words])
  X_train['num_words'] = X_train['words'].apply(lambda x: len(x))
  # df.head()
  # df.hist(column='num_words')

  # try and except the TF tokenizer
  try:
      tokenizer = tfds.features.text.Tokenizer()
  except AttributeError:
      tokenizer = tfds.deprecated.text.Tokenizer()

  # create an instance of the Counter class
  token_counts = Counter()

  for x in X_train['full_text']:
      tokens = tokenizer.tokenize(x)
      token_counts.update(tokens)
      
  print('Size of training vocabulary:', len(token_counts))
  return token_counts
  # display(token_counts)

In [27]:
# CREATE EMBEDDING BY ENCODING TEXT
def get_encoding(X, token_counts):
  # create an instance of the TF encoder class
  try:
      # token_counts contains our training vocabulary
      encoder = tfds.features.text.TokenTextEncoder(token_counts)
  except AttributeError:
      encoder = tfds.deprecated.text.TokenTextEncoder(token_counts)

  # X['encoding'] = X['full_text'].apply(lambda x: encoder.encode(x))
  # return X
  data_tf_train = X.map(encode_map_fn)
  return data_tf_train

In [28]:
# BATCH DATA
def get_batches(batch_size, X_train, X_test, y_train):
  data_tf_train = data_tf_train.map(encode_map_fn)


  data_tf = tf.data.Dataset.from_tensor_slices(
      (X_train[['encoding']].values, y_train.values)
  )

  # # divide the data into mini-batches of size 3
  # data_tf_batched = X_train.padded_batch(3, padded_shapes=([-1], []))

  # for batch in data_tf_batched:
  #     print('Batch dimension:', batch[0].shape)

  # for example in data_tf_batched.take(1):
  #     print(example[0].numpy())
  # return train_data, test_data
  return 0, 0


In [46]:
def main():
  # initialize paths to csv files
  train_path = 'train.csv'

  # ensure the state is repeatable
  random_state=16

  # define batch size
  batch_size = 32

  # get the dataframes
  X_train, X_test, y_train, y_test = get_data(train_path, random_state)

  # clean the training data
  X_train = clean_data(X_train)

  # get the unique words used
  token_counts = get_vocabulary(X_train)

  with pd.option_context('display.max_colwidth', None):
      display(token_counts)

  # X_train = get_encoding(train_tf, token_counts)
  # display(X_train.head())



  # train_data, test_data = get_batches(batch_size, X_train, X_test, y_train)


main()

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/kevin/nltk_data'
    - '/usr/local/opt/python@3.10/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/usr/local/opt/python@3.10/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/usr/local/opt/python@3.10/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
