# Import data and libraries

In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import re
import random
import pickle

In [2]:
df = pd.read_csv("../data/interim/filtered_reddit_scrape.csv", low_memory=False)

In [3]:
df.head(5)

Unnamed: 0,author,score,created,subreddit,title,body,id,comment_author,comment_body,comment_score,comment_edited
0,presti-ravioli,4802,2019-08-12 20:50:20,depression,Depression for me is a constant feeling of wan...,It’s a constant sense of wanting to go somewhe...,cpidz8,RuneRaccoon,That's a good description of it. Good luck on ...,516.0,False
1,presti-ravioli,4802,2019-08-12 20:50:20,depression,Depression for me is a constant feeling of wan...,It’s a constant sense of wanting to go somewhe...,cpidz8,Kavlone,this is the first post i’ve clicked on from th...,497.0,False
2,presti-ravioli,4802,2019-08-12 20:50:20,depression,Depression for me is a constant feeling of wan...,It’s a constant sense of wanting to go somewhe...,cpidz8,bennynthejetsss,There’s a name for this phenomenon. It’s calle...,217.0,1565648011.0
3,presti-ravioli,4802,2019-08-12 20:50:20,depression,Depression for me is a constant feeling of wan...,It’s a constant sense of wanting to go somewhe...,cpidz8,AvoxGirl,Accurate. I saw a post recently that said “Dep...,129.0,False
4,presti-ravioli,4802,2019-08-12 20:50:20,depression,Depression for me is a constant feeling of wan...,It’s a constant sense of wanting to go somewhe...,cpidz8,thethirdman3,I’m the EXACT same way. “That place has gotta ...,49.0,False


# 1. Clean the text

In [4]:
def clean_text(text):
    '''Clean text by removing unnecessary characters and altering the format of words.'''
    text = str(text)
    text = text.lower()
    text = re.sub(r"’", "'", text)
    text = re.sub(r"\n", "", text)
    
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"there's", "there is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[^a-zA-Z?.!,]+", " ", text)
   # text = re.sub(r"([?.!,])", r" \1 ", text)
    
    return text

In [5]:
prompts = df['body'].apply(clean_text)
responses = df['comment_body'].apply(clean_text)

# 2. Tokenize

## 2.1 Initialize tokenizer

In [6]:
# Build tokenizer using tfds for both questions and answers
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    prompts + responses, target_vocab_size=2**13)

# Define start and end token to indicate the start and end of a sentence
START_TOKEN, END_TOKEN = [tokenizer.vocab_size], [tokenizer.vocab_size + 1]

# Vocabulary size plus start and end token
VOCAB_SIZE = tokenizer.vocab_size + 2

## 2.1 Demonstrate how tokenizer works

In [7]:
rand_ind = random.randint(0, len(prompts))
print(f"random_index: {rand_ind} \n\nprompt: {prompts[rand_ind]}\ntoken_prompt: {tokenizer.encode(prompts[rand_ind])}"
      f"\n\nresponse: {responses[rand_ind]}\ntoken_response: {tokenizer.encode(responses[rand_ind])}")

random_index: 6947 

prompt: i am a dreamer, a deep thinker. always picturing myself as a worry free, young and happy person enjoying it with someone i truly love. however, being stuck in my head reaching for unrealistic goals knowing l will always be anxious or suffering from bad mental health, and the crippling limits i put on myself will always prevent me from living the life i so desperately wish i could live. and i might just not be good enough for anyone, the fear of being alone forever, which really brings me down. so i continue to play scenes in my head of things i wish to happen, to keep me having hope that maybe just maybe, things will be okay.
token_prompt: [1, 17, 7, 2137, 436, 3, 7, 649, 662, 436, 2, 103, 7210, 79, 38, 7, 462, 1220, 3, 1051, 5, 169, 185, 2181, 8, 25, 106, 1, 550, 865, 2, 629, 3, 85, 791, 16, 11, 338, 2536, 18, 7943, 1713, 2624, 669, 364, 39, 103, 30, 340, 29, 777, 54, 206, 260, 1136, 3, 5, 6, 1887, 7024, 1, 233, 31, 79, 39, 103, 2229, 21, 54, 362, 6, 89, 1

In [8]:
print(f"Vocab size: {VOCAB_SIZE}")
print(f"Number of samples: {len(prompts)}")

Vocab size: 8228
Number of samples: 148630


## 2.2 Tokenize the data

In [9]:
# Maximum sentence length
MAX_LENGTH = 40


# Tokenize, filter and pad sentences
def tokenize_and_filter(inputs, outputs):
  tokenized_inputs, tokenized_outputs = [], []
  
  for (sentence1, sentence2) in zip(inputs, outputs):
    # tokenize sentence
    sentence1 = START_TOKEN + tokenizer.encode(sentence1) + END_TOKEN
    sentence2 = START_TOKEN + tokenizer.encode(sentence2) + END_TOKEN
    # check tokenized sentence max length
    if len(sentence1) <= MAX_LENGTH and len(sentence2) <= MAX_LENGTH:
      tokenized_inputs.append(sentence1)
      tokenized_outputs.append(sentence2)
  
  # pad tokenized sentences
  tokenized_inputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_inputs, maxlen=MAX_LENGTH, padding='post')
  tokenized_outputs = tf.keras.preprocessing.sequence.pad_sequences(
      tokenized_outputs, maxlen=MAX_LENGTH, padding='post')
  
  return tokenized_inputs, tokenized_outputs


tokenized_prompts, tokenized_responses = tokenize_and_filter(prompts, responses)

# 3. Save the Preprocessed Data

In [10]:
tokenized_prompts

array([[8226,  412,    3, ...,    0,    0,    0],
       [8226,  412,    3, ...,    0,    0,    0],
       [8226,  412,    3, ...,    0,    0,    0],
       ...,
       [8226,   49,  647, ...,    0,    0,    0],
       [8226,   23,  862, ...,    0,    0,    0],
       [8226,    1,   15, ...,    0,    0,    0]], dtype=int32)

In [11]:
tokenized_responses

array([[8226,    6,  457, ...,    0,    0,    0],
       [8226, 1858, 8002, ...,    0,    0,    0],
       [8226, 1381,    2, ...,    0,    0,    0],
       ...,
       [8226, 3720,    4, ...,    0,    0,    0],
       [8226,  425, 8016, ...,    0,    0,    0],
       [8226,    1,   19, ...,    0,    0,    0]], dtype=int32)

In [11]:
# save constant values
constants = {'START_TOKEN': START_TOKEN, 'END_TOKEN': END_TOKEN, 'VOCAB_SIZE': VOCAB_SIZE}
pickle.dump(constants, open("../data/processed/constants.p", "wb")) # pickle prompts

# save the tokenized prompts, responses, and tokenizer
pickle.dump(tokenized_prompts, open("../data/processed/tok_prompts.p", "wb"))
pickle.dump(tokenized_responses, open("../data/processed/tok_responses.p", "wb")) 
pickle.dump(tokenizer, open("../data/processed/tokenizer.p", "wb")) 
