# CNN with Embeddings - Post Content

## Load Libraries

In [1]:
# standard
import pandas as pd
import numpy as np
import random
import os

# tf and keras
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import models
from keras import layers
from keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import LabelBinarizer

# plots
import seaborn as sns
import matplotlib.pyplot as plt

# for stop words
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# for standardizing text
import re
import string

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jaredfeldman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jaredfeldman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Initial Data Load and File Creation

In [2]:
# Read the training set from the local JSON file and preview its first two rows
training_data = pd.read_json(path_or_buf="train.json")
training_data.head(n=2)

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,...,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc
0,,0,1,0,t3_l25d7,0,Hi I am in need of food for my 4 children we a...,Hi I am in need of food for my 4 children we a...,Request Colorado Springs Help Us Please,0.0,...,False,[],0,1,0,1,,nickylvst,1317852607,1317849007
1,,2,5,0,t3_rcb83,0,I spent the last money I had on gas today. Im ...,I spent the last money I had on gas today. Im ...,"[Request] California, No cash and I could use ...",501.1111,...,False,"[AskReddit, Eve, IAmA, MontereyBay, RandomKind...",34,4258,116,11168,,fohacidal,1332652424,1332648824


In [3]:
# Keep only the post body and the outcome label, then confirm their dtypes
training_data_text = training_data.loc[:, ['request_text_edit_aware', 'requester_received_pizza']]
training_data_text.dtypes

request_text_edit_aware     object
requester_received_pizza      bool
dtype: object

In [4]:
# Display the two-column frame (post text and received-pizza label)
training_data_text

Unnamed: 0,request_text_edit_aware,requester_received_pizza
0,Hi I am in need of food for my 4 children we a...,False
1,I spent the last money I had on gas today. Im ...,False
2,My girlfriend decided it would be a good idea ...,False
3,"It's cold, I'n hungry, and to be completely ho...",False
4,hey guys:\n I love this sub. I think it's grea...,False
...,...,...
4035,Is anyone out there kind enough to help me out...,False
4036,If someone could hook me up with a $15 gift ca...,True
4037,"Have today off, soo I'll be stuck in the house...",False
4038,"I've never done anything like this before, but...",False


In [5]:
# 'requester_received_pizza' is a boolean column, so it can be used directly as a row mask
received_pizza = training_data_text['requester_received_pizza']
posts_pizza = training_data_text[received_pizza]
posts_no_pizza = training_data_text[~received_pizza]

# Report the size of each class and their combined total
for count in (len(posts_pizza), len(posts_no_pizza), len(posts_pizza) + len(posts_no_pizza)):
    print(count)


994
3046
4040


In [6]:
# Pull the post text of each class out into a plain Python list
posts_pizza_list = posts_pizza['request_text_edit_aware'].tolist()
posts_no_pizza_list = posts_no_pizza['request_text_edit_aware'].tolist()

# Report the length of each list and their combined total
for count in (len(posts_pizza_list), len(posts_no_pizza_list),
              len(posts_pizza_list) + len(posts_no_pizza_list)):
    print(count)

994
3046
4040


In [7]:
import io

# Set to True to (re)create one .txt file per post.
# Left off by default because the files were already generated once.
WRITE_POST_FILES = False


def write_posts_to_files(posts, out_dir):
    """Write every post to its own UTF-8 text file inside `out_dir`.

    Args:
        posts: iterable of strings; the i-th post is written to `file_<i>.txt`.
        out_dir: destination directory; created if it does not exist yet.
    """
    os.makedirs(out_dir, exist_ok=True)
    for index, post in enumerate(posts):
        file_path = os.path.join(out_dir, "file_" + str(index) + ".txt")
        with io.open(file_path, 'w', encoding = 'utf-8') as f:
            f.write(post)


if WRITE_POST_FILES:
    # One sub-directory per class: the dataset loader used below infers the
    # labels ('no_pizza' / 'pizza') from the sub-directory names of 'posts/all_posts'.
    write_posts_to_files(posts_no_pizza_list, os.path.join('posts', 'all_posts', 'no_pizza'))
    write_posts_to_files(posts_pizza_list, os.path.join('posts', 'all_posts', 'pizza'))

## Helper Functions

In [8]:
def custom_standardization(input_data):
    """Normalize raw text before tokenization.

    Applies three steps in order: lowercase the text, replace every literal
    '<br />' tag with a single space, and delete every character listed in
    `string.punctuation`. For example, "Hello, world!" becomes "hello world".

    Args:
        input_data: string tensor (or value convertible to one) holding the raw text.

    Returns:
        The standardized string tensor.
    """
    # Character class matching any single punctuation character; re.escape keeps
    # characters such as ']' and '\' literal inside the brackets.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    text = tf.strings.regex_replace(text, punctuation_pattern, '')
    return text

In [9]:
# Step-decay learning rate schedule: the rate is halved once every 5 epochs

def lr_schedule(epoch):
    """Return the learning rate to use for the given (0-based) epoch.

    The rate starts at 0.01 and is multiplied by 0.5 after every 5 completed
    epochs: epochs 0-4 use 0.01, epochs 5-9 use 0.005, epochs 10-14 use 0.0025, ...

    Args:
        epoch: integer index of the current epoch.

    Returns:
        The learning rate for that epoch as a float.
    """
    base_rate = 0.01      # learning rate for the first block of epochs
    epochs_per_step = 5   # number of epochs between two consecutive reductions
    step_factor = 0.5     # multiplier applied at each reduction

    # Floor division counts how many full blocks of epochs have already elapsed
    completed_steps = epoch // epochs_per_step
    return base_rate * step_factor ** completed_steps

# Wrap 'lr_schedule' in a Keras callback; when passed to fit(), the callback
# queries the function for the learning rate to apply in each epoch.
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_schedule)

## Model 1: All post content

In [10]:
seed = 123       # fixed seed for reproducibility of the shuffle / split
batch_size = 32  # number of posts per batch

# Training subset: batches of raw post text with labels inferred from the
# sub-directories of 'posts/all_posts'. 10% of the files are held out for validation.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    directory='posts/all_posts',
    subset='training',
    validation_split=0.1,
    batch_size=batch_size,
    seed=seed,
)

Found 4040 files belonging to 2 classes.
Using 3636 files for training.


In [11]:
# Show the first three posts of one training batch together with their labels

for text_batch, label_batch in raw_train_ds.take(1):
    # Convert both tensors to NumPy once instead of on every loop iteration
    posts_np = text_batch.numpy()
    labels_np = label_batch.numpy()
    for i in range(3):
        print("Post", posts_np[i])
        print("Label", labels_np[i])

Post b"Broke ass-college student here; got paid 10 days ago and all of my paycheck went to rent and bills. I've got no groceries in the house and all my roommates are back home for the summer. No gas in the car and the bike's out of commission :(. If any kind redditor could hook me up I'd be able to eat for two days :D\n\ngoogle maps link to domino's in my area\nhttp://maps.google.com/maps/place?cid=10239146004443176581&amp;q=domino's+pizza,+chico,+ca&amp;hl=en&amp;sll=39.722827,-121.848776&amp;sspn=0.012468,0.024856&amp;ie=UTF8&amp;ll=39.733066,-121.864429&amp;spn=0,0&amp;z=15\n\n\nEDIT: oh, and of course: I'll be pizzaing it forward when my paycheck comes next month. this is a ridiculously cool subreddit."
Label 1
Post b"21/m/sfbay multi year lurker first time poster!\nme! -&gt; http://imgur.com/VXtY6\n\nI'm just really hungry and I've got no money right now. Been unemployed for 6 months but just got a real great job offer at a consulting firm in berkeley. so i'd like to do something

In [12]:
# Show which class name each integer label (0 or 1) stands for

class_names = raw_train_ds.class_names
print("Label 0 corresponds to", class_names[0])
print("Label 1 corresponds to", class_names[1])

Label 0 corresponds to no_pizza
Label 1 corresponds to pizza


In [13]:
# Validation subset of the same directory. Labels are inferred from the
# sub-directories of 'posts/all_posts'. The split fraction and seed are the
# same as for the training subset, so that the two subsets do not overlap.

raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    directory='posts/all_posts',  # directory where the .txt files are stored
    subset='validation',          # this time, select the held-out 10%
    validation_split=0.1,
    batch_size=batch_size,
    seed=seed,
)

Found 4040 files belonging to 2 classes.
Using 404 files for validation.


In [14]:
# Upper bound on the vocabulary size kept by the vectorization layer
max_features = 15000

# Every post is padded or truncated to this many tokens
sequence_length = 300

# Layer that standardizes, tokenizes and maps each post to a fixed-length
# sequence of integer token ids
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,  # preprocessing function defined earlier
    output_sequence_length=sequence_length,
    output_mode='int',                   # one integer id per token
)

In [15]:
def vectorize_text(text, label):
    """Turn raw post text into integer token sequences, passing the label through.

    Args:
        text: string tensor with the raw post text.
        label: label belonging to `text`; returned unchanged.

    Returns:
        A tuple `(vectorized_text, label)`, where `vectorized_text` is the output
        of `vectorize_layer` (which also applies `custom_standardization`).
    """
    # Append a trailing axis so each post becomes its own one-element row,
    # e.g. [post1, post2, post3] -> [[post1], [post2], [post3]]
    batched_text = tf.expand_dims(text, axis=-1)
    return vectorize_layer(batched_text), label

In [16]:
# Drop the labels so that 'train_text' yields only the post text of each batch
train_text = raw_train_ds.map(lambda post, _label: post)

# Build the vocabulary of 'vectorize_layer' from the training text only
vectorize_layer.adapt(train_text)

In [17]:
# Fetch one batch from 'raw_train_ds' and look at its first example
post_batch, label_batch = next(iter(raw_train_ds))
first_post = post_batch[0]
first_label = label_batch[0]

# Raw text of the post
print("Post--->", first_post)
print("-" * 100)

# Human-readable class name of the post's label
print("Label--->", raw_train_ds.class_names[first_label])
print("-" * 100)

# The same post after vectorization, paired with its label
print("Vectorized post--->", vectorize_text(first_post, first_label))

Post---> tf.Tensor(b"I'm helping my friend out who got kicked out of his house, letting him crash here and offering my time and whatever else he needs. As a result of helping him drink his way out of this (not healthy, I know, but we all deal in different ways, and he IS looking for a new place) we're broke and I'm out of ideas to cheer him up. Then I saw this place. It would be a perfect story, and cheer him up immensely, if some random stranger on the internet helped us out. ", shape=(), dtype=string)
----------------------------------------------------------------------------------------------------
Label---> pizza
----------------------------------------------------------------------------------------------------
Vectorized post---> (<tf.Tensor: shape=(1, 300), dtype=int64, numpy=
array([[  17,  430,    7,  191,   24,  121,   77,  949,   24,    8,  140,
         139, 1326,  151, 2520,   62,    4, 1376,    7,   69,    4,  401,
         245,   84,  624,   38,    5, 1852,    8,  430, 

In [18]:
# Fetch the learned vocabulary once and reuse it for all three prints
vocabulary = vectorize_layer.get_vocabulary()

# Size of the entire vocabulary
print('Vocabulary size: {}'.format(len(vocabulary)))

# Two sample tokens, at index 958 and index 135
print("958 ---> ", vocabulary[958])
print("135 ---> ", vocabulary[135])

Vocabulary size: 12570
958 --->  training
135 --->  them


In [19]:
# Apply 'vectorize_text' to both raw datasets so that 'train_ds' and 'val_ds'
# yield batches of integer token sequences together with their labels.
train_ds, val_ds = (
    raw_ds.map(vectorize_text) for raw_ds in (raw_train_ds, raw_val_ds)
)

In [20]:
# Let tf.data choose the prefetch buffer size automatically
AUTOTUNE = tf.data.AUTOTUNE

# cache():    keep the vectorized batches after the first pass so later epochs
#             do not have to re-read and re-vectorize the files.
# prefetch(): prepare upcoming batches while the current one is being consumed.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [21]:
def build_model(embedding_dim=4, conv_filters=16, kernel_size=12, strides=3,
                dense_units=16, learning_rate=0.001):
    """Build and compile the Embedding + 1D-CNN binary classifier.

    Architecture: Embedding -> Conv1D (ReLU) -> GlobalAveragePooling1D ->
    Dense (ReLU) -> Dense(1, sigmoid). The vocabulary size and input length are
    read from the notebook-level `max_features` and `sequence_length`.
    Calling `build_model()` with no arguments uses the original hyperparameters.

    Args:
        embedding_dim: size of the embedding vector learned for each token.
        conv_filters: number of filters (output channels) of the Conv1D layer.
        kernel_size: width of the convolution window, in tokens.
        strides: stride of the convolution along the sequence axis.
        dense_units: number of units in the hidden Dense layer.
        learning_rate: learning rate passed to the Adam optimizer
            (0.001 is the optimizer's default).

    Returns:
        The compiled `tf.keras.Sequential` model.
    """
    # Clear previous session and set a random seed for reproducibility
    tf.keras.backend.clear_session()
    tf.random.set_seed(0)

    model = tf.keras.Sequential()

    # Embedding layer: maps each integer token id to a dense vector
    model.add(tf.keras.layers.Embedding(
        input_dim=max_features,        # vocabulary size, defined earlier
        output_dim=embedding_dim,      # embedding dimension
        input_length=sequence_length   # number of tokens per post, defined earlier
    ))

    # 1D convolution to capture local patterns in the token sequence.
    # With padding='same' the output length is ceil(input_length / strides),
    # e.g. 300 tokens with strides=3 give 100 steps.
    model.add(tf.keras.layers.Conv1D(
        filters=conv_filters,
        strides=strides,
        padding='same',
        kernel_size=kernel_size,
        activation='relu'
    ))

    # Average over the sequence axis so each post is represented by one vector.
    # (Alternative: tf.keras.layers.Flatten() would instead concatenate the
    # representations of all positions in the post.)
    model.add(tf.keras.layers.GlobalAveragePooling1D())

    # Hidden fully-connected layer
    model.add(tf.keras.layers.Dense(
        units=dense_units,
        activation='relu'))

    # Output layer: a single sigmoid unit for binary classification
    model.add(tf.keras.layers.Dense(
        units=1,
        activation='sigmoid'))

    # Binary cross-entropy loss with the Adam optimizer; a LearningRateScheduler
    # callback passed to fit() can still override the rate per epoch.
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=learning_rate),
                  metrics=['accuracy'])

    return model

In [22]:
# Create the model using the build_model function
model_all_posts = build_model()

# Display the model layers.
display(model_all_posts.layers)

# Print a summary of the model architecture.
# summary() prints the table itself and returns None, so it is not wrapped in display().
model_all_posts.summary()

# Retrieve the embedding matrix.
# The Embedding layer is the FIRST layer of the model (layers[0]); layers[-1] is
# the final Dense output layer. This runs before training, so these are the
# initial (untrained) embedding values.
embeddings = model_all_posts.layers[0].get_weights()[0]

[<keras.src.layers.core.embedding.Embedding at 0x282db6a30>,
 <keras.src.layers.convolutional.conv1d.Conv1D at 0x28808ba90>,
 <keras.src.layers.pooling.global_average_pooling1d.GlobalAveragePooling1D at 0x28815fb20>,
 <keras.src.layers.core.dense.Dense at 0x288a23640>,
 <keras.src.layers.core.dense.Dense at 0x288a237c0>]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 4)            60000     
                                                                 
 conv1d (Conv1D)             (None, 100, 16)           784       
                                                                 
 global_average_pooling1d (  (None, 16)                0         
 GlobalAveragePooling1D)                                         
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 61073 (238.57 KB)
Trainable params: 61073 (238.57 KB)
Non-trainable params: 0 (0.00 Byte)
__________________

None

In [23]:
# Train for 15 epochs, evaluating on 'val_ds' after each epoch.
# 'train_ds' is a tf.data.Dataset that is already batched (batch_size = 32 when
# it was created), so no 'batch_size' is passed to fit(). 'validation_split' is
# omitted as well, because the validation data is supplied explicitly through
# 'validation_data'.
# To use the learning rate schedule instead of the default rate of 0.001,
# add `callbacks=[lr_scheduler]` to the call below.
history = model_all_posts.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


## Model 2: 700 Samples of Each Class