### Jonathan Bunch

7 November 2021

Bellevue University

DSC550-T301

---

# Week 9-10 Exercises

### Part 1: Neural Network Classifier with Scikit

In [2]:
# Import libraries.
import pandas as pd
import unicodedata
import sys
import nltk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

I ran into issues with the text normalizer used in the textbook, so I used the following text normalizing function from
my week 2 solutions instead.

In [3]:
# Create a translation table that maps punctuation characters to None values.
punc_tbl = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P'))
# Load the list of stop words from the nltk library.
# nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# Create a Stemmer from the nltk PorterStemmer function.
porter = nltk.stem.porter.PorterStemmer()


def prep_text(t):
    """ This function will convert all text to lowercase letters, remove all punctuation, tokenize the text into words,
    remove all stop words, apply the PorterStemmer function to each word, and finally recombine the tokenized words."""
    # Strip any leading or following white spaces and convert all text to lowercase.
    text = t.strip().lower()
    # Apply the translation table to the text to remove punctuation characters.
    text = text.translate(punc_tbl)
    # Next, we will tokenize the text into individual words.
    tok_text = nltk.tokenize.word_tokenize(text)
    # Create a new list of tokenized words that are NOT stop words.
    tok_text_nsw = [word for word in tok_text if word not in stop_words]
    # Now we can apply the nltk PorterStemmer function to stem the tokenized words.
    tok_text_fin = [porter.stem(word) for word in tok_text_nsw]
    # Combine the cleaned and stemmed words back into a string.
    text_fin = " ".join(tok_text_fin)
    return text_fin

In [4]:
# Import the data.
comments = pd.read_json("categorized-comments.jsonl", lines=True)

# My computer is not very powerful, so I had to work with a smaller sample from the data set.
cdf1 = comments.sample(n=5000, random_state=5).reset_index(drop=True)
# Delete the original to free up memory.
del comments

# I will specify the data type for the category column and change the column names.
cdf1 = cdf1.astype({'cat': 'category'}).rename(columns={'cat': 'true_category', 'txt': 'comment'})

# Apply the text normalizer function that we wrote in week 2.
cdf1['normed_comment'] = cdf1.comment.apply(prep_text)

In [5]:
# Assign the features to variables for convenience.
X = cdf1.normed_comment
y = cdf1.true_category

# Create the pipeline that will apply the tfidf vectorizer and neural network classifier.
classifier = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('ann', MLPClassifier(random_state=5, activation='identity', hidden_layer_sizes=[50, 20], max_iter=500))
        ])

# Calculate the evaluation metrics for the classifier pipeline.
accuracy_scores = cross_val_score(classifier, X, y, cv=3, scoring='accuracy')
print("Accuracy Scores: ", accuracy_scores)
recall_scores = cross_val_score(classifier, X, y, cv=3, scoring='recall_micro')
print("Recall Scores: ", recall_scores)
precision_scores = cross_val_score(classifier, X, y, cv=3, scoring='precision_micro')
print("Precision Scores: ", precision_scores)
f1_scores = cross_val_score(classifier, X, y, cv=3, scoring='f1_micro')
print("F1 Scores: ", f1_scores)

Accuracy Scores:  [0.75344931 0.72945411 0.75990396]
Recall Scores:  [0.75344931 0.72945411 0.75990396]
Precision Scores:  [0.75344931 0.72945411 0.75990396]
F1 Scores:  [0.75344931 0.72945411 0.75990396]


### Part 2: Neural Network Classifier with Keras

In [7]:
import numpy as np
from keras.layers import Dense
from keras.models import Sequential

In [12]:
# For this step, I will apply the TfidfVectorizer seperately so I can see the number of features for the model input.
vectorizer = TfidfVectorizer()
X_vec = vectorizer.fit_transform(X)
X_vec = X_vec.toarray()

# Replace the target categories with dummy variables.
y_int = y.replace({'sports': 1, 'video_games': 2, 'science_and_technology': 3})

# Create the neural network model.
nn = Sequential()
nn.add(Dense(100, activation='relu', input_dim=9967))
nn.add(Dense(50, activation='relu'))
nn.add(Dense(1, activation='softmax'))
nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the model.
nn.fit(X_vec, y_int, epochs=10, batch_size=128)

# Calculate the accuracy.
accuracy = nn.evaluate(X_vec, y_int)[1]
print("Model Accuracy: ", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model Accuracy:  0.23080000281333923


### Part 3: Classifying Images

Unfortunately, I could not fit this model successfully.  The error I ran into seems to be related to the
keras/tensorflow backend expecting to run on a GPU or CPU that I do not have available or correctly configured.

In [13]:
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K

In [14]:
# The following code is taken directly from the book.
# References: Chris Albon, Machine Learning with Python Cookbook
K.set_image_data_format("channels_first")
np.random.seed(0)
channels = 1
height = 28
width = 28

(data_train, target_train), (data_test, target_test) = mnist.load_data()
data_train = data_train.reshape(data_train.shape[0], channels, height, width)
data_test = data_test.reshape(data_test.shape[0], channels, height, width)

features_train = data_train / 255
features_test = data_test / 255

target_train = np_utils.to_categorical(target_train)
target_test = np_utils.to_categorical(target_test)
number_of_classes = target_test.shape[1]

# Create the CNN model.
network = Sequential()
network.add(Conv2D(filters=64,
                   kernel_size=(5, 5),
                   input_shape=(channels, width, height),
                   activation='relu'))

network.add(MaxPooling2D(pool_size=(2, 2)))
network.add(Dropout(0.5))
network.add(Flatten())
network.add(Dense(128, activation="relu"))
network.add(Dropout(0.5))
network.add(Dense(number_of_classes, activation="softmax"))

network.compile(loss="categorical_crossentropy",
                optimizer="rmsprop",
                metrics=["accuracy"])

In [16]:
# Fit the model.
network.fit(features_train,
            target_train,
            epochs=2,
            verbose=0,
            batch_size=1000,
            validation_data=(features_test, target_test))

InvalidArgumentError:  Default MaxPoolingOp only supports NHWC on device type CPU
	 [[node sequential_5/max_pooling2d/MaxPool
 (defined at /Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/layers/pooling.py:357)
]] [Op:__inference_train_function_8540]

Errors may have originated from an input operation.
Input Source operations connected to node sequential_5/max_pooling2d/MaxPool:
In[0] sequential_5/conv2d/Relu (defined at /Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/backend.py:4867)

Operation defined at: (most recent call last)
>>>   File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
>>>     return _run_code(code, main_globals, None,
>>> 
>>>   File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
>>>     exec(code, run_globals)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel_launcher.py", line 16, in <module>
>>>     app.launch_new_instance()
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/traitlets/config/application.py", line 846, in launch_instance
>>>     app.start()
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 677, in start
>>>     self.io_loop.start()
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 199, in start
>>>     self.asyncio_loop.run_forever()
>>> 
>>>   File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 596, in run_forever
>>>     self._run_once()
>>> 
>>>   File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/base_events.py", line 1890, in _run_once
>>>     handle._run()
>>> 
>>>   File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/asyncio/events.py", line 80, in _run
>>>     self._context.run(self._callback, *self._args)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 457, in dispatch_queue
>>>     await self.process_one()
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 446, in process_one
>>>     await dispatch(*args)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 353, in dispatch_shell
>>>     await result
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 648, in execute_request
>>>     reply_content = await reply_content
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 353, in do_execute
>>>     res = shell.run_cell(code, store_history=store_history, silent=silent)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
>>>     return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2901, in run_cell
>>>     result = self._run_cell(
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 2947, in _run_cell
>>>     return runner(coro)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
>>>     coro.send(None)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3172, in run_cell_async
>>>     has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3364, in run_ast_nodes
>>>     if (await self.run_code(code, result,  async_=asy)):
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
>>>     exec(code_obj, self.user_global_ns, self.user_ns)
>>> 
>>>   File "/var/folders/h6/5pjr2l7j7vl5b83hw7spkt2r0000gn/T/ipykernel_22093/1464944213.py", line 1, in <module>
>>>     network.fit(features_train, # 特徴量
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/training.py", line 1216, in fit
>>>     tmp_logs = self.train_function(iterator)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/training.py", line 878, in train_function
>>>     return step_function(self, iterator)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/training.py", line 867, in step_function
>>>     outputs = model.distribute_strategy.run(run_step, args=(data,))
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/training.py", line 860, in run_step
>>>     outputs = model.train_step(data)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/training.py", line 808, in train_step
>>>     y_pred = self(x, training=True)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/sequential.py", line 373, in call
>>>     return super(Sequential, self).call(inputs, training=training, mask=mask)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/functional.py", line 451, in call
>>>     return self._run_internal_graph(
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/functional.py", line 589, in _run_internal_graph
>>>     outputs = node.layer(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1083, in __call__
>>>     outputs = call_fn(inputs, *args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
>>>     return fn(*args, **kwargs)
>>> 
>>>   File "/Users/jonathanbunch/PycharmProjects/dsc550/venv/lib/python3.9/site-packages/keras/layers/pooling.py", line 357, in call
>>>     outputs = self.pool_function(
>>> 