# Stoneburner, Kurt
- ## DSC 550 - Week 09/10

In [1]:
import os
import sys
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)


### 2. Neural Network Classifier with Keras###
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [3]:
#https://oindrilasen.com/2021/02/how-to-install-and-import-keras-in-anaconda-jupyter-notebooks/

from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix
import tensorflow as tf
import time

In [4]:
df = pd.read_pickle("z_wk09_categorized_comments_processed.zip")

df['processed'] = df['lema_stem_tokens'].apply(lambda word_list: ' '.join(word_list)) 

#//*** Convert categorical string to categorical int
#//*** Only run once to prevent iPython issues
if (df.dtypes['cat'] == object):
    cat_dict = dict(tuple(enumerate(df['cat'].unique())))
    #//*** Build sexcat Categorical column
    df['intcat'] = df['cat'].copy()
    
    #//*** replace values using the sex_dict dictionary
    for key,value in cat_dict.items():
        df['intcat'] = df['intcat'].replace(value,key)

In [8]:
#//*** Reference Code: Applied Text Analysys with Python p282.
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

def build_network():
    """
    Create a function that returns a compiled neural network
    """
    nn = Sequential()
    nn.add(Dense(N_FEATURES, activation='relu', input_shape=(N_FEATURES,)))
    
    nn.add(Dense(N_FEATURES*SECOND_LAYER_SIZE, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    nn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return nn

score_df = pd.DataFrame()


#//*** Theoretically, this number should be return as the second value of the tfidf sparse matrix
N_FEATURES = 1000

#//*** X is Post Processed Data to evaluate
data_model_x = df['processed']

tfidf = TfidfVectorizer(max_features=N_FEATURES)

data_model_y = df['intcat']

test_size=.5
test_size=.98
#continuous scoring model
#scoring = 'r2_score'

#data_model_x = tfidf.fit_transform(data_model_x)

#//*** N_CLASSES the number of categories to solve for
N_CLASSES = len(df['intcat'].unique())

FEATURE_PERCENT = .25
SECOND_LAYER_SIZE = .25
epochs = 2

for test_size in [.9]:
    # split the data randomly into test/train sets
    x_train, x_test, y_train, y_test = train_test_split(data_model_x, data_model_y, test_size =test_size, random_state=0)


    #vectorizer = TfidfVectorizer(max_features=N_FEATURES)

    #//*** Initialize the Vectorizer, get all the features
    tfidf = TfidfVectorizer()

    print("First pass Vectorizing")
    total_features = tfidf.fit_transform(x_train).shape[1]


    for FEATURE_PERCENT in [.25,.5,.75,1]:
        # split the data randomly into test/train sets
        x_train, x_test, y_train, y_test = train_test_split(data_model_x, data_model_y, test_size =test_size, random_state=0)

        N_FEATURES = int(total_features * FEATURE_PERCENT)

        print(f"Re-Vectorizing: max_features={N_FEATURES} [{total_features}*{FEATURE_PERCENT}]")

        vectorizer = TfidfVectorizer(max_features=N_FEATURES)


        x_train = vectorizer.fit_transform(x_train)
        x_test = vectorizer.transform(x_test)

        x_train = convert_sparse_matrix_to_sparse_tensor(x_train)
        x_test = convert_sparse_matrix_to_sparse_tensor(x_test)

        for SECOND_LAYER_SIZE in [.25,.5,.75,1]:

            for epochs in [5]:

                pipeline = Pipeline([
                #  ('norm', TextNormalizer()),
                #  ('vect', TfidfVectorizer(max_features=N_FEATURES)),
                 ('nn', KerasClassifier(build_fn=build_network,
                 epochs=epochs,
                batch_size=128))
                 ])

                start_time = time.time()

                pipeline.fit(x_train,y_train)

                model_run_time = time.time() - start_time 
                #print(f"Model Run Time: {model_run_time}s")
                precision = 1
                accuracy = 1
                recall = 1

                print('Scoring...')
                start_score_time = time.time()
                y_predicted = pipeline.predict(x_test)
                print(f"Scoring Time {round(time.time()-start_score_time,2)}s")

                precision = precision_score(y_test, y_predicted, average='micro')
                accuracy = accuracy_score(y_test, y_predicted)
                recall = recall_score(y_test, y_predicted, average ='micro')
                cm = confusion_matrix(y_test, y_predicted)

                print("========")
                print(f"Precision: {precision}")
                print(f"Accuracy: {accuracy}")
                print(f"Recall: {recall}")
                print(f"cm: {cm}")

                if len(score_df) == 0:
                    score_df = pd.DataFrame()
                    score_df['acc_per_s'] = [accuracy/model_run_time]    
                    score_df['time'] = [round(model_run_time,0)]  
                    score_df['size'] = [int((1-test_size)*len(data_model_x))]
                    score_df['tf'] = [total_features]    
                    score_df['fp'] = [FEATURE_PERCENT]    
                    score_df['features'] = [N_FEATURES]    
                    score_df['sls'] = [SECOND_LAYER_SIZE]    
                    score_df['epochs'] = [epochs]    
                    score_df['accuracy'] = [accuracy]    
                    score_df['precision'] = [precision]    
                    score_df['recall'] = [recall]    

                else:
                    temp_df = pd.DataFrame()
                    temp_df['acc_per_s'] = [accuracy/model_run_time]    
                    temp_df['time'] = [round(model_run_time,0)]    
                    temp_df['size'] = [int((1-test_size)*len(data_model_x))]
                    temp_df['tf'] = [total_features]    
                    temp_df['fp'] = [FEATURE_PERCENT]    
                    temp_df['features'] = [N_FEATURES]    
                    temp_df['sls'] = [SECOND_LAYER_SIZE]    
                    temp_df['epochs'] = [epochs]    
                    temp_df['accuracy'] = [accuracy]    
                    temp_df['precision'] = [precision]    
                    temp_df['recall'] = [recall] 

                    score_df = pd.concat([score_df,temp_df])
                    print(temp_df)
    

print(score_df)

First pass Vectorizing
Re-Vectorizing: max_features=9659 [38639*0.25]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 97.3s
Precision: 0.822503520300399
Accuracy: 0.822503520300399
Recall: 0.822503520300399
cm: [[ 82434    447  48213]
 [  1531  10475  10631]
 [ 33115   2871 355691]]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 160.0s
Precision: 0.8274246068997888
Accuracy: 0.8274246068997888
Recall: 0.8274246068997888
cm: [[ 78637    363  52094]
 [  1261  10009  11367]
 [ 26786   2253 362638]]
   acc_per_s    time   size     tf    fp  features  sls  epochs  accuracy  \
0   0.000781  1059.0  60600  38639  0.25      9659  0.5       5  0.827425   

   precision    recall  
0   0.827425  0.827425  
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 225.99s
Precision: 0.8250997418446374
Accuracy: 0.8250997418446374
Recall: 0.8250997418446374
cm: [[ 81138    514  49442]
 [  1522  10512  10603]
 [ 30429   2882 35836

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 1273.62s
Precision: 0.8345330468199953
Accuracy: 0.8345330468199953
Recall: 0.8345330468199953
cm: [[ 73373    544  57177]
 [   917  10844  10876]
 [ 17198   3535 370944]]
   acc_per_s    time   size     tf    fp  features  sls  epochs  accuracy  \
0   0.000095  8791.0  60600  38639  0.75     28979  0.5       5  0.834533   

   precision    recall  
0   0.834533  0.834533  
Epoch 1/5


ResourceExhaustedError: in user code:

    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\engine\training.py:806 train_function  *
        return step_function(self, iterator)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\engine\training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\engine\training.py:789 run_step  **
        outputs = model.train_step(data)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\engine\training.py:756 train_step
        _minimize(self.distribute_strategy, tape, self.optimizer, loss,
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\engine\training.py:2743 _minimize
        optimizer.apply_gradients(
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:519 apply_gradients
        self._create_all_weights(var_list)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:704 _create_all_weights
        self._create_slots(var_list)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\optimizer_v2\adam.py:127 _create_slots
        self.add_slot(var, 'm')
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\optimizer_v2\optimizer_v2.py:760 add_slot
        weight = tf_variables.Variable(
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:262 __call__
        return cls._variable_v2_call(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:244 _variable_v2_call
        return previous_getter(
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2857 creator
        return next_creator(**kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2857 creator
        return next_creator(**kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\distribute\distribute_lib.py:2857 creator
        return next_creator(**kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:67 getter
        return captured_getter(captured_previous, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\eager\def_function.py:683 variable_capturing_scope
        v = UnliftedInitializerVariable(
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\variables.py:264 __call__
        return super(VariableMetaclass, cls).__call__(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\eager\def_function.py:226 __init__
        initial_value() if init_from_fn else initial_value,
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\keras\initializers\initializers_v2.py:137 __call__
        return super(Zeros, self).__call__(shape, dtype=_get_dtype(dtype))
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\init_ops_v2.py:132 __call__
        return array_ops.zeros(shape, dtype)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\array_ops.py:2747 wrapped
        tensor = fun(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\array_ops.py:2806 zeros
        output = fill(shape, constant(zero, dtype=dtype), name=name)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\array_ops.py:239 fill
        result = gen_array_ops.fill(dims, value, name=name)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\ops\gen_array_ops.py:3401 fill
        _ops.raise_from_not_ok_status(e, name)
    C:\Users\stonk013\Anaconda3\envs\keras_env\lib\site-packages\tensorflow\python\framework\ops.py:6843 raise_from_not_ok_status
        six.raise_from(core._status_to_exception(e.code, message), None)
    <string>:3 raise_from
        

    ResourceExhaustedError: OOM when allocating tensor with shape[28979,21734] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu [Op:Fill]


In [11]:
print(score_df)
pd.to_pickle(score_df,"z_wk09_keras_v1.zip")

   acc_per_s    time   size     tf    fp  features   sls  epochs  accuracy  \
0   0.000924   890.0  60600  38639  0.25      9659  0.25       5  0.822504   
0   0.000781  1059.0  60600  38639  0.25      9659  0.50       5  0.827425   
0   0.000680  1213.0  60600  38639  0.25      9659  0.75       5  0.825100   
0   0.000626  1321.0  60600  38639  0.25      9659  1.00       5  0.826585   
0   0.000245  3361.0  60600  38639  0.50     19319  0.25       5  0.824113   
0   0.000193  4244.0  60600  38639  0.50     19319  0.50       5  0.820864   
0   0.000182  4525.0  60600  38639  0.50     19319  0.75       5  0.822335   
0   0.000160  5182.0  60600  38639  0.50     19319  1.00       5  0.827615   
0   0.000106  7705.0  60600  38639  0.75     28979  0.25       5  0.813794   
0   0.000095  8791.0  60600  38639  0.75     28979  0.50       5  0.834533   

   precision    recall  
0   0.822504  0.822504  
0   0.827425  0.827425  
0   0.825100  0.825100  
0   0.826585  0.826585  
0   0.824113  0.

In [69]:
y_predicted = pipeline.predict(x_test)

0.799031130448839
[[ 68691    624  73491]
 [  1168   7849  15566]
 [ 25777   2727 397995]]


In [109]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix

#You'll get a warning but just ignore it

from sklearn.metrics import precision_recall_fscore_support

precision = precision_score(y_test, y_predicted, average='micro')
accuracy = accuracy_score(y_test, y_predicted)
recall = recall_score(y_test, y_predicted, average ='micro')
cm = confusion_matrix(y_test, y_predicted)


In [77]:
score = pipeline.score(x_test,y_test)



In [80]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_predicted,average='samples')

ValueError: Samplewise metrics are not available outside of multilabel classification.

In [61]:
pipeline.steps[-1][1].model.save(saveto['keras_model']) 
pipeline.steps.pop(-1)
joblib.dump(model, saveto['sklearn_pipe'])

NameError: name 'saveto' is not defined

In [55]:
y_predicted = pipeline.predict(x_test)

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix

#You'll get a warning but just ignore it

 


print(f1_score(y_test, y_predicted, average = 'micro'))
print(precision_score(y_test, y_predicted, average = 'micro'))
print(accuracy_score(y_test, y_predicted))
print(recall_score(y_test, y_predicted, average = 'micro'))
print(confusion_matrix(y_test, y_predicted))

0.8205883990948976
0.8205883990948976
0.8205883990948976
0.8205883990948976
[[ 64247    620  51632]
 [   985   8901  10133]
 [ 20564   3046 324679]]


In [38]:
from keras.layers import Dense
from keras.models import Sequential

 


N_FEATURES = 5000
N_CLASSES = 3
def build_network():
    """
    Create a function that returns a compiled neural network
    """
    nn = Sequential()
    nn.add(Dense(500, activation='relu', input_shape=(N_FEATURES,)))
    nn.add(Dense(150, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    nn.compile(
         loss='categorical_crossentropy',
         optimizer='adam',
         metrics=['accuracy']
    )
    return nn

 


def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

 


import tensorflow as tf

 

vectorizer = TfidfVectorizer(max_features=N_FEATURES)

 

X_train = vectorizer.fit_transform(df["processed"])
X_test = vectorizer.transform(test["processed_text"])
X_train = convert_sparse_matrix_to_sparse_tensor(X_train)
X_test = convert_sparse_matrix_to_sparse_tensor(X_test)

 

 

 

from sklearn.pipeline import Pipeline
# from transformer import TextNormalizer
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
#  ('norm', TextNormalizer()),
#  ('vect', TfidfVectorizer(max_features=N_FEATURES)),
 ('nn', KerasClassifier(build_fn=build_network,
 epochs=200,
batch_size=128))
 ])

 


def train_model(X, y, model, saveto=None, cv=12): 
    """
    Trains model from corpus at specified path and fits on full data.
    If a saveto dictionary is specified, writes Keras and Sklearn
    pipeline components to disk separately. Returns the scores.
    """

 

    model.fit(X, y)
    if saveto: 
        model.steps[-1][1].model.save(saveto['keras_model']) 
        model.steps.pop(-1)
        joblib.dump(model, saveto['sklearn_pipe'])
    return model

 

    y_train = train["cat"]
y_test = test["cat"]

 

 

TModel =train_model(X_train, y_train, model = pipeline)

 

#scores being returned are the "loss scores" 

 

y_predicted = TModel.predict(X_test)

 

#You'll get a warning but just ignore it

 


f1_score(ytest, y_predicted, average = 'micro')
precision_score(ytest, y_predicted, average = 'micro')
accuracy_score(ytest, y_predicted)
recall_score(ytest, y_predicted, average = 'micro')
confusion_matrix(ytest, y_predicted)

NameError: name 'test' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#//*** Initialize the Vectorizer
tfidf = TfidfVectorizer()


#//*** Build the feature matrix, which is a weighted sparse matrix
bwarg = tfidf.fit_transform(df['processed'][:10000])



In [None]:
print(bwarg.shape)

### 3. Classifying Images ###
In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.
