# Stoneburner, Kurt
- ## DSC 550 - Week 09/10

In [100]:
import os
import sys
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### 1. Neural Network Classifier with Scikit ### 

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.


In [2]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** These cleanup functions are based on Week 02 cleanup code, and rebuilt for Week 04

#//*****************************************
#//*** Functions:
#//*****************************************
#//*** Mr_clean_text: Converts to lowercase, removes punctuation, newlines and html markup
#//****************************************************************************************************
#//*** Tokenize_series: Converts a Series containing strings, to a series containing tokenized lists
#//****************************************************************************************************
#//*** Remove_stop_words: Removes Stop words based on nltk stopwords 'english' dictionary
#//****************************************************************************************************
#//*** Apply_stemmer: Stem tokenized words using nltk.stem.porter.PorterStemme
#//****************************************************************************************************
#//*** apply_pos_tag: Builds Part of Speech Tagging from tokeninzed text
#//****************************************************************************************************

#//****************************************************************************************************

#//****************************************************************************************************
#//*** Key values will default to true. If code needs to be defaulted to False, a default_false list can be added later
#//*** All Boolean kwarg keya are stored in kwarg list. This speeds up the coding of the action_dict.
#//*** As Kwargs are added 
def mr_clean_text(input_series, input_options={}):
    
    def clean_text(input_string):
        clean1 = re.sub(r'['+string.punctuation + '’—”'+']', "", input_string.lower())
        return re.sub(r'\W+', ' ', clean1)

    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** Start Timing the process
    start_time = time.time()

    
    #//*** Add some data validation. I'm preparing this function for additional use. I'm checking if future users (ie future me)
    #//*** may throw some garbage at this function. Experience has taught me to fail safely wherever possible.

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "punctuation" ]
    
    default_false = ["remove_empty"]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to TRUE
    for key in key_list:
        action_dict[key] = True
    
    for key in default_false:
        action_dict[key] = False
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    
    #//*************************************************************************
    #//*** The Cleanup/Processing code is a straight lift from DSC550 - Week02
    #//*************************************************************************
    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()
    
   
    #//*** Remove New Lines
    if action_dict["newline"]:
        #//*** Rmove \r\n
        input_series = input_series.str.replace('\r?\n',"")

        #//*** Remove \n new lines
        input_series = input_series.str.replace('\n',"")
    
    
    input_series = input_series.str.replace("\\(http.+\\)","")
    
    #//*** Print Elements between brackets
    #print(input_series[ input_series == input_series.str.match('[.*]')])

     
    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** Let's use regex to remove html entities
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&.*;',"")

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:
        #//*** Load libraries for punctuation if not already loaded.
        #//*** Wrapping these in a try, no sense in importing libraries that already exist.
        #//*** Unsure of the cost of reimporting libraries (if any). But testing if library is already loaded feels
        #//*** like a good practice

        #input_series = input_series.apply(lambda x: clean_text(x))

        try:
            type(sys)
        except:
            import sys

        try:
            type(unicodedata)
        except:
            import unicodedata

        #//*** replace Comma and Period with a space.
        for punct in [",","."]:
            input_series = input_series.str.replace(punct," ")

        #//*** Remove punctuation using the example from the book
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

        #table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
        #print(table )
        #input_series = input_series.str.translate(table)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series

                                          
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into it's own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standarization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series,slices=20,input_options={}):
    
    try:
        type(nltk)
    except:
        import nltk
    
    word_tokenize = nltk.tokenize.word_tokenize 
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
        
    #//*** All kwargs are listed here. These initialize False by default.
    key_list = [ "fast", "quiet" ]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to False
    for key in key_list:
        action_dict[key] = False
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
            
    #input_series = input_series.apply(word_tokenize)
    
    if action_dict['fast'] == False:
        print("Processing Tokens with NLTK Word Tokenize")
        input_series = apply_with_progress(input_series,word_tokenize,slices)
    else:
        print("Process Tokens with Split()")
        input_series = apply_with_progress(input_series,lambda x: x.split(),slices)
    
    
    
    print(f"Tokenize Time: {time.time() - start_time}")
    
    return input_series

#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    
    #//*** This function removes stop_words from a series.
    #//*** Works with series.apply()
    def apply_stop_words(input_list):

        #//*** Load Stopwords   
        for word in input_list:
            if word in stop_words:
                input_list.remove(word)
                #print(f"Removing: {word}")
        return input_list

    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk
        
    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download
    try:
        type(stopwords)
    except:
        nltk.download('stopwords')


    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
    #//*** Manually adding word the
    stop_words = []
    
    #//*** Remove apostrophies from the stop_words
    for stop in stopwords.words('english'):
        stop_words.append(stop.replace("'",""))

    #print("Stop Words: ")
    print(stop_words)
    print ("Processing Stop Words")
    input_series = apply_with_progress(input_series, apply_stop_words)
    
    print(f"Stop Words Time: {time.time() - start_time}")
    
    return input_series

def apply_stemmer(input_series,trim_single_words = True,slices=100):
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk

    #//*** Instantiate the Stemmer
    porter = nltk.stem.porter.PorterStemmer()
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** 1.) Apply() an action to each row
    #//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
    #//*** 3.) The base [ word for word in wordlist] would return each word in word_list as a list. 
    #//*** 4.) [porter.stem(word) for word in word_list] - performs stemming on each word and returns a list
    #input_series = input_series.apply(lambda word_list: [porter.stem(word) for word in word_list] )
    print("Begin: Apply Stemmer")
    input_series = apply_with_progress(input_series, lambda word_list: [porter.stem(word) for word in word_list],slices)
    
    #//*** Remove Single letter words after stemming
    
    """
    if trim_single_words:
        for word_list in input_series:
            for word in word_list:
                if len(word) < 2:
                    word_list.remove(word)
    """
    
    print(f"Apply Stemmer Time: {time.time() - start_time}")
    return input_series

def apply_pos_tag(input_series,slices=100):
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk

    from nltk import pos_tag

    #//pos_tag requires an additional download
    try:
        pos_tag(["the","quick","brown","fox"])
    except: 
        nltk.download('averaged_perceptron_tagger')
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    print("Begin Part of Speech tagging")
    
    input_series = apply_with_progress(input_series,pos_tag,slices)
    
    print(f"Part of Speech Tagging Time: {round(time.time() - start_time,2)}s")
    
    return input_series
    
def apply_lemmatization(input_series,slices=20):
            
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk


    from nltk.stem import WordNetLemmatizer
    
    from nltk.corpus import wordnet    
    
    #nltk.download('wordnet')
    
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    # Initialize the Lemmatizer instance
    lemmatizer = WordNetLemmatizer() 
    
    #//*** 1.) Apply() an action to each row
    #//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
    #//*** 3.) The base [ word for word in wordlist] would return each word in word_list as a list. 
    #//*** 4.) [lemmatizer.lemmatize(word) for word in word_list] - performs lemmtization on each word and returns a list
    #lemmatized = input_series.apply(lambda word_list: [lemmatizer.lemmatize(*word) for word in word_list] )
    
    print("Begin Lemmatization...")
    
    input_series = apply_with_progress(input_series,lambda word_list: [lemmatizer.lemmatize(word) for word in word_list],20)
    
    print(f"Lemmatization Time: {time.time() - start_time}")
    
    #if detoken:
    #    return tokenize_series(input_series,5,{"fast":True})

    return input_series

#//*** Apply a function to a Series and display processing progress
#//*** Slices is the total number of intervals to report progress.
#//*** Slices = 20 displays processing after every 5% is processed
#//*** Slices = 100 displays processing after every 1% is processed
def apply_with_progress(input_series,input_function,slices=20):
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Get the time at the start of the loop, used for elapsed time.
    start_time = time.time()
    
    #//*** The interval is the number of elements to process in each Loop. The default is 20.
    #//*** Which displays results at 5% intervals.
    interval = int(len(input_series)/slices)
    
    #//*** Total number of items to process
    total = len(input_series)
    

    #//*** Loop through slice times and display processing statistics for each slice.
    for x in range(0, slices ):
        #//*** Get time at the start of the slice.
        loop_time = time.time()
        
        #//*** Set the start index
        begin_dex = interval*x
        
        #//*** Set the end index
        end_dex = interval*x+interval-1
        
        #//*** Apply the input function to a slice of the input_series
        #//*** This part does all the actual 'work'
        input_series[begin_dex:end_dex] = input_series[begin_dex:end_dex].apply(input_function)
        
        #//*** Get the time after the slice of work is done
        now = time.time()
        
        #//*** Compute the estimated remaining time
        #//*** Total elapsed time / % of completed slices = Estimated total time
        #//*** Estimated total time - elaped time = Remaining time
        est_remain = round( ( ( now - start_time ) /  ( (x+1)/slices ) - (now-start_time)),2)

        #//*** Display Results so we know how much time is left (so we can effectively multi-task: ie comments, research and Valheim)
        print(f"Processed {x}/{slices}: {begin_dex}:{end_dex} [{total}] in {round(now-loop_time,2)}s elapsed: {round(now-start_time,2)}s est Remain: {est_remain}s")
    
    #//*** END For Slice Loop
    
    #//*** Process the remaining values (Since interval is an int there should be a remainder)
    loop_time = time.time()
    begin_dex = end_dex+1
    if begin_dex < len(input_series):
        print(f"Processing Remaining values: {begin_dex} : {total} ")
        #print(input_series[begin_dex:])
        input_series[begin_dex:] = input_series[begin_dex:].apply(input_function)
    
    #//*** Display Final output
    print(f"Processed {slices}/{slices}: {begin_dex}:{end_dex} [{total}] in {round(time.time()-loop_time,2)}s elapsed: {round(time.time()-start_time,2)}s")
    
    #//*** return Series
    return input_series


### 2. Neural Network Classifier with Keras###
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

In [3]:
#https://oindrilasen.com/2021/02/how-to-install-and-import-keras-in-anaconda-jupyter-notebooks/

from keras.layers import Dense
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier


from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix
import tensorflow as tf
import time

In [4]:
df = pd.read_pickle("z_wk09_categorized_comments_processed.zip")

df['processed'] = df['lema_stem_tokens'].apply(lambda word_list: ' '.join(word_list)) 

#//*** Convert categorical string to categorical int
#//*** Only run once to prevent iPython issues
if (df.dtypes['cat'] == object):
    cat_dict = dict(tuple(enumerate(df['cat'].unique())))
    #//*** Build sexcat Categorical column
    df['intcat'] = df['cat'].copy()
    
    #//*** replace values using the sex_dict dictionary
    for key,value in cat_dict.items():
        df['intcat'] = df['intcat'].replace(value,key)

In [None]:
#//*** Reference Code: Applied Text Analysys with Python p282.
metric_dict = {}

metrics = ['total_features',]

def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

def build_network():
    """
    Create a function that returns a compiled neural network
    """
    nn = Sequential()
    nn.add(Dense(N_FEATURES, activation='relu', input_shape=(N_FEATURES,)))
    
    nn.add(Dense(N_FEATURES*SECOND_LAYER_SIZE, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    nn.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    return nn

score_df = pd.DataFrame()


#//*** Theoretically, this number should be return as the second value of the tfidf sparse matrix
N_FEATURES = 1000

#//*** X is Post Processed Data to evaluate
data_model_x = df['processed']

tfidf = TfidfVectorizer(max_features=N_FEATURES)

data_model_y = df['intcat']

test_size=.5
test_size=.98
#continuous scoring model
#scoring = 'r2_score'

#data_model_x = tfidf.fit_transform(data_model_x)

#//*** N_CLASSES the number of categories to solve for
N_CLASSES = len(df['intcat'].unique())

FEATURE_PERCENT = .25
SECOND_LAYER_SIZE = .25
epochs = 2

for test_size in [.995,.98,.95,.9,.85,.8]:
    # split the data randomly into test/train sets
    x_train, x_test, y_train, y_test = train_test_split(data_model_x, data_model_y, test_size =test_size, random_state=0)


    #vectorizer = TfidfVectorizer(max_features=N_FEATURES)

    #//*** Initialize the Vectorizer, get all the features
    tfidf = TfidfVectorizer()

    print("First pass Vectorizing")
    total_features = tfidf.fit_transform(x_train).shape[1]


    for FEATURE_PERCENT in [.25,.5,.75,1]:
        # split the data randomly into test/train sets
        x_train, x_test, y_train, y_test = train_test_split(data_model_x, data_model_y, test_size =test_size, random_state=0)

        N_FEATURES = int(total_features * FEATURE_PERCENT)

        print(f"Re-Vectorizing: max_features={N_FEATURES} [{total_features}*{FEATURE_PERCENT}]")

        vectorizer = TfidfVectorizer(max_features=N_FEATURES)


        x_train = vectorizer.fit_transform(x_train)
        x_test = vectorizer.transform(x_test)

        x_train = convert_sparse_matrix_to_sparse_tensor(x_train)
        x_test = convert_sparse_matrix_to_sparse_tensor(x_test)

        for SECOND_LAYER_SIZE in [.25,.5,.75,1]:

            for epochs in [5,10,20]:

                pipeline = Pipeline([
                #  ('norm', TextNormalizer()),
                #  ('vect', TfidfVectorizer(max_features=N_FEATURES)),
                 ('nn', KerasClassifier(build_fn=build_network,
                 epochs=epochs,
                batch_size=128))
                 ])

                start_time = time.time()

                pipeline.fit(x_train,y_train)

                model_run_time = time.time() - start_time 
                #print(f"Model Run Time: {model_run_time}s")
                precision = 1
                accuracy = 1
                recall = 1

                print('Scoring...')
                start_score_time = time.time()
                y_predicted = pipeline.predict(x_test)
                print(f"Scoring Time {round(time.time()-start_score_time,2)}s")

                precision = precision_score(y_test, y_predicted, average='micro')
                accuracy = accuracy_score(y_test, y_predicted)
                recall = recall_score(y_test, y_predicted, average ='micro')
                cm = confusion_matrix(y_test, y_predicted)

                print("========")
                print(f"Precision: {precision}")
                print(f"Accuracy: {accuracy}")
                print(f"Recall: {recall}")
                print(f"cm: {cm}")

                if len(score_df) == 0:
                    score_df = pd.DataFrame()
                    score_df['acc_per_s'] = [accuracy/model_run_time]    
                    score_df['time'] = [round(model_run_time,0)]  
                    score_df['size'] = [int((1-test_size)*len(data_model_x))]
                    score_df['tf'] = [total_features]    
                    score_df['fp'] = [FEATURE_PERCENT]    
                    score_df['features'] = [N_FEATURES]    
                    score_df['sls'] = [SECOND_LAYER_SIZE]    
                    score_df['epochs'] = [epochs]    
                    score_df['accuracy'] = [accuracy]    
                    score_df['precision'] = [precision]    
                    score_df['recall'] = [recall]    

                else:
                    temp_df = pd.DataFrame()
                    temp_df['acc_per_s'] = [accuracy/model_run_time]    
                    temp_df['time'] = [round(model_run_time,0)]    
                    temp_df['size'] = [int((1-test_size)*len(data_model_x))]
                    temp_df['tf'] = [total_features]    
                    temp_df['fp'] = [FEATURE_PERCENT]    
                    temp_df['features'] = [N_FEATURES]    
                    temp_df['sls'] = [SECOND_LAYER_SIZE]    
                    temp_df['epochs'] = [epochs]    
                    temp_df['accuracy'] = [accuracy]    
                    temp_df['precision'] = [precision]    
                    temp_df['recall'] = [recall] 

                    score_df = pd.concat([score_df,temp_df])
                print(temp_df)
    

print(score_df)

First pass Vectorizing
Re-Vectorizing: max_features=1802 [7210*0.25]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 15.1s
Precision: 0.7485812086013088
Accuracy: 0.7485812086013088
Recall: 0.7485812086013088
cm: [[ 48861      1  96100]
 [  2009     36  22925]
 [ 30562      3 402481]]
   acc_per_s  time  size    tf   fp  features  sls  epochs  accuracy  \
0   0.062956  12.0  3030  7210  0.5      3605    1       5  0.758953   

   precision    recall  
0   0.758953  0.758953  
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Scoring...
Scoring Time 15.53s
Precision: 0.7447867086361359
Accuracy: 0.7447867086361359
Recall: 0.7447867086361359
cm: [[ 57113    465  87384]
 [  2417   4302  18251]
 [ 43126   2245 387675]]
   acc_per_s  time  size    tf    fp  features   sls  epochs  accuracy  \
0   0.164637   5.0  3030  7210  0.25      1802  0.25      10  0.744787   

   precision    recall  
0   0.744787  

Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 19.04s
Precision: 0.7449409431189861
Accuracy: 0.7449409431189861
Recall: 0.7449409431189861
cm: [[ 55901    682  88379]
 [  2366   4712  17892]
 [ 41442   3034 388570]]
   acc_per_s  time  size    tf    fp  features  sls  epochs  accuracy  \
0   0.062739  12.0  3030  7210  0.25      1802  0.5      20  0.744941   

   precision    recall  
0   0.744941  0.744941  
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 26.41s
Precision: 0.7511335405271834
Accuracy: 0.7511335405271834
Recall: 0.7511335405271834
cm: [[ 45102    103  99757]
 [  1621   2209  21140]
 [ 26812    628 405606]]
   acc_per_s  time  size    tf    fp  features   sls  epochs  accuracy  \
0   0.223706   3.0  3030  7210  0.25      1802  0.75       5  0.751134   

   precision    recall  
0   0.751134  0.751134  
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoc

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 32.11s
Precision: 0.7455213954737984
Accuracy: 0.7455213954737984
Recall: 0.7455213954737984
cm: [[ 55625    625  88712]
 [  2338   4703  17929]
 [ 40839   3002 389205]]
   acc_per_s  time  size    tf    fp  features  sls  epochs  accuracy  \
0   0.051665  14.0  3030  7210  0.25      1802    1      20  0.745521   

   precision    recall  
0   0.745521  0.745521  
Re-Vectorizing: max_features=3605 [7210*0.5]
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 34.94s
Precision: 0.7483556614005817
Accuracy: 0.7483556614005817
Recall: 0.7483556614005817
cm: [[ 70218     60  74684]
 [  3476    552  20942]
 [ 52383    191 380472]]
   acc_per_s  time  size    tf   fp  features   sls  epochs  accuracy  \
0   0.103228   7.0  3030  7

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Scoring...
Scoring Time 57.38s
Precision: 0.7603146383450142
Accuracy: 0.7603146383450142
Recall: 0.7603146383450142
cm: [[ 54329    271  90362]
 [  1723   3342  19905]
 [ 30879   1385 400782]]
   acc_per_s  time  size    tf   fp  features  sls  epochs  accuracy  \
0   0.042556  18.0  3030  7210  0.5      3605  0.5      10  0.760315   

   precision    recall  
0   0.760315  0.760315  
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 56.86s
Precision: 0.7456673377801512
Accuracy: 0.7456673377801512
Recall: 0.7456673377801512
cm: [[ 72660    756  71546]
 [  3238   5193  16539]
 [ 57619   3659 371768]]
   acc_per_s  time  size    tf   fp  features  sls  epochs  accuracy  \
0   0.021616  34.0  3030  7210  0.5 

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 94.46s
Precision: 0.7622599829512852
Accuracy: 0.7622599829512852
Recall: 0.7622599829512852
cm: [[ 52544    287  92131]
 [  1471   3693  19806]
 [ 28186   1471 403389]]
   acc_per_s  time  size    tf   fp  features  sls  epochs  accuracy  \
0   0.062013  12.0  3030  7210  0.5      3605    1       5   0.76226   

   precision   recall  
0    0.76226  0.76226  
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Scoring...
Scoring Time 94.74s
Precision: 0.7614357406074517
Accuracy: 0.7614357406074517
Recall: 0.7614357406074517
cm: [[ 50145    308  94509]
 [  1420   3328  20222]
 [ 25956   1434 405656]]
   acc_per_s  time  size    tf   fp  features  sls  epochs  accuracy  \
0   0.031486  24.0  3030  7210  0.5      3605    1      10  0.761436   

   precision    recall  
0   0.761436  0.761436  
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7

Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 73.75s
Precision: 0.753642753135272
Accuracy: 0.753642753135272
Recall: 0.753642753135272
cm: [[ 69523    584  74855]
 [  2679   4729  17562]
 [ 49887   2981 380178]]
   acc_per_s  time  size    tf    fp  features   sls  epochs  accuracy  \
0   0.012654  60.0  3030  7210  0.75      5407  0.25      20  0.753643   

   precision    recall  
0   0.753643  0.753643  
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 117.49s
Precision: 0.7351147139696639
Accuracy: 0.7351147139696639
Recall: 0.7351147139696639
cm: [[ 70925    555  73482]
 [  3185   4924  16861]
 [ 62866   2771 367409]]
   acc_per_s  time  size    tf    fp  features  sls  epochs  accuracy  \
0   0.039282  19.0  3030  7210  0.75      5407  0.5       5  0.735115   

   precision    recall  
0   0.735115  0.735115  
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 157.49s
Precision: 0.7611206379005536
Accuracy: 0.7611206379005536
Recall: 0.7611206379005536
cm: [[ 62230    616  82116]
 [  2079   4906  17985]
 [ 38082   3161 391803]]
   acc_per_s  time  size    tf    fp  features   sls  epochs  accuracy  \
0   0.008624  88.0  3030  7210  0.75      5407  0.75      20  0.761121   

   precision    recall  
0   0.761121  0.761121  
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 212.17s
Precision: 0.7618503494323176
Accuracy: 0.7618503494323176
Recall: 0.7618503494323176
cm: [[ 50634    545  93783]
 [  1341   4705  18924]
 [ 26309   2697 404040]]
   acc_per_s  time  size    tf    fp  features  sls  epochs  accuracy  \
0   0.028438  27.0  3030  7210  0.75      5407    1       5   0.7618

Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Scoring...
Scoring Time 105.13s
Precision: 0.7666349352712702
Accuracy: 0.7666349352712702
Recall: 0.7666349352712702
cm: [[ 61225    252  83485]
 [  1615   3367  19988]
 [ 33887   1487 397672]]
   acc_per_s  time  size    tf  fp  features   sls  epochs  accuracy  \
0    0.01557  49.0  3030  7210   1      7210  0.25      10  0.766635   

   precision    recall  
0   0.766635  0.766635  
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Scoring...
Scoring Time 104.28s
Precision: 0.7644938952996627
Accuracy: 0.7644938952996627
Recall: 0.7644938952996627
cm: [[ 42750    489 101723]
 [   883   4575  19512]
 [ 16852   2546 413648]]
   acc_per_s  time  size    tf  fp  features   sls  epochs  accuracy  \
0    0.00779  98.0  303

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Scoring...
Scoring Time 279.38s
Precision: 0.7432360716311375
Accuracy: 0.7432360716311375
Recall: 0.7432360716311375
cm: [[ 59375   1451  84136]
 [  2057   7710  15203]
 [ 45247   6729 381070]]
   acc_per_s  time  size    tf  fp  features   sls  epochs  accuracy  \
0    0.01815  41.0  3030  7210   1      7210  0.75       5  0.743236   

   precision    recall  
0   0.743236  0.743236  
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Scoring...


In [69]:
y_predicted = pipeline.predict(x_test)

0.799031130448839
[[ 68691    624  73491]
 [  1168   7849  15566]
 [ 25777   2727 397995]]


In [109]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix

#You'll get a warning but just ignore it

from sklearn.metrics import precision_recall_fscore_support

precision = precision_score(y_test, y_predicted, average='micro')
accuracy = accuracy_score(y_test, y_predicted)
recall = recall_score(y_test, y_predicted, average ='micro')
cm = confusion_matrix(y_test, y_predicted)


In [77]:
score = pipeline.score(x_test,y_test)



In [80]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_test, y_predicted,average='samples')

ValueError: Samplewise metrics are not available outside of multilabel classification.

In [61]:
pipeline.steps[-1][1].model.save(saveto['keras_model']) 
pipeline.steps.pop(-1)
joblib.dump(model, saveto['sklearn_pipe'])

NameError: name 'saveto' is not defined

In [55]:
y_predicted = pipeline.predict(x_test)

from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix

#You'll get a warning but just ignore it

 


print(f1_score(y_test, y_predicted, average = 'micro'))
print(precision_score(y_test, y_predicted, average = 'micro'))
print(accuracy_score(y_test, y_predicted))
print(recall_score(y_test, y_predicted, average = 'micro'))
print(confusion_matrix(y_test, y_predicted))

0.8205883990948976
0.8205883990948976
0.8205883990948976
0.8205883990948976
[[ 64247    620  51632]
 [   985   8901  10133]
 [ 20564   3046 324679]]


In [38]:
from keras.layers import Dense
from keras.models import Sequential

 


N_FEATURES = 5000
N_CLASSES = 3
def build_network():
    """
    Create a function that returns a compiled neural network
    """
    nn = Sequential()
    nn.add(Dense(500, activation='relu', input_shape=(N_FEATURES,)))
    nn.add(Dense(150, activation='relu'))
    nn.add(Dense(N_CLASSES, activation='softmax'))
    nn.compile(
         loss='categorical_crossentropy',
         optimizer='adam',
         metrics=['accuracy']
    )
    return nn

 


def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.sparse.reorder(tf.SparseTensor(indices, coo.data, coo.shape))

 


import tensorflow as tf

 

vectorizer = TfidfVectorizer(max_features=N_FEATURES)

 

X_train = vectorizer.fit_transform(df["processed"])
X_test = vectorizer.transform(test["processed_text"])
X_train = convert_sparse_matrix_to_sparse_tensor(X_train)
X_test = convert_sparse_matrix_to_sparse_tensor(X_test)

 

 

 

from sklearn.pipeline import Pipeline
# from transformer import TextNormalizer
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
#  ('norm', TextNormalizer()),
#  ('vect', TfidfVectorizer(max_features=N_FEATURES)),
 ('nn', KerasClassifier(build_fn=build_network,
 epochs=200,
batch_size=128))
 ])

 


def train_model(X, y, model, saveto=None, cv=12): 
    """
    Trains model from corpus at specified path and fits on full data.
    If a saveto dictionary is specified, writes Keras and Sklearn
    pipeline components to disk separately. Returns the scores.
    """

 

    model.fit(X, y)
    if saveto: 
        model.steps[-1][1].model.save(saveto['keras_model']) 
        model.steps.pop(-1)
        joblib.dump(model, saveto['sklearn_pipe'])
    return model

 

    y_train = train["cat"]
y_test = test["cat"]

 

 

TModel =train_model(X_train, y_train, model = pipeline)

 

#scores being returned are the "loss scores" 

 

y_predicted = TModel.predict(X_test)

 

#You'll get a warning but just ignore it

 


f1_score(ytest, y_predicted, average = 'micro')
precision_score(ytest, y_predicted, average = 'micro')
accuracy_score(ytest, y_predicted)
recall_score(ytest, y_predicted, average = 'micro')
confusion_matrix(ytest, y_predicted)

NameError: name 'test' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#//*** Initialize the Vectorizer
tfidf = TfidfVectorizer()


#//*** Build the feature matrix, which is a weighted sparse matrix
bwarg = tfidf.fit_transform(df['processed'][:10000])



In [None]:
print(bwarg.shape)

### 3. Classifying Images ###
In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.
