# Stoneburner, Kurt
- ## DSC 550 - Week 09/10

In [1]:
import os
import sys
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

#//*** Use the whole window in the IPYNB editor
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### 1. Neural Network Classifier with Scikit ### 

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.


In [71]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** These cleanup functions are based on Week 02 cleanup code, and rebuilt for Week 04

#//*****************************************
#//*** Functions:
#//*****************************************
#//*** Mr_clean_text: Converts to lowercase, removes punctuation, newlines and html markup
#//****************************************************************************************************
#//*** Tokenize_series: Converts a Series containing strings, to a series containing tokenized lists
#//****************************************************************************************************
#//*** Remove_stop_words: Removes Stop words based on nltk stopwords 'english' dictionary
#//****************************************************************************************************
#//*** Apply_stemmer: Stem tokenized words using nltk.stem.porter.PorterStemme
#//****************************************************************************************************
#//*** apply_pos_tag: Builds Part of Speech Tagging from tokeninzed text
#//****************************************************************************************************

#//****************************************************************************************************

#//****************************************************************************************************
#//*** Key values will default to true. If code needs to be defaulted to False, a default_false list can be added later
#//*** All Boolean kwarg keya are stored in kwarg list. This speeds up the coding of the action_dict.
#//*** As Kwargs are added 
def mr_clean_text(input_series, input_options={}):
    
    def clean_text(input_string):
        clean1 = re.sub(r'['+string.punctuation + '’—”'+']', "", input_string.lower())
        return re.sub(r'\W+', ' ', clean1)

    #//*** import time library
    try:
        type(time)
    except:
        import time
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** Start Timing the process
    start_time = time.time()

    
    #//*** Add some data validation. I'm preparing this function for additional use. I'm checking if future users (ie future me)
    #//*** may throw some garbage at this function. Experience has taught me to fail safely wherever possible.

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "punctuation" ]
    
    default_false = ["remove_empty"]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to TRUE
    for key in key_list:
        action_dict[key] = True
    
    for key in default_false:
        action_dict[key] = False
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    
    #//*************************************************************************
    #//*** The Cleanup/Processing code is a straight lift from DSC550 - Week02
    #//*************************************************************************
    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()
    
   
    #//*** Remove New Lines
    if action_dict["newline"]:
        #//*** Rmove \r\n
        input_series = input_series.str.replace('\r?\n',"")

        #//*** Remove \n new lines
        input_series = input_series.str.replace('\n',"")
    
    
    input_series = input_series.str.replace("\\(http.+\\)","")
    
    #//*** Print Elements between brackets
    #print(input_series[ input_series == input_series.str.match('[.*]')])

     
    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** Let's use regex to remove html entities
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&.*;',"")

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:
        #//*** Load libraries for punctuation if not already loaded.
        #//*** Wrapping these in a try, no sense in importing libraries that already exist.
        #//*** Unsure of the cost of reimporting libraries (if any). But testing if library is already loaded feels
        #//*** like a good practice

        #input_series = input_series.apply(lambda x: clean_text(x))

        try:
            type(sys)
        except:
            import sys

        try:
            type(unicodedata)
        except:
            import unicodedata

        #//*** replace Comma and Period with a space.
        for punct in [",","."]:
            input_series = input_series.str.replace(punct," ")

        #//*** Remove punctuation using the example from the book
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

        #table = str.maketrans(dict.fromkeys(string.punctuation))  # OR {key: None for key in string.punctuation}
        #print(table )
        #input_series = input_series.str.translate(table)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series

                                          
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into it's own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standarization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series,slices=20,input_options={}):
    
    try:
        type(nltk)
    except:
        import nltk
    
    word_tokenize = nltk.tokenize.word_tokenize 
    
    #//*** import time library
    try:
        type(time)
    except:
        import time
        
    #//*** All kwargs are listed here. These initialize False by default.
    key_list = [ "fast", "quiet" ]
    
    #//*** Build Action Dictionary
    action_dict = { } 
    
    #//*** Build the keys from kwarg_list and default them to False
    for key in key_list:
        action_dict[key] = False
        
    #//*** Loop through the input kwargs (if any). Assign the action_dict values based on the kwargs:
    for key,value in input_options.items():
        print(key,value)
        action_dict[key] = value
    
    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
            
    #input_series = input_series.apply(word_tokenize)
    
    if action_dict['fast'] == False:
        print("Processing Tokens with NLTK Word Tokenize")
        input_series = apply_with_progress(input_series,word_tokenize,slices)
    else:
        print("Process Tokens with Split()")
        input_series = apply_with_progress(input_series,lambda x: x.split(),slices)
    
    
    
    print(f"Tokenize Time: {time.time() - start_time}")
    
    return input_series

#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    
    #//*** This function removes stop_words from a series.
    #//*** Works with series.apply()
    def apply_stop_words(input_list):

        #//*** Load Stopwords   
        for word in input_list:
            if word in stop_words:
                input_list.remove(word)
                #print(f"Removing: {word}")
        return input_list

    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk
        
    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download
    try:
        type(stopwords)
    except:
        nltk.download('stopwords')


    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
    #//*** Manually adding word the
    stop_words = []
    
    #//*** Remove apostrophies from the stop_words
    for stop in stopwords.words('english'):
        stop_words.append(stop.replace("'",""))

    #print("Stop Words: ")
    print(stop_words)
    print ("Processing Stop Words")
    input_series = apply_with_progress(input_series, apply_stop_words)
    
    print(f"Stop Words Time: {time.time() - start_time}")
    
    return input_series

def apply_stemmer(input_series,trim_single_words = True,slices=100):
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk

    #//*** Instantiate the Stemmer
    porter = nltk.stem.porter.PorterStemmer()
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    #//*** 1.) Apply() an action to each row
    #//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
    #//*** 3.) The base [ word for word in wordlist] would return each word in word_list as a list. 
    #//*** 4.) [porter.stem(word) for word in word_list] - performs stemming on each word and returns a list
    #input_series = input_series.apply(lambda word_list: [porter.stem(word) for word in word_list] )
    print("Begin: Apply Stemmer")
    input_series = apply_with_progress(input_series, lambda word_list: [porter.stem(word) for word in word_list],slices)
    
    #//*** Remove Single letter words after stemming
    
    """
    if trim_single_words:
        for word_list in input_series:
            for word in word_list:
                if len(word) < 2:
                    word_list.remove(word)
    """
    
    print(f"Apply Stemmer Time: {time.time() - start_time}")
    return input_series

def apply_pos_tag(input_series,slices=100):
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk

    from nltk import pos_tag

    #//pos_tag requires an additional download
    try:
        pos_tag(["the","quick","brown","fox"])
    except: 
        nltk.download('averaged_perceptron_tagger')
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    print("Begin Part of Speech tagging")
    
    input_series = apply_with_progress(input_series,pos_tag,slices)
    
    print(f"Part of Speech Tagging Time: {round(time.time() - start_time,2)}s")
    
    return input_series
    
def apply_lemmatization(input_series,slices=20):
            
    #//*** import nltk if needed
    try:
        type(nltk)
    except:
        import nltk


    from nltk.stem import WordNetLemmatizer
    
    from nltk.corpus import wordnet    
    
    #nltk.download('wordnet')
    
    
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Start Timing the process
    start_time = time.time()
    
    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()
    
    # Initialize the Lemmatizer instance
    lemmatizer = WordNetLemmatizer() 
    
    #//*** 1.) Apply() an action to each row
    #//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
    #//*** 3.) The base [ word for word in wordlist] would return each word in word_list as a list. 
    #//*** 4.) [lemmatizer.lemmatize(word) for word in word_list] - performs lemmtization on each word and returns a list
    #lemmatized = input_series.apply(lambda word_list: [lemmatizer.lemmatize(*word) for word in word_list] )
    
    print("Begin Lemmatization...")
    
    input_series = apply_with_progress(input_series,lambda word_list: [lemmatizer.lemmatize(word) for word in word_list],20)
    
    print(f"Lemmatization Time: {time.time() - start_time}")
    
    #if detoken:
    #    return tokenize_series(input_series,5,{"fast":True})

    return input_series

#//*** Apply a function to a Series and display processing progress
#//*** Slices is the total number of intervals to report progress.
#//*** Slices = 20 displays processing after every 5% is processed
#//*** Slices = 100 displays processing after every 1% is processed
def apply_with_progress(input_series,input_function,slices=20):
    #//*** import time library
    try:
        type(time)
    except:
        import time

    #//*** Get the time at the start of the loop, used for elapsed time.
    start_time = time.time()
    
    #//*** The interval is the number of elements to process in each Loop. The default is 20.
    #//*** Which displays results at 5% intervals.
    interval = int(len(input_series)/slices)
    
    #//*** Total number of items to process
    total = len(input_series)
    

    #//*** Loop through slice times and display processing statistics for each slice.
    for x in range(0, slices ):
        #//*** Get time at the start of the slice.
        loop_time = time.time()
        
        #//*** Set the start index
        begin_dex = interval*x
        
        #//*** Set the end index
        end_dex = interval*x+interval-1
        
        #//*** Apply the input function to a slice of the input_series
        #//*** This part does all the actual 'work'
        input_series[begin_dex:end_dex] = input_series[begin_dex:end_dex].apply(input_function)
        
        #//*** Get the time after the slice of work is done
        now = time.time()
        
        #//*** Compute the estimated remaining time
        #//*** Total elapsed time / % of completed slices = Estimated total time
        #//*** Estimated total time - elaped time = Remaining time
        est_remain = round( ( ( now - start_time ) /  ( (x+1)/slices ) - (now-start_time)),2)

        #//*** Display Results so we know how much time is left (so we can effectively multi-task: ie comments, research and Valheim)
        print(f"Processed {x}/{slices}: {begin_dex}:{end_dex} [{total}] in {round(now-loop_time,2)}s elapsed: {round(now-start_time,2)}s est Remain: {est_remain}s")
    
    #//*** END For Slice Loop
    
    #//*** Process the remaining values (Since interval is an int there should be a remainder)
    loop_time = time.time()
    begin_dex = end_dex+1
    if begin_dex < len(input_series):
        print(f"Processing Remaining values: {begin_dex} : {total} ")
        #print(input_series[begin_dex:])
        input_series[begin_dex:] = input_series[begin_dex:].apply(input_function)
    
    #//*** Display Final output
    print(f"Processed {slices}/{slices}: {begin_dex}:{end_dex} [{total}] in {round(time.time()-loop_time,2)}s elapsed: {round(time.time()-start_time,2)}s")
    
    #//*** return Series
    return input_series

In [3]:
df = pd.read_json("z_wk09_categorized-comments.jsonl", lines=True)

In [21]:
print(df)
#df['pos_tag'] = apply_pos_tag(df['txt'])

                cat                                                txt  \
0            sports  Barely better than Gabbert? He was significant...   
1            sports  Fuck the ducks and the Angels! But welcome to ...   
2            sports  Should have drafted more WRs.\n\n- Matt Millen...   
3            sports            [Done](https://i.imgur.com/2YZ90pm.jpg)   
4            sports                                      No!! NOO!!!!!   
...             ...                                                ...   
606471  video_games             No. It's probably only happened to you   
606472  video_games  I think most of the disappointment came from t...   
606473  video_games  dishonored 1/2 looked like arse, so what the h...   
606474  video_games                                          [removed]   
606475  video_games  I wish more games provided options like Rise o...   

                                                processed  
0       barely better than gabbert he was significa

In [74]:
#//*** Clean the Text.
df['processed'] = mr_clean_text(df['txt'],{"lower": True, "newline": True, "html": True, "remove_empty" : False, "punctuation" : True})

print(f"Articles of length with 0 characters: {len(df[ df['processed'].str.len() == 0 ])}")

#//Remove Items with an Arbitrary length of 0

print(f"Articles of length with 0 characters: {len(df[ df['processed'].str.len() == 0 ])}")
print("Remove these articles")
print(f"Article Count Before: {len(df)}")
df = df[ df['processed'].str.len() > 0 ]
print(f"Article Count After: {len(df)}")

lower True
newline True
html True
remove_empty False
punctuation True
Text Cleaning Time: 4.877617120742798
Articles of length with 0 characters: 0
Articles of length with 0 characters: 0
Remove these articles
Article Count Before: 606008
Article Count After: 606008


In [75]:
#//************************
#//*** Tokenize the Text
#//************************
#//*** The custom function displays progress while it's working
df['processed'] = tokenize_series(df['processed'],20,{"fast":True})

fast True
Process Tokens with Split()
Processed 0/20: 0:30299 [606008] in 0.19s elapsed: 0.19s est Remain: 3.57s
Processed 1/20: 30300:60599 [606008] in 0.06s elapsed: 0.25s est Remain: 2.21s
Processed 2/20: 60600:90899 [606008] in 0.06s elapsed: 0.31s est Remain: 1.74s
Processed 3/20: 90900:121199 [606008] in 0.05s elapsed: 0.35s est Remain: 1.41s
Processed 4/20: 121200:151499 [606008] in 0.08s elapsed: 0.43s est Remain: 1.3s
Processed 5/20: 151500:181799 [606008] in 0.06s elapsed: 0.49s est Remain: 1.15s
Processed 6/20: 181800:212099 [606008] in 0.06s elapsed: 0.55s est Remain: 1.02s
Processed 7/20: 212100:242399 [606008] in 0.05s elapsed: 0.6s est Remain: 0.9s
Processed 8/20: 242400:272699 [606008] in 0.56s elapsed: 1.16s est Remain: 1.42s
Processed 9/20: 272700:302999 [606008] in 0.09s elapsed: 1.26s est Remain: 1.26s
Processed 10/20: 303000:333299 [606008] in 0.07s elapsed: 1.33s est Remain: 1.09s
Processed 11/20: 333300:363599 [606008] in 0.05s elapsed: 1.38s est Remain: 0.92s
Pr

In [77]:
#//************************
#//*** Remove Stop Words
#//************************
#//*** The custom function displays progress while it's working
df['processed'] = remove_stop_words(df['processed'])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'youre', 'youve', 'youll', 'youd', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'shes', 'her', 'hers', 'herself', 'it', 'its', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'thatll', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', '

In [78]:
df['lema_stem_tokens'] = apply_lemmatization(df['processed']) 

Begin Lemmatization...
Processed 0/20: 0:30299 [606008] in 3.52s elapsed: 3.52s est Remain: 66.95s
Processed 1/20: 30300:60599 [606008] in 1.47s elapsed: 4.99s est Remain: 44.93s
Processed 2/20: 60600:90899 [606008] in 1.32s elapsed: 6.31s est Remain: 35.77s
Processed 3/20: 90900:121199 [606008] in 1.17s elapsed: 7.48s est Remain: 29.93s
Processed 4/20: 121200:151499 [606008] in 1.94s elapsed: 9.42s est Remain: 28.26s
Processed 5/20: 151500:181799 [606008] in 1.4s elapsed: 10.82s est Remain: 25.25s
Processed 6/20: 181800:212099 [606008] in 1.34s elapsed: 12.16s est Remain: 22.58s
Processed 7/20: 212100:242399 [606008] in 1.23s elapsed: 13.39s est Remain: 20.09s
Processed 8/20: 242400:272699 [606008] in 1.28s elapsed: 14.67s est Remain: 17.93s
Processed 9/20: 272700:302999 [606008] in 2.45s elapsed: 17.12s est Remain: 17.12s
Processed 10/20: 303000:333299 [606008] in 1.84s elapsed: 18.96s est Remain: 15.51s
Processed 11/20: 333300:363599 [606008] in 2.28s elapsed: 21.24s est Remain: 14.

In [83]:
print( f"Total Corpus Word Count: {df['lema_stem_tokens'].apply(lambda x: len(x)).sum()}" )
#//*** Eliminate words with length of 0,1 or 2. This is an arbitrary value to help with feature reduction
#df['tokens'] = df['tokens'].apply(lambda word_list : list(filter(lambda word : len(word) >= 3, word_list))) 
df['lema_stem_tokens'] = df['lema_stem_tokens'].apply(lambda word_list : list(filter(lambda word : len(word) >= 3, word_list))) 

#//*** Build Word Count
df['num_wds'] = df['lema_stem_tokens'].apply(lambda x: len(x))

print( f"Total Corpus Word Count: {df['lema_stem_tokens'].apply(lambda x: len(x)).sum()}" )

#//************************
#//*** Stem the lemma's
#//***********************************************************************
#//*** We are trying to reduce the feature set
#//***********************************************************************
df['lema_stem_tokens'] = apply_stemmer(df['lema_stem_tokens'])

Total Corpus Word Count: 9135233
Total Corpus Word Count: 9135233
Begin: Apply Stemmer
Processed 0/100: 0:6059 [606008] in 1.75s elapsed: 1.75s est Remain: 172.76s
Processed 1/100: 6060:12119 [606008] in 1.6s elapsed: 3.35s est Remain: 164.09s
Processed 2/100: 12120:18179 [606008] in 1.59s elapsed: 4.94s est Remain: 159.75s
Processed 3/100: 18180:24239 [606008] in 1.62s elapsed: 6.56s est Remain: 157.54s
Processed 4/100: 24240:30299 [606008] in 1.53s elapsed: 8.1s est Remain: 153.87s
Processed 5/100: 30300:36359 [606008] in 1.45s elapsed: 9.55s est Remain: 149.62s
Processed 6/100: 36360:42419 [606008] in 1.41s elapsed: 10.96s est Remain: 145.57s
Processed 7/100: 42420:48479 [606008] in 1.39s elapsed: 12.34s est Remain: 141.96s
Processed 8/100: 48480:54539 [606008] in 1.4s elapsed: 13.75s est Remain: 138.98s
Processed 9/100: 54540:60599 [606008] in 1.37s elapsed: 15.12s est Remain: 136.04s
Processed 10/100: 60600:66659 [606008] in 1.35s elapsed: 16.46s est Remain: 133.22s
Processed 11/1

Processed 96/100: 581760:587819 [606008] in 1.45s elapsed: 146.91s est Remain: 4.54s
Processed 97/100: 587820:593879 [606008] in 1.44s elapsed: 148.35s est Remain: 3.03s
Processed 98/100: 593880:599939 [606008] in 1.64s elapsed: 149.99s est Remain: 1.52s
Processed 99/100: 599940:605999 [606008] in 1.63s elapsed: 151.62s est Remain: 0.0s
Processing Remaining values: 606000 : 606008 
Processed 100/100: 606000:605999 [606008] in 0.0s elapsed: 151.62s
Apply Stemmer Time: 151.62582802772522


In [84]:
#//**********************************
#//*** Apply Part of Speech Tagging
#//**********************************
df['pos_tag'] = apply_pos_tag(df['lema_stem_tokens'])

Begin Part of Speech tagging
Processed 0/100: 0:6059 [606008] in 7.28s elapsed: 7.28s est Remain: 720.63s
Processed 1/100: 6060:12119 [606008] in 6.76s elapsed: 14.04s est Remain: 688.11s
Processed 2/100: 12120:18179 [606008] in 6.72s elapsed: 20.77s est Remain: 671.47s
Processed 3/100: 18180:24239 [606008] in 6.9s elapsed: 27.67s est Remain: 664.1s
Processed 4/100: 24240:30299 [606008] in 6.67s elapsed: 34.34s est Remain: 652.54s
Processed 5/100: 30300:36359 [606008] in 6.38s elapsed: 40.73s est Remain: 638.06s
Processed 6/100: 36360:42419 [606008] in 6.37s elapsed: 47.1s est Remain: 625.73s
Processed 7/100: 42420:48479 [606008] in 6.21s elapsed: 53.31s est Remain: 613.04s
Processed 8/100: 48480:54539 [606008] in 6.28s elapsed: 59.58s est Remain: 602.47s
Processed 9/100: 54540:60599 [606008] in 6.18s elapsed: 65.76s est Remain: 591.88s
Processed 10/100: 60600:66659 [606008] in 6.16s elapsed: 71.93s est Remain: 581.94s
Processed 11/100: 66660:72719 [606008] in 6.03s elapsed: 77.96s est

Processed 95/100: 575700:581759 [606008] in 6.65s elapsed: 629.74s est Remain: 26.24s
Processed 96/100: 581760:587819 [606008] in 6.48s elapsed: 636.22s est Remain: 19.68s
Processed 97/100: 587820:593879 [606008] in 6.52s elapsed: 642.74s est Remain: 13.12s
Processed 98/100: 593880:599939 [606008] in 6.87s elapsed: 649.61s est Remain: 6.56s
Processed 99/100: 599940:605999 [606008] in 6.81s elapsed: 656.43s est Remain: 0.0s
Processing Remaining values: 606000 : 606008 
Processed 100/100: 606000:605999 [606008] in 0.01s elapsed: 656.44s
Part of Speech Tagging Time: 656.45s


In [87]:
pd.to_pickle(df,"z_wk09_categorized_comments_processed.zip")

In [102]:
#conda install -c conda-forge transformers 

#from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score


#//*** Convert categorical string to categorical int
#//*** Only run once to prevent iPython issues
if (df.dtypes['cat'] == object):
    cat_dict = dict(tuple(enumerate(df['cat'].unique())))
    #//*** Build sexcat Categorical column
    df['intcat'] = df['cat'].copy()
    
    #//*** replace values using the sex_dict dictionary
    for key,value in cat_dict.items():
        df['intcat'] = df['intcat'].replace(value,key)

x = list(df['lema_stem_tokens'])

y = np.array(df['intcat'])

from transformer import TextNormalizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.feature_extraction.text import TfidVectorizer
#print(df)

ModuleNotFoundError: No module named 'transformer'


### 2. Neural Network Classifier with Keras###
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

### 3. Classifying Images ###
In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 classify MSINT images using a convolutional neural network. Report the accuracy of your results.
