# Stoneburner, Kurt
- ## DSC 550 - Week 09/10

In [1]:
#//*** Imports: standard library first, then third-party
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#//*** display and HTML are imported from the public IPython.display module;
#//*** importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

#//*** Use the whole window in the IPYNB editor
display(HTML("<style>.container { width:100% !important; }</style>"))

#//*** Maximize columns and rows displayed by pandas
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### 1. Neural Network Classifier with Scikit ### 

Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using scikit-learn. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.


In [2]:
#//*** Load the comments DataFrame from CSV; the first CSV column becomes the index.
#//*** The raw JSONL load is kept for reference but disabled:
#rdf = pd.read_json("z_controversial-comments.jsonl", lines=True)
df = pd.read_csv(
    "z_wk02_controversial_words_df.csv",
    index_col=0,
)

In [3]:
#//***************************************
#//*** Apply Common Cleanup operations
#//***************************************
#//*** These cleanup functions are based on Week 02 cleanup code, and rebuilt for Week 04

#//*****************************************
#//*** Functions:
#//*****************************************
#//*** Mr_clean_text: Converts to lowercase, removes punctuation, newlines and html markup
#//****************************************************************************************************
#//*** Tokenize_series: Converts a Series containing strings, to a series containing tokenized lists
#//****************************************************************************************************
#//*** Remove_stop_words: Removes Stop words based on nltk stopwords 'english' dictionary
#//****************************************************************************************************
#//*** Apply_stemmer: Stem tokenized words using nltk.stem.porter.PorterStemmer
#//****************************************************************************************************
#//*** apply_pos_tag: Builds Part of Speech Tagging from tokenized text
#//****************************************************************************************************

#//****************************************************************************************************

#//****************************************************************************************************
#//*** Key values default to True. Keys that need to default to False are listed in default_false.
#//*** All Boolean kwarg keys are stored in key_list. This speeds up the coding of the action_dict.
#//*** As kwargs are added, append them to key_list (or to default_false).
def mr_clean_text(input_series, input_options=None):
    """Clean a Series of text strings and return the cleaned Series.

    The caller's Series is not modified; the work is done on a copy. Each step
    can be switched on or off through ``input_options``:

    * ``lower`` (default True): convert the text to lower case.
    * ``newline`` (default True): remove line breaks (optional carriage return
      followed by a newline).
    * ``html`` (default True): remove individual HTML entities such as ``&gt;``
      and ``&lt;``.
    * ``remove_empty`` (default False): drop rows whose text is empty.
    * ``punctuation`` (default True): replace ``,`` and ``.`` with a space,
      then delete every character in a Unicode punctuation category.

    Parameters
    ----------
    input_series : pandas.Series
        Series of strings to clean.
    input_options : dict, optional
        Maps option names (see above) to booleans that override the defaults.

    Returns
    -------
    pandas.Series
        The cleaned copy of ``input_series``.
    """
    #//*** Libraries are imported locally to keep this function self-contained
    import sys
    import time
    import unicodedata

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()

    #//*** Start Timing the process
    start_time = time.time()

    #//*** All kwargs are listed here. These initialize TRUE by default.
    key_list = [ "lower", "newline", "html", "punctuation" ]

    #//*** These kwargs initialize FALSE by default.
    default_false = ["remove_empty"]

    #//*** Build Action Dictionary
    action_dict = { key: True for key in key_list }
    action_dict.update({ key: False for key in default_false })

    #//*** Loop through the input options (if any). Assign the action_dict values based on the options.
    #//*** A None default replaces the shared mutable {} default argument.
    for key,value in (input_options or {}).items():
        print(key,value)
        action_dict[key] = value

    #//*** Convert to Lower Case, Default to True
    if action_dict["lower"]:
        input_series = input_series.str.lower()

    #//*** Remove New Lines (\r\n and \n). regex=True is explicit because the default of
    #//*** Series.str.replace differs between pandas versions.
    if action_dict["newline"]:
        input_series = input_series.str.replace(r'\r?\n', "", regex=True)

    #//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
    #//*** The pattern matches one entity at a time. The previous greedy '&.*;' removed everything
    #//*** between the first '&' and the last ';' in a comment, deleting real text.
    if action_dict["html"]:
        input_series = input_series.str.replace(r'&#?\w+;', "", regex=True)

    #//*** Remove the empty lines
    if action_dict["remove_empty"]:
        input_series = input_series[ input_series.str.len() > 0]

    #//*** Remove punctuation
    if action_dict["punctuation"]:

        #//*** Replace Comma and Period with a space. regex=False so that "." is always a
        #//*** literal period and never the regex wildcard.
        for punct in [",","."]:
            input_series = input_series.str.replace(punct," ", regex=False)

        #//*** Remove punctuation using the example from the book: map every code point in a
        #//*** Unicode punctuation category (category starts with 'P') to None, which deletes it.
        punctuation = dict.fromkeys(i for i in range(sys.maxunicode + 1) if unicodedata.category(chr(i)).startswith('P') )
        input_series = input_series.str.translate(punctuation)

    print(f"Text Cleaning Time: {time.time() - start_time}")

    return input_series

                                          
#//*** Tokenize a Series containing Strings.
#//*** Breaking this out into its own function for later reuse.
#//*** Not a lot of code here, but it helps to keep the libraries localized. This creates standardization for future
#//*** Stoneburner projects. Also has the ability to add functionality as needed.

def tokenize_series(input_series,slices=20,input_options={}):
    """Convert a Series of strings into a Series of token lists.

    Parameters
    ----------
    input_series : pandas.Series
        Series of strings to tokenize. A copy is processed and returned.
    slices : int
        Number of progress intervals passed on to apply_with_progress.
    input_options : dict
        Optional flags. ``fast`` (default False) selects ``str.split()``
        instead of NLTK's ``word_tokenize``. ``quiet`` is accepted and
        stored but is not read anywhere in this function.

    Returns
    -------
    pandas.Series
        Series whose rows are lists of tokens.
    """
    #//*** Libraries are kept local to the function
    import nltk
    import time

    #//*** NLTK tokenizer used by the default (non-fast) path
    nltk_tokenizer = nltk.tokenize.word_tokenize

    #//*** Every known flag starts out False
    flags = dict.fromkeys([ "fast", "quiet" ], False)

    #//*** Caller supplied options override (or extend) the defaults
    for option_name,option_value in input_options.items():
        print(option_name,option_value)
        flags[option_name] = option_value

    #//*** Start Timing the process
    started = time.time()

    #//*** Work on a copy of the incoming Series
    working_series = input_series.copy()

    #//*** Pick the tokenizer: NLTK unless the fast flag was switched on
    use_nltk = flags['fast'] == False

    if use_nltk:
        print("Processing Tokens with NLTK Word Tokenize")
        tokenizer = nltk_tokenizer
    else:
        print("Process Tokens with Split()")
        tokenizer = lambda text: text.split()

    working_series = apply_with_progress(working_series,tokenizer,slices)

    print(f"Tokenize Time: {time.time() - started}")

    return working_series

#//*** Remove Stop words from the input list
def remove_stop_words(input_series):
    """Remove English stop words from a Series of token lists.

    The stop words come from NLTK's 'english' stopwords corpus with
    apostrophes stripped, so that contractions match text whose punctuation
    has already been removed.

    Parameters
    ----------
    input_series : pandas.Series
        Series whose rows are lists of word tokens. A copy of the Series is
        processed, and each row is replaced by a new list; the caller's lists
        are not modified.

    Returns
    -------
    pandas.Series
        Series of token lists with every stop word removed.
    """
    #//*** Libraries are kept local to the function
    import nltk
    import time

    stopwords = nltk.corpus.stopwords

    #//*** Stopwords requires an additional download. The corpus object is loaded lazily, so a
    #//*** missing corpus only shows up (as a LookupError) when the words are actually read.
    #//*** Read the words here and download on demand.
    try:
        english_stop_words = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        english_stop_words = stopwords.words('english')

    #//*** Start Timing the process
    start_time = time.time()

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()

    #//*** Remove apostrophes from the stop_words. A set gives constant-time membership tests.
    stop_words = { stop.replace("'","") for stop in english_stop_words }

    #//*** This function removes stop_words from a single token list.
    #//*** Works with series.apply()
    #//*** It builds a new list rather than removing items from the list being iterated;
    #//*** removing during iteration skips the element after each removal, which left
    #//*** consecutive stop words behind.
    def apply_stop_words(input_list):
        return [ word for word in input_list if word not in stop_words ]

    print ("Processing Stop Words")
    input_series = apply_with_progress(input_series, apply_stop_words)

    print(f"Stop Words Time: {time.time() - start_time}")

    return input_series

def apply_stemmer(input_series,trim_single_words = True,slices=100):
    """Stem every token in a Series of token lists with NLTK's PorterStemmer.

    Parameters
    ----------
    input_series : pandas.Series
        Series whose rows are lists of word tokens. A copy is processed.
    trim_single_words : bool
        Currently unused: the single-letter trimming step is disabled, so
        this flag has no effect.
    slices : int
        Number of progress intervals passed on to apply_with_progress.

    Returns
    -------
    pandas.Series
        Series whose rows are lists of stemmed tokens.
    """
    #//*** Libraries are kept local to the function
    import nltk

    #//*** Instantiate the Stemmer
    stemmer = nltk.stem.porter.PorterStemmer()

    import time

    #//*** Start Timing the process
    started = time.time()

    #//*** Work on a copy of the incoming Series
    working_series = input_series.copy()

    #//*** Stem each word of one row and hand back the stemmed words as a list
    def stem_row(word_list):
        stemmed = []
        for word in word_list:
            stemmed.append(stemmer.stem(word))
        return stemmed

    print("Begin: Apply Stemmer")
    working_series = apply_with_progress(working_series, stem_row, slices)

    print(f"Apply Stemmer Time: {time.time() - started}")
    return working_series

def apply_pos_tag(input_series,slices=100):
    """Apply NLTK part-of-speech tagging to every row of a Series.

    Parameters
    ----------
    input_series : pandas.Series
        Series whose rows are passed one at a time to ``nltk.pos_tag``.
        A copy is processed.
    slices : int
        Number of progress intervals passed on to apply_with_progress.

    Returns
    -------
    pandas.Series
        Series whose rows hold the ``pos_tag`` result for the matching input row.
    """
    #//*** Libraries are kept local to the function
    import time

    import nltk
    from nltk import pos_tag

    #//*** pos_tag requires an additional download. Probe the tagger with a tiny sentence and
    #//*** download the model only when NLTK reports the resource as missing (LookupError).
    #//*** A bare except here would also swallow Ctrl-C and unrelated errors.
    try:
        pos_tag(["the","quick","brown","fox"])
    except LookupError:
        nltk.download('averaged_perceptron_tagger')

    #//*** Start Timing the process
    start_time = time.time()

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()

    print("Begin Part of Speech tagging")

    input_series = apply_with_progress(input_series,pos_tag,slices)

    print(f"Part of Speech Tagging Time: {round(time.time() - start_time,2)}s")

    return input_series
    
def apply_lemmatization(input_series,slices=20):
    """Lemmatize every token in a Series of token lists with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    input_series : pandas.Series
        Series whose rows are lists of word tokens. A copy is processed.
    slices : int
        Number of progress intervals passed on to apply_with_progress.

    Returns
    -------
    pandas.Series
        Series whose rows are lists of lemmatized tokens.
    """
    #//*** Libraries are kept local to the function
    import time

    from nltk.stem import WordNetLemmatizer

    #//*** The lemmatizer needs the WordNet corpus; if it is missing run: nltk.download('wordnet')

    #//*** Start Timing the process
    start_time = time.time()

    #//*** Force creation of a new Series instead of using a copy
    input_series = input_series.copy()

    # Initialize the Lemmatizer instance
    lemmatizer = WordNetLemmatizer()

    #//*** 1.) Apply() an action to each row
    #//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
    #//*** 3.) [lemmatizer.lemmatize(word) for word in word_list] - performs lemmatization on each word and returns a list
    print("Begin Lemmatization...")

    #//*** Pass the caller's slices value through (it used to be ignored in favor of a hard-coded 20)
    input_series = apply_with_progress(input_series,lambda word_list: [lemmatizer.lemmatize(word) for word in word_list],slices)

    print(f"Lemmatization Time: {time.time() - start_time}")

    return input_series

#//*** Apply a function to a Series and display processing progress
#//*** Slices is the total number of intervals to report progress.
#//*** Slices = 20 displays processing after every 5% is processed
#//*** Slices = 100 displays processing after every 1% is processed
def apply_with_progress(input_series,input_function,slices=20):
    """Apply a function to every element of a Series, printing progress per slice.

    The Series is processed in ``slices`` equally sized positional chunks, with
    any remainder processed at the end. Each element is passed to
    ``input_function`` exactly once. The results are written back into
    ``input_series`` itself, which is also returned.

    Parameters
    ----------
    input_series : pandas.Series
        Series to process. It is updated in place; pass a copy to keep the original.
    input_function : callable
        Function applied to each element through ``Series.apply``.
    slices : int
        Number of progress intervals (20 reports every 5%, 100 every 1%).

    Returns
    -------
    pandas.Series
        ``input_series`` with every element replaced by the function result.

    Raises
    ------
    ValueError
        If ``slices`` is smaller than 1.
    """
    import time

    if slices < 1:
        raise ValueError(f"slices must be a positive integer, got {slices!r}")

    #//*** Get the time at the start of the loop, used for elapsed time.
    start_time = time.time()

    #//*** Total number of items to process
    total = len(input_series)

    #//*** The interval is the number of elements to process in each Loop.
    #//*** It is 0 when the Series has fewer elements than slices.
    interval = total // slices

    #//*** Position of the first element that has not been processed yet
    next_dex = 0

    #//*** Loop through slice times and display processing statistics for each slice.
    #//*** Skipped when interval is 0: everything is then handled by the remainder step below,
    #//*** so the function is never applied to an element more than once.
    if interval > 0:
        for x in range(0, slices ):
            #//*** Get time at the start of the slice.
            loop_time = time.time()

            #//*** Set the start index
            begin_dex = interval*x

            #//*** Set the end index. Slice ends are exclusive, so this is one past the last
            #//*** element of the slice; no element is left out between slices.
            end_dex = begin_dex+interval

            #//*** Apply the input function to a slice of the input_series
            #//*** This part does all the actual 'work'
            input_series[begin_dex:end_dex] = input_series[begin_dex:end_dex].apply(input_function)
            next_dex = end_dex

            #//*** Get the time after the slice of work is done
            now = time.time()

            #//*** Compute the estimated remaining time
            #//*** Total elapsed time / % of completed slices = Estimated total time
            #//*** Estimated total time - elapsed time = Remaining time
            est_remain = round( ( ( now - start_time ) /  ( (x+1)/slices ) - (now-start_time)),2)

            #//*** Display Results so we know how much time is left. The displayed range is inclusive.
            print(f"Processed {x}/{slices}: {begin_dex}:{end_dex-1} [{total}] in {round(now-loop_time,2)}s elapsed: {round(now-start_time,2)}s est Remain: {est_remain}s")

    #//*** END For Slice Loop

    #//*** Process the remaining values (Since interval is an int there can be a remainder)
    loop_time = time.time()
    if next_dex < total:
        print(f"Processing Remaining values: {next_dex} : {total} ")
        input_series[next_dex:] = input_series[next_dex:].apply(input_function)

    #//*** Display Final output
    print(f"Processed {slices}/{slices}: {next_dex}:{total} [{total}] in {round(time.time()-loop_time,2)}s elapsed: {round(time.time()-start_time,2)}s")

    #//*** return Series
    return input_series

In [4]:
#//*** Run part of speech tagging over the 'txt' column and store the result in a new 'pos_tag' column.
#//*** NOTE(review): nltk.pos_tag expects a list of tokens per row. If 'txt' holds plain strings
#//*** (as read back from the CSV), each character would be tagged on its own -- confirm before a full run.
df['pos_tag'] = apply_pos_tag(input_series=df['txt'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\stonk013\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


Begin Part of Speech tagging
Processed 0/100: 0:8740 [874139] in 95.56s elapsed: 95.56s est Remain: 9460.25s
Processed 1/100: 8741:17481 [874139] in 89.77s elapsed: 185.33s est Remain: 9081.01s
Processed 2/100: 17482:26222 [874139] in 88.65s elapsed: 273.98s est Remain: 8858.64s
Processed 3/100: 26223:34963 [874139] in 93.18s elapsed: 367.16s est Remain: 8811.72s
Processed 4/100: 34964:43704 [874139] in 95.91s elapsed: 463.06s est Remain: 8798.14s
Processed 5/100: 43705:52445 [874139] in 91.39s elapsed: 554.45s est Remain: 8686.34s
Processed 6/100: 52446:61186 [874139] in 87.11s elapsed: 641.56s est Remain: 8523.58s
Processed 7/100: 61187:69927 [874139] in 91.08s elapsed: 732.64s est Remain: 8425.39s
Processed 8/100: 69928:78668 [874139] in 93.17s elapsed: 825.81s est Remain: 8349.83s
Processed 9/100: 78669:87409 [874139] in 93.71s elapsed: 919.52s est Remain: 8275.64s
Processed 10/100: 87410:96150 [874139] in 92.12s elapsed: 1011.63s est Remain: 8185.04s
Processed 11/100: 96151:104891

KeyboardInterrupt: 

In [None]:
#//*** Show the first rows using the notebook's rich table display instead of printing the whole frame
df.head()


### 2. Neural Network Classifier with Keras ###
Using the multi-label classifier dataset from earlier exercises (categorized-comments.jsonl in the reddit folder), fit a neural network classifier using Keras. Use the code found in chapter 12 of the Applied Text Analysis with Python book as a guideline. Report the accuracy, precision, recall, F1-score, and confusion matrix.

### 3. Classifying Images ###
In chapter 20 of the Machine Learning with Python Cookbook, implement the code found in section 20.15 to classify MNIST images using a convolutional neural network. Report the accuracy of your results.
