# Stoneburner, Kurt
- ## DSC 550 - Week 02

In [169]:
# //****************************************************************************************
# //*** Set Working Directory to thinkstats folder.
# //*** This pseudo-relative path call should work on all Stoneburner localized projects. 
# //****************************************************************************************

import os
import sys
import json 
# //*** Imports and Load Data
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import unicodedata

#//*** nltk - Natural Language toolkit
import nltk

#//**** Requires the punkt module. Download if it doesn't exist
try:
    type(nltk.punkt)
except:
    nltk.download('punkt')
    
from nltk.corpus import stopwords

#//*** Stopwords requires an additional download
try:
    type(stopwords)
except:
    nltk.download('stopwords')

from nltk.tokenize import word_tokenize

from nltk.stem.porter import PorterStemmer

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

### 2.2 Exercise: Build Your Text Classifiers ###

**1. You can find the dataset controversial-comments.jsonl for this exercise in the Weekly Resources: Week 2 Data Files.**

Pre-processing Text: For this part, you will start by reading the controversial-comments.jsonl file into a DataFrame. Then,

In [2]:
#//*** Temporary dictionary holds lists of JSON objects. pd.read_json generated an error. Likely due to the file
#//*** Not being a complete JSON object. Each line is its on JSON object. 
#//*** Read the file line by line
#//*** Parse each line of JSON. Parse each Key / Value pair. Each value is appeneded to a list. The lists are managed
#//*** with tdict[key]. As long as the input file has the same number of keys for each line, then this works.
#//*** Not sure what the canonical method is for converting items into a dataframe. But this technique has worked well
#//*** in DSC530 and DSC540.

#//*** Temporary Dictionary
tdict = {}

#//*** Read JSON into lists based on keys.
with open('z_controversial-comments.jsonl', "r") as f:
    
    #//*** Initialize tdict. Each Key is used in both the JSON and tdict. This works on JSON of any length but is
    #//*** limited to a flat construct which is fine for 2-D arrays.
    #//*** 1.) Read the first line of the file
    #//*** 2.) Convert the first line of JSON to a dictionary
    #//*** 3.) Get each key/value in dictionary items
    for key,value in json.loads(f.readline()).items():
            #//*** Initialize a list of value, using tdict[key]
            tdict[key] = [value]
    
    #//*** Process each remaining lines.
    for line in f:
        
        #//*** 1.) Convert each line to a dictionary
        #//*** 2.) get each key/value in dictionary
        for key,value in json.loads(line).items():
            
            #//*** Add Value to the list associated with tdict[key]
            tdict[key].append(value)
#//*** Initialize a new dataframe
con_df = pd.DataFrame()

#//*** Loop through tdict, add each key as a column with value as the column data
for key,value in tdict.items():
    con_df[key] = value

#//*** Delete tdict. It is unused and a 200mb+ object
del tdict

**A. Convert all text to lowercase letters.**

In [None]:
#//*** Convert to lower case
con_df['txt'] = con_df['txt'].str.lower()

**B. Remove all punctuation from the text.**

In [85]:
#//*** Remove new lines, I didn't see any samples of \r\n. But it is common enough. Replace it if it exists
con_df['txt'] = con_df['txt'].str.replace(r'\r?\n',"")
#//*** Remove plain ]n new lines
con_df['txt'] = con_df['txt'].str.replace(r'\n',"")

#//*** Remove html entities, observed entities are &gt; and &lt;. All HTML entities begin with & and end with ;.
#//*** Let's use regex to remove html entities
con_df['txt'] = con_df['txt'].str.replace(r'&.*;',"")

#//*** Remove elements flagged as [removed]
con_df['txt'] = con_df['txt'].str.replace(r'\[removed\]',"")

#//*** Remove elements flagged as [deleted]
con_df['txt'] = con_df['txt'].str.replace(r'\[deleted\]',"")

#//*** Some text should be empty with the removal of [removed] and [deleted]
#//*** Remove the empty text
con_df = con_df[ con_df['txt'].str.len() > 0]

#//*** Remove punctuation using the example from the book
punctuation = dict.fromkeys(i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith('P') )
con_df['txt'] = con_df['txt'].str.translate(punctuation)


**C. Remove stop words.**

In [105]:
#//*** Tokenize conf_df['txt']
#//*** This Takes a wee bit to run
con_df['process'] = con_df['txt'].apply(word_tokenize)

In [170]:
#//*** I'm not pythonic enough to do this on one line.
#//*** This function removes stop_words from a list.
#//*** Works with dataframe.apply()
def remove_stop_words(input_list):
    #//*** Load Stopwords
    
    
    for word in input_list:
        if word in stop_words:
            input_list.remove(word)
    return input_list

#//*** The stop_words include punctuation. Stop Word Contractions will not be filtered out.
stop_words = []

#//*** Remove apostrophies from the stop_words
for stop in stopwords.words('english'):
    stop_words.append(stop.replace("'",""))

#//*** Remove Stop words from the tokenized strings in the 'process' column
con_df['process'] = con_df['process'].apply(remove_stop_words)

**D. Apply NLTK’s PorterStemmer.**

In [184]:


#/*** Create Stemmer
porter = PorterStemmer()

#//*** Pre stemming sample
print(con_df['process'][400:420])

#//*** It's a pythonic answer
#//*** 1.) Apply() an action to each row
#//*** 2.) lambda word_list, each row is treated as word_list for the subsequent expression
#//*** 3.) The base [ word for word in wordlist] would return each word in word_list as a list. 
#//*** 4.) [porter.stem(word) for word in word_list] - performs stemming on each word and returns a list
con_df['process'] = con_df['process'].apply(lambda word_list: [porter.stem(word) for word in word_list] )

#//*** post stemming sample
print(con_df['process'][400:420])


439    [bill, fuck, bagel, long, toaster, yet, hillar...
440                          [ucuteman, got, rekt, lulz]
442    [think, anyone, cares, business, management, p...
443    [mean, hes, going, take, away, spouses, health...
444    [wanting, argue, something, past, make, disqua...
445                           [long, get, keep, chicago]
446    [everytime, say, doubt, actually, adult, sound...
447    [working, momsnever, underestimate, stupid, le...
448    [put, effort, first, response, conversation, w...
449    [1, post, positive, trump, news, instead, whin...
450    [ive, saying, thinking, year, depressing, amer...
452                                [subreddit, terrible]
453    [mitt, romney, right, guy, office, positive, s...
454    [dense, enough, taking, comments, section, rpo...
455    [think, boils, understanding, lobbysts, first,...
456    [true, cant, electors, vote, without, popular,...
457    [lotta, research, done, thedonald, trump, univ...
458             [especially, 17

In [185]:
#//******************************
#//*** Break out the Konami Code
#//******************************
#//*** Up, up, Down, Down, Left, Right, Left, Right, Select Start

#//*** It's a lot of processing to get here. Save the dataframe to make this easier to pickup later
con_df.to_csv("z_wk02_controversial_words_df.csv")

**2. Now that the data is pre-processed, you will apply three different techniques to get it into a usable form for model-building. Apply each of the following steps (individually) to the pre-processed data.**

A. Convert each text entry into a word-count vector (see sections 5.3 & 6.8 in the Machine Learning with Python Cookbook).

B. Convert each text entry into a part-of-speech tag vector (see section 6.7 in the Machine Learning with Python Cookbook).

C. Convert each entry into a term frequency-inverse document frequency (tfidf) vector (see section 6.9 in the Machine Learning with Python Cookbook).

**Follow-Up Question**

For the three techniques in problem (2) above, give an example where each would be useful.

NOTE

Running these steps on all of the data can take a while, so feel free to cut down on the number of texts (maybe 50,000) if your program takes too long to run. But be sure to select the text entries randomly!

In [4]:
# //*** CODE HERE

In [5]:
# //*** CODE HERE