resources

<ul>
<li>https://chrisalbon.com/</li>
<li>https://stackoverflow.com/questions/44173624/how-to-nltk-word-tokenize-to-a-pandas-dataframe-for-twitter-data</li>
<li>https://www.dataquest.io/blog/natural-language-processing-with-python/</li>
<li>https://stackoverflow.com/questions/34784004/python-text-processing-nltk-and-pandas</li>
<li>https://stackoverflow.com/questions/37443138/python-stemming-with-pandas-dataframe</li>
<li>https://stackoverflow.com/questions/18936957/count-distinct-words-from-a-pandas-data-frame</li>
<li>https://pandas.pydata.org/pandas-docs/stable/indexing.html</li>
<li>https://github.com/pandas-dev/pandas/blob/master/doc/cheatsheet/Pandas_Cheat_Sheet.pdf</li>
<li>http://www.webpages.uidaho.edu/~stevel/504/Pandas%20DataFrame%20Notes.pdf</li>
<li>https://chrisalbon.com/python/pandas_convert_categorical_to_dummies.html</li>
<li>http://mathesaurus.sourceforge.net/r-numpy.html</li>
<li>http://www.datasciencefree.com/cheatsheets.html</li>
<li>http://www.data-analysis-in-python.org/python_for_r.html</li>
<li>https://www.dataquest.io/blog/python-vs-r/</li>
<li>https://rstudio.github.io/reticulate/articles/arrays.html</li>
<li>https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html</li>
</ul>

## General

preferred packages

In [None]:
import h2o
import pandas as pd
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

read a CSV into a data frame

In [None]:
# Load the training CSV. Malformed rows are dropped (with a warning) instead
# of aborting the read. `on_bad_lines="warn"` replaces `error_bad_lines=False`,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
train = pd.read_csv(
    "train_121117.csv",
    sep=",",
    on_bad_lines="warn",
    encoding="latin1",  # alternatives: encoding="utf-8"; add header=None for header-less files
)

save a data frame as a CSV file

In [None]:
# Write the frame to a CSV file. By default pandas also writes the row index
# as an unnamed first column; pass index=False to leave it out.
test.to_csv("test_121317.csv")

drop a column

In [None]:
# Remove the column labelled "Unnamed: 0".
# `columns=` targets column labels, the same thing axis=1 selects.
test = test.drop(columns=['Unnamed: 0'])

## Data Exploration

In [None]:
# Dimensions of the frame as a tuple: (number of rows, number of columns).
test.shape

get column names

In [None]:
# Column labels as a NumPy array. `Index.get_values()` was deprecated in
# pandas 0.25 and removed in 1.0; `.values` returns the same array.
train.columns.values

see values for a specific column for a few rows

In [None]:
# Pick the column first, then take rows at positions 1 through 4.
test["tokenized_text"].iloc[1:5]

get summary information for a given column

In [None]:
# Summary statistics for one column (for numeric data: count, mean, std,
# min, quartiles, max).
test["word1_preposition"].describe()

filter rows

In [None]:
# Keep rows whose "has_altogeth" feature is non-zero and show the first three.
train[train["has_altogeth"] != 0].head(3)

get mean of numeric column

In [None]:
# Arithmetic mean of the "all_char" column.
train["all_char"].mean()

## Data Shaping and Manipulation

create a new vector (Series) by applying a function to each value of a data frame column

In [None]:
# First word of every "bare" string, as a Series.
# The pattern finds no match when the text is empty or begins with
# punctuation (there is no word boundary to anchor on). `re.search` then
# returns None and `.group(1)` raises AttributeError; missing (NaN) entries
# fail as well. `str.extract` yields NaN in those cases, which is mapped to "".
first_word = test["bare"].str.extract(r"^\s*(\w*)\b", expand=False).fillna("")

split data set into train and validation set using random sampling from uniform distribution

In [None]:
# Tag every row with a reproducible uniform draw from [0, 1). Rows with a
# draw above 0.2 (about 80%) form the training split; the rest are held out
# for validation.
np.random.seed(1002)
n_rows = train.shape[0]
train["assign"] = np.random.uniform(low=0.0, high=1.0, size=n_rows)
in_training = train["assign"] > 0.2
train_use = train[in_training]
validate_use = train[train["assign"] <= 0.2]

## Text Processing with re

In [None]:
# `import(re)` is a SyntaxError: the import statement takes a bare module name.
import re

search for a pattern and return the string/word found

In [None]:
# Match the leading run of word characters in the entry labelled 0 of "bare".
# group(0) is the whole match; group(1), group(2), ... are the parenthesised
# sub-groups, when the pattern defines any.
leading = re.search(r"^\w*\b", test["bare"][0])
leading.group(0)

search for a pattern in all the elements of a column and return the grouping found 

In [None]:
# First word of every "bare" string, as a Series (one word per row).
# Capture group 1 is the word itself; group 0 would be the entire match,
# including any leading whitespace.
# The pattern finds no match when the text is empty or begins with
# punctuation. `re.search` then returns None and `.group(1)` raises
# AttributeError; missing (NaN) entries fail as well. `str.extract` yields
# NaN in those cases, which is mapped to "".
first_word = test["bare"].str.extract(r"^\s*(\w*)\b", expand=False).fillna("")

create a feature counting the number of members of a list of words contained in each string in a vector/column of text

In [None]:
# For each string in `first_word`, count how many whole-word occurrences of
# the words in `prepositions` it contains, and store that as a feature.
# The previous approach replaced each hit with "A" and then counted the "A"s.
# That also counted any capital "A" already present in the text, and it
# treated regex metacharacters inside a preposition as pattern syntax.
# Counting matches of a single escaped alternation avoids both problems.
if prepositions:
    # Longest alternatives first, so a multi-word entry wins over a shorter
    # entry it contains.
    alternatives = sorted((re.escape(p) for p in prepositions), key=len, reverse=True)
    prep_re = re.compile(r"\b(?:" + "|".join(alternatives) + r")\b")
    preposition_hits = first_word.apply(lambda text: len(prep_re.findall(text)))
else:
    # Nothing to look for: every count is zero.
    preposition_hits = first_word.apply(lambda text: 0)

# When each entry holds a single word, this column is automatically binary.
test["word1_preposition"] = preposition_hits

## Text Processing with NLTK
<ul>
<li>https://stackoverflow.com/questions/37443138/python-stemming-with-pandas-dataframe</li>
<li>https://stackoverflow.com/questions/18936957/count-distinct-words-from-a-pandas-data-frame</li>
</ul>

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

tokenize and stem text in a data frame column

In [None]:
# Tokenize each "bare" string, stem every token with the English Snowball
# stemmer, and store the stems re-joined with single spaces.
stemmer = SnowballStemmer("english")
train['tokenized_text'] = train['bare'].apply(word_tokenize)
train["stemmed"] = train["tokenized_text"].apply(
    lambda tokens: " ".join(stemmer.stem(token) for token in tokens)
)

create binary, dummy variable columns for the presence of specific words

In [None]:
# Add one 0/1 column per stem of interest, named "has_<stem>", flagging rows
# whose stemmed text contains that stem as a whole word.
# The loop covers every stem: the previous `[1:2]` slice restricted it to
# 'altogeth', so the other indicator columns were never created.
words_of_interest = ['account', 'altogeth', 'attent', 'between', 'case'] #these are stems
for word in words_of_interest:
    # Compile once per stem; escape it so it is always matched literally.
    stem_re = re.compile(r"\b" + re.escape(word) + r"\b")
    # Bind the pattern as a default argument so the lambda does not depend
    # on the loop variable after this iteration.
    train["has_" + word] = train["stemmed"].apply(
        lambda text, pattern=stem_re: 1 if pattern.search(text) else 0
    )

## Machine Learning with h2o

<ul>
<li>http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/drf.html</li>
<li>https://github.com/h2oai/h2o-tutorials</li>
<li>http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/frame.html</li>
<li>https://h2o-release.s3.amazonaws.com/h2o/rel-slater/9/docs-website/h2o-py/docs/frame.html</li>
<li>http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/sortcolumn.html</li>
<li>http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-munging/replacing-values.html</li>
</ul>

initialize

In [None]:
# Connect to a local H2O server, starting one if none is running.
# nthreads caps the worker threads; max_mem_size caps the server's memory.
h2o.init(nthreads=5, max_mem_size = "4G")

get cluster information

In [None]:
# Print a status summary for the H2O cluster this session is connected to.
h2o.cluster().show_status()

import CSV

In [None]:
# Upload a local CSV to the H2O cluster. From here on `train` is an H2O
# frame, not the pandas DataFrame used in the earlier sections.
train = h2o.upload_file("train_use_121217.csv")

view columns; drop a column

In [None]:
# List of column names in the H2O frame.
train.columns
# Drop the column named "C1" and rebind the result.
# NOTE(review): "C1" is presumably the unnamed index column written by pandas
# `to_csv` — confirm against the uploaded file.
train = train.drop("C1")

check whether a column is numeric; check the type of a list element

In [None]:
# isnumeric() gives one flag per column; [0] takes the flag for the single
# selected column.
train["word_count"].isnumeric()[0]
# Python type of the first element of the first entry in `mu`.
# NOTE(review): `mu` is the list of per-column means built in the
# normalization cell further down — run that cell first.
type(mu[0][0])

find all numeric columns

In [None]:
# Collect the names of all numeric columns in `train`, leaving out the is_*
# indicator columns (numeric, but deliberately not included).
# `cnames` and the per-column numeric flags were not defined anywhere before
# this cell (`temp` still held leftovers from the regex section), so both are
# now derived from the frame itself.
excluded = {"is_eap", "is_hpl", "is_mws"}
cnames = train.columns
numerics = [
    cname
    for cname, numeric in zip(cnames, train.isnumeric())
    if numeric and cname not in excluded
]

get mean and sd for all of the numeric columns; z-score normalize using mean and sd

In [None]:
# Mean and standard deviation for every numeric column, then z-score each
# column in place. Each statistic comes back wrapped in a sequence, so its
# first element is the scalar used in the arithmetic.
mu = [train[cname].mean() for cname in numerics]
sigma = [train[cname].sd() for cname in numerics]

for cname, col_mean, col_sd in zip(numerics, mu, sigma):
    train[cname] = (train[cname] - col_mean[0]) / col_sd[0]

train random forest model

In [None]:
# Random forest with 50 trees, depth capped at 20, and a fixed seed.
forest_params = {"ntrees": 50, "max_depth": 20, "seed": 1002}
rf2 = h2o.estimators.random_forest.H2ORandomForestEstimator(**forest_params)
# Fit on `train` with "author" as the response; the id and is_* columns are
# kept out of the predictors.
rf2.train(
    y="author",
    training_frame=train,
    ignored_columns=["id", "is_eap", "is_hpl", "is_mws"],
)

get predictions from trained model and overall accuracy of predicted classes

In [None]:
# Score the test frame with the trained forest.
rf2_predictions = rf2.predict(test)
# Element-wise comparison of the true labels with the predicted class column.
# NOTE(review): `test_authors` is not defined in this notebook — presumably
# the true "author" column of the test frame; confirm before running.
rf2_results = test_authors == rf2_predictions["predict"]
# Share of rows where the comparison is 1 (prediction matched): accuracy.
len(rf2_results[rf2_results == 1])/len(test_authors)

create new frame from subset of columns

In [None]:
# Build a new frame from four columns of `submit`, selected by position.
# NOTE(review): `submit` is not defined in this notebook, and the positions
# 0/288/289/290 depend on its exact column layout — verify before reuse.
results = submit[[0,288, 289, 290]]

export h2o frame as csv

In [None]:
# Write the H2O frame to a CSV file; force=True overwrites an existing file.
h2o.export_file(results, "submission_121317.csv", force = True)