# A Multiclass Classification problem

# Model Set-up

In [1]:
%matplotlib inline
import pandas as pd
import os
import numpy as np
import sklearn


In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# PreProcessing work
Loop through each text files acquired from OCR and scrub the data

We will then zip the list into a DataFrame.




In [4]:
NEWLINE = '\n'
SKIP_FILES = {'cmds'}


def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="ANSI")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield content
                    


In [5]:
#Training set is pre-classified according to subfolder subfolder
path =r'C:\Users\osutr_000\Documents\Data\Ops'
list_o = []

for text in read_files(path):
    # tokenize the text
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    # Remove stop words known in nltk package
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    # Use word stemmeing to get root meaning
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    # Concatenate words into one string and then add to list_
    final = " ".join(intermediate)
    list_o.append(final)
ops_df = pd.DataFrame(data = list_o)
ops_df['class']="ops"
len(ops_df)

580

In [6]:
path =r'C:\Users\osutr_000\Documents\Data\Legal'
list_l = []

for text in read_files(path):
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_l.append(final)
legal_df = pd.DataFrame(data = list_l)
legal_df['class']="legal"
len(legal_df)

209

In [7]:
path =r'C:\Users\osutr_000\Documents\Data\Accounting'
list_a = []

for text in read_files(path):
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_a.append(final)
accounting_df = pd.DataFrame(data = list_a)
accounting_df['class']="accounting"
len(accounting_df)

324

## Assemble training and test data
We will use a 50/50 split

In [8]:
merged_df = ops_df.append(legal_df).append(accounting_df)
merged_df.columns = ['text', 'cat']
merged_df.sort_index().reset_index()
len(merged_df)

1113

In [9]:
# There are 1113 total training examples:
# 580 Ops
# 209 Legal
# 324 Accounting

In [10]:
# Now we will flatten the data into (sample, feature) matrices
X = merged_df.text
y = merged_df.cat

# and then split the dataset into two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Model fitting
To streamline the code, I will be using both pipelines and grid_search in the model fitting steps. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. The purpose of the grid_search is the test several different combinations of hyper-parameters to find the best estimator.

There will be 3 models evaluated:
* Model 1: CountVectorizer, TfidfTransformer, and SGDCLassifier
* Model 2: HashVectorizer and SGDClassifier
* Model 3: TfidfVectorizer and LinearSVC

## Model 1
Here are some of the model features seen in this section:
* CountVectorizer implements both tokenization and occurrence counting in a single class
* TfidfTransformer transforms a count matrix to a normalized tf or tf-idf representation. 
* SGDClassifier (Stochastic Gradient Decent)


In [11]:
# Define a pipeline with a text feature extractor and classifier:

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=.0001)),
])

# hyper-parameters
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}


In [12]:
print("# Tuning hyper-parameters...")
print()

clf1 = GridSearchCV(pipeline, parameters)
clf1.fit(X = X_train, y = y_train)

print()
print("Grid scores on development set:")
print()
means = clf1.cv_results_['mean_test_score']
stds = clf1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


Grid scores on development set:

0.991 (+/-0.010) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'vect__max_df': 0.5}
0.995 (+/-0.000) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'vect__max_df': 0.75}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'vect__max_df': 1.0}
0.995 (+/-0.009) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__max_df': 0.5}
0.998 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__max_df': 0.75}
0.998 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'vect__max_df': 1.0}
0.991 (+/-0.010) for {'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l1', 'vect__max_df': 0.5}
0.995 (+/-0.000) for {'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l1', 'vect__max_df': 0.75}
0.995 (+/-0.000) for {'clf__alpha': 1e-05, 'clf__penalty': 'e

In [13]:
print("Best parameters set found on development set:")
print(clf1.best_params_)
print()
print("Best score on development set:")
print(clf1.best_score_)

Best parameters set found on development set:
{'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'tfidf__norm': 'l2', 'vect__max_df': 0.75}

Best score on development set:
1.0


In [14]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf1.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf1.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      0.99      1.00       160
      legal       0.99      1.00      1.00       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




## Analysis of Model 1
As seen in the detailed classification report, both the precision and recall of the model are near 100%,
and therefore the harmonic mean (f-score) is also 100%.This indicates a near perfect model fit to the dataset. 

Precision is the number of true positives over the number of true positives plus false positives. 
Recall is the number of true positives over the number of true positives plus false negatives

## Model 2
In Model 2 a different vectorizer, HashingVectorizer, is used in the pipeline in order to evaluate the effect CountVectorizer had on Model 1. The HashingVectorizer will convert the text documents to a matrix of token occurrences by using a hashing trick to find the token string name to feature integer index mapping.

In [15]:
# Define a pipeline with a text feature extractor and classifier:

pipeline = Pipeline([
    ('vect', HashingVectorizer()),
    ('clf', SGDClassifier(max_iter=1000, tol=.0001)),
])

# hyper-parameters
parameters = {
    'vect__norm': ('l1','l2',None),
    'vect__alternate_sign': [0,1],
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
}


In [16]:
print("# Tuning hyper-parameters...")
print()

clf2 = GridSearchCV(pipeline, parameters)
clf2.fit(X_train, y_train)

print()
print("Grid scores on development set:")
print()
means = clf2.cv_results_['mean_test_score']
stds = clf2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


Grid scores on development set:

0.993 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': 'l1'}
0.996 (+/-0.010) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': 'l2'}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': None}
0.991 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': 'l1'}
1.000 (+/-0.000) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': 'l2'}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': None}
0.993 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'vect__alternate_sign': 0, 'vect__norm': 'l1'}
0.998 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__penalty': 'elasticnet', 'vect__alternate_sign': 0, 'vect__norm': 'l2'}
0.996 (+/-0.010) for {'c

In [17]:
print("Best parameters set found on development set:")
print(clf2.best_params_)
print()
print("Best score on development set:")
print(clf2.best_score_)

Best parameters set found on development set:
{'clf__alpha': 1e-05, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': 'l2'}

Best score on development set:
1.0


In [18]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf2.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf2.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       160
      legal       0.99      1.00      1.00       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




## Results from Model 2
Model 2 score results are nearly identical to Model 1. There will need to be further analysis before we can confidently declare which vectorizer is a better fit to the model. Fortunately, both have shown good results. 

One benfit of using the HashingVectorizer is that it scales very well since it does not have memory. However, there is no way to compute the inverse transform (from feature indices to string feature names) which can be a problem when trying to introspect which features are most important to a model. 


## Model 3
In Model 3 I will apply a new classifier, Support Vector Machine (SVM) in order to evaluate the effectivness of the SGD classifier on Model 1. SVM is one of the oldest AI algorithms, and is the basis for neural networks. SVC does not care about the 'perfect' point, instead it wants the 'ugliest' point that still classifies. We will tune the slack variable (c) in our model. 

In addition, I will resume using the same transformers from Model 1, CountVectorizer and TfidfTransformer, which are combined into a TfidfVectorizer. 


In [19]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

param_grid = dict(tfidf__sublinear_tf=[0,1],
                  tfidf__smooth_idf=[0,1],
                  tfidf__norm =['l1','l2',None],
                  clf__C=[1, 10, 100]
                 )

In [20]:
print("# Tuning hyper-parameters...")
print()

clf = GridSearchCV(pipe, param_grid)
clf.fit(X_train, y_train)


print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


Grid scores on development set:

0.896 (+/-0.006) for {'clf__C': 1, 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.646 (+/-0.012) for {'clf__C': 1, 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}
0.905 (+/-0.009) for {'clf__C': 1, 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.658 (+/-0.022) for {'clf__C': 1, 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 1}
0.991 (+/-0.018) for {'clf__C': 1, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.993 (+/-0.014) for {'clf__C': 1, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}
0.991 (+/-0.018) for {'clf__C': 1, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.993 (+/-0.014) for {'clf__C': 1, 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 1}
0.964 (+/-0.011) for {'clf__C': 1, 'tfidf__norm': None, 'tfidf__smooth_idf': 0, 'tfidf__

In [21]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print()
print("Best score on development set:")
print(clf.best_score_)

Best parameters set found on development set:
{'clf__C': 10, 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}

Best score on development set:
0.998201438849


In [22]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       160
      legal       0.99      1.00      1.00       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




## Analysis of Model 3
The SVM classifier is performing well, but slightly less accurate than the SGDC classifier. Precision, Recall and the combined F1 score are all averaging over 99%.

# Analysis of dataset
It is very curious that we have seen such good statistical results on all three models. I wonder what the top features the model is using to classify the documents? 

In [23]:
cv = CountVectorizer()
tfidf = TfidfTransformer()
clf = SGDClassifier(max_iter=1000, tol=.0001)
counts = cv.fit_transform(X_train, y_train)
inverse = tfidf.fit_transform(counts)
clf.fit(inverse, y_train)

#df = get_feature_df(clf1,cv.get_feature_names())
#df

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=1000, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=0.0001, verbose=0, warm_start=False)

In [24]:
print("How many total words are in the dataset:")
print (len(cv.vocabulary_))

How many total words are in the dataset:
17274


In [25]:
feature_names = cv.get_feature_names()
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
bestFeat = coefs_with_fns[:20]
worstFeat = coefs_with_fns[:-(20 + 1):-1]
print("Top 20 best features: ")
for feat in bestFeat:
    print(feat)
print("Top 20 worst features: ")
for feat in worstFeat:
    print(feat)

Top 20 best features: 
(-2.6328489991346107, 'item')
(-2.5388631023912005, 'stor')
(-2.1321852378280632, '2017')
(-1.7061929400109135, 'forb')
(-1.7061581778127475, 'shal')
(-1.7058441488086014, 'nrc')
(-1.6957447552067115, 'agr')
(-1.6264026160280873, 'box')
(-1.612774437528548, 'iiii')
(-1.5311906654616074, 'per')
(-1.4905384606874612, 'dre')
(-1.48437558298814, '0621266')
(-1.3992072257255157, 'vault')
(-1.2702604678333984, '2653')
(-1.267502048702466, 'vmi')
(-1.2051885551214421, '0621385')
(-1.1931551646377758, 'ms')
(-1.1669089060284177, 'boat')
(-1.1048900455671682, '25am')
(-1.102869615243298, 'pick')
Top 20 worst features: 
(2.6541626939421494, '2016')
(2.5918903954985413, '16')
(2.4672060899808597, 'invo')
(2.349496008060493, 'serv')
(2.0295616797862688, 'windstream')
(2.0028895152066259, 'pay')
(1.9458360604122062, 'llc')
(1.9386518022330104, '00')
(1.6453764315366723, 'tax')
(1.6366854582891603, 'dat')
(1.5900171125333229, '50')
(1.4897009248228821, 'alarm')
(1.448343862035

In [26]:
#what the are most repeated words in the training set?
freqs = [(word, counts.getcol(idx).sum()) for word, idx in cv.vocabulary_.items()]
#sort from largest to smallest
print (sorted (freqs, key = lambda x: -x[1]))

[('00', 4212), ('dat', 2830), ('item', 2731), ('stor', 2301), ('agr', 2163), ('midcon', 2137), ('serv', 1971), ('custom', 1740), ('shal', 1716), ('company', 1699), ('box', 1663), ('not', 1651), ('16', 1373), ('ok', 1283), ('charg', 1199), ('ord', 1198), ('pay', 1110), ('11', 1054), ('mat', 1019), ('2016', 1007), ('city', 988), ('oklahom', 963), ('work', 963), ('us', 954), ('fee', 914), ('fil', 907), ('tot', 893), ('10', 893), ('405', 881), ('provid', 869), ('invo', 825), ('2017', 813), ('party', 812), ('due', 790), ('the', 786), ('delivery', 783), ('tim', 777), ('sit', 775), ('lin', 747), ('term', 745), ('address', 736), ('0621385', 728), ('auth', 727), ('may', 723), ('numb', 722), ('inform', 703), ('med', 685), ('scan', 677), ('pag', 676), ('day', 671), ('trr', 671), ('ver', 666), ('no', 658), ('request', 652), ('writ', 651), ('ms', 644), ('vmi', 643), ('12', 636), ('cli', 624), ('per', 620), ('account', 612), ('access', 601), ('vault', 594), ('stat', 587), ('tax', 579), ('05', 579), 




In [30]:
import gc
gc.collect()

path =r'C:\Users\osutr_000\Documents\Data\Ops'
list_o = []

for text in read_files(path):
    list_o.append(text)

path =r'C:\Users\osutr_000\Documents\Data\Legal'
list_l = []

for text in read_files(path):
    list_l.append(text)
    
path =r'C:\Users\osutr_000\Documents\Data\Accounting'
list_a = []

for text in read_files(path):
    list_a.append(text)
    
list_ = list_o + list_a + list_l

In [31]:
gc.collect()
corpus = ' '.join(list_a)
nlp(corpus)

MemoryError: 

In [None]:
token_text = [token.orth_ for token in corpus_l]
token_pos = [token.pos_ for token in corpus_l]
token_entity_type = [token.ent_type_ for token in corpus_l]
token_shape = [token.shape_ for token in corpus_l]

token_df = pd.DataFrame(list(zip(token_text, token_pos, token_entity_type, token_shape)),
             columns=['token_text', 'part_of_speech','entity_type', 'token_shape'])
token_df.head(10)

In [None]:
token_df.groupby(token_df.part_of_speech).size().plot(kind = 'bar')