# Model Set-up

A demonstration of Topic Modeling

In [37]:
%matplotlib inline
import pandas as pd
import os
import numpy as np
import sklearn


In [38]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD

In [39]:
NEWLINE = '\n'
# File names (not paths) that are never read, at any depth of the tree.
SKIP_FILES = {'cmds'}


def read_files(path, encoding="ANSI"):
    """Yield the body text of every file found under ``path``.

    The directory tree is walked recursively by ``os.walk``. For each file
    whose name is not in ``SKIP_FILES``, everything up to and including the
    first blank line is treated as a header and dropped; the remaining lines
    are joined with ``NEWLINE`` and yielded as one string. A file without a
    blank line therefore yields an empty string.

    Parameters
    ----------
    path : str
        Root directory to scan. A missing directory yields nothing.
    encoding : str, optional
        Text encoding used to decode the files. The default ``"ANSI"`` codec
        is only registered on Windows; when the requested codec is unknown
        to this interpreter, ``"cp1252"`` is used instead.

    Yields
    ------
    str
        The post-header content of one file.
    """
    import codecs  # local import: only needed for the codec availability check

    try:
        codecs.lookup(encoding)
    except LookupError:
        # e.g. "ANSI" outside Windows; fall back to a Windows ANSI code page
        encoding = "cp1252"

    # os.walk already descends into sub-directories, so no manual recursion
    # is needed (calling read_files() without iterating it would do nothing).
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            past_header, lines = False, []
            # the context manager closes the handle even if decoding fails
            with open(file_path, encoding=encoding) as f:
                for line in f:
                    if past_header:
                        lines.append(line)
                    elif line == NEWLINE:
                        past_header = True
            # Lines keep their own trailing newline, so joining with NEWLINE
            # double-spaces the text; kept as-is so the output is unchanged.
            yield NEWLINE.join(lines)
                    


# PreProcessing work
Loop through each text file acquired from OCR and scrub the data.

We will then zip the list into a DataFrame.




In [40]:
# Training set is pre-classified according to subfolder
path = r'C:\Users\osutr_000\Documents\Data\Ops'
list_ = []

# Build the tokenizer, stop-word collection and stemmer once instead of once
# per document; a set makes the stop-word membership test O(1).
tokenizer = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
lanste = LancasterStemmer()

for text in read_files(path):
    # tokenize the text
    intermediate = tokenizer.tokenize(text)
    # Remove stop words known in nltk package
    # NOTE(review): the comparison is case-sensitive, so capitalised stop
    # words presumably pass through - confirm whether that is intended.
    intermediate = [i for i in intermediate if i not in stop]
    # Use word stemming to get root meaning
    intermediate = [lanste.stem(i) for i in intermediate]
    # Concatenate words into one string and then add to list_
    final = " ".join(intermediate)
    list_.append(final)

# One row per document; column 0 holds the processed text.
ops_df = pd.DataFrame(data=list_)
ops_df['class'] = "ops"
len(ops_df)

580

In [41]:
path = r'C:\Users\osutr_000\Documents\Data\Legal'
list_ = []

# Build the tokenizer, stop-word collection and stemmer once instead of once
# per document; a set makes the stop-word membership test O(1).
tokenizer = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
lanste = LancasterStemmer()

for text in read_files(path):
    # split into word tokens, drop stop words, then stem what is left
    # NOTE(review): the stop-word comparison is case-sensitive - confirm
    # whether capitalised stop words are meant to be kept.
    intermediate = tokenizer.tokenize(text)
    intermediate = [i for i in intermediate if i not in stop]
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_.append(final)

# One row per document; column 0 holds the processed text.
legal_df = pd.DataFrame(data=list_)
legal_df['class'] = "legal"
len(legal_df)

209

In [42]:
path = r'C:\Users\osutr_000\Documents\Data\Accounting'
list_ = []

# Build the tokenizer, stop-word collection and stemmer once instead of once
# per document; a set makes the stop-word membership test O(1).
tokenizer = RegexpTokenizer(r'\w+')
stop = set(stopwords.words('english'))
lanste = LancasterStemmer()

for text in read_files(path):
    # split into word tokens, drop stop words, then stem what is left
    # NOTE(review): the stop-word comparison is case-sensitive - confirm
    # whether capitalised stop words are meant to be kept.
    intermediate = tokenizer.tokenize(text)
    intermediate = [i for i in intermediate if i not in stop]
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_.append(final)

# One row per document; column 0 holds the processed text.
accounting_df = pd.DataFrame(data=list_)
accounting_df['class'] = "accounting"
len(accounting_df)

324

In [43]:
# Stack the three per-class frames into one training frame.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; as
# with append, each frame keeps its own index labels.
merged_df = pd.concat([ops_df, legal_df, accounting_df])
len(merged_df)

1113

In [44]:
# There are 1113 total training examples:
# 580 Ops
# 209 Legal
# 324 Accounting

In [45]:
# Give the two columns descriptive names: column 0 (the processed document
# text) becomes 'text' and the 'class' label column becomes 'cat'.
merged_df.columns = ['text', 'cat']
# Preview only: each per-class frame brought its own 0-based index, so labels
# repeat across classes; reset_index() exposes the old label as an 'index'
# column. merged_df itself is not re-indexed by this line.
merged_df.sort_index().reset_index().head()

Unnamed: 0,index,text,cat
0,0,401 w 33rd edmond ok 73013 www midcondat com 4...,ops
1,0,26938 002 4 27 16 290 60 290 60 5 4 16 26938 d...,accounting
2,0,midcon dat serv llc thi dat destruct agr mad e...,legal
3,1,5 25 16 179 17 179 17 5 27 16 01605 05 republ ...,accounting
4,1,,legal


In [None]:
# You can see how the processed data is now structured.

In [47]:
# Separate the raw inputs (processed document text) from the target labels
X, y = merged_df["text"], merged_df["cat"]

# Hold out half of the documents for testing; the fixed seed keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=0,
)

# Model 1
Here are some of the model features seen in this section:
* CountVectorizer implements both tokenization and occurrence counting in a single class

* TfidfTransformer transforms a count matrix to a normalized tf or tf-idf representation. 
The goal of using tf-idf is to scale down the impact of tokens that occur very frequently in a given corpus 
that are hence empirically less informative than features that occur in a small fraction of the training corpus.

* SGDClassifier (Stochastic Gradient Descent) has gained popularity in NLP applications for its scalability.
SGD has a lot of hyper-parameters and is sensitive to feature scaling.


In [48]:
# Define a pipeline with a text feature extractor and classifier:
#   vect  - token counts per document
#   tfidf - re-weights the counts (tf or tf-idf, optionally normalised)
#   clf   - linear classifier trained with stochastic gradient descent

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # SGD shuffles the training data between epochs; a fixed seed makes the
    # grid-search scores reproducible from run to run.
    ('clf', SGDClassifier(random_state=0)),
])

# hyper-parameters searched by GridSearchCV ("<step>__<parameter>" keys);
# commented-out entries are optional extensions of the grid
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (5,),
    #'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # NOTE: 'clf__n_iter' is the older name of 'clf__max_iter' and is not
    # accepted by newer scikit-learn releases.
    #'clf__n_iter': (10, 50, 80),
}


In [49]:
print("# Tuning hyper-parameters...", end="\n\n")

# Cross-validated exhaustive search over `parameters` on the training half
clf = GridSearchCV(pipeline, parameters)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# One line per parameter combination: mean CV score and a +/- 2 std band
search_results = clf.cv_results_
means = search_results['mean_test_score']
stds = search_results['std_test_score']
for params, mean, std in zip(search_results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Tuning hyper-parameters...

Best parameters set found on development set:

{'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.5}

Grid scores on development set:

0.930 (+/-0.037) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 0.5}
0.950 (+/-0.079) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 0.75}
0.932 (+/-0.040) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 1.0}
0.993 (+/-0.014) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.5}
0.993 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.75}
0.993 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 

In [50]:
# Report per-class precision/recall/F1 for the held-out half first, then for
# the training half, using the refit best estimator from the grid search.
for heading, features, y_true in (
    ("Detailed classification report on test set:", X_test, y_test),
    ("Detailed classification report on training set", X_train, y_train),
):
    print(heading)
    print()
    y_pred = clf.predict(features)
    print(classification_report(y_true, y_pred))
    print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       160
      legal       0.99      1.00      1.00       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




# Analysis of Model 1
As seen in the detailed classification report, both the precision and recall of the model are near 100%,
and therefore their harmonic mean (F1-score) is also near 100%. This indicates a good model fit to the dataset.

Precision is the number of true positives over the number of true positives plus false positives. 
Recall is the number of true positives over the number of true positives plus false negatives

# Model 2
Here are some of the features you will see tried in this model:
* TfidfVectorizer combines the features of the previously used packages - CountVectorizer and TfidfTransformer
* Support Vector Classification (SVC) doesn't like high dimensionality (number of features), so I applied a dimensionality reducer called TruncatedSVD.
* SVM-C is one of the oldest AI algorithms, and is the basis for neural networks. SVC doesn't care about the 'perfect' point; rather, it wants the 'ugliest' point that still classifies. You can control the kernel type and slack variable (C).


In [65]:
# Model 2 pipeline: tf-idf features -> low-rank projection -> SVM classifier
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # TruncatedSVD uses a randomized solver by default; a fixed seed makes
    # the projection (and therefore the grid scores) reproducible.
    ('svd', TruncatedSVD(random_state=0)),
    ('clf', SVC()),
])

# Grid searched by GridSearchCV ("<step>__<parameter>" keys).
# sublinear_tf is a boolean option, so real booleans are used instead of 0/1;
# newer scikit-learn releases validate parameter types and reject ints here.
param_grid = dict(tfidf__sublinear_tf=[False, True],
                  svd__n_components=[10, 20, 100],
                  clf__kernel=['linear', 'rbf'],
                  clf__C=[1, 10, 100]
                 )

In [66]:
print("# Tuning hyper-parameters...", end="\n\n")

# Cross-validated exhaustive search over `param_grid` on the training half
clf = GridSearchCV(pipe, param_grid)
clf.fit(X_train, y_train)

print("Best parameters set found on development set:", end="\n\n")
print(clf.best_params_, end="\n\n")
print("Grid scores on development set:", end="\n\n")

# One line per parameter combination: mean CV score and a +/- 2 std band
search_results = clf.cv_results_
means = search_results['mean_test_score']
stds = search_results['std_test_score']
for params, mean, std in zip(search_results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

# Tuning hyper-parameters...

Best parameters set found on development set:

{'clf__C': 100, 'clf__kernel': 'linear', 'svd__n_components': 10, 'tfidf__sublinear_tf': 1}

Grid scores on development set:

0.964 (+/-0.078) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 10, 'tfidf__sublinear_tf': 0}
0.964 (+/-0.078) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 10, 'tfidf__sublinear_tf': 1}
0.987 (+/-0.005) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 20, 'tfidf__sublinear_tf': 0}
0.991 (+/-0.010) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 20, 'tfidf__sublinear_tf': 1}
0.991 (+/-0.018) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 100, 'tfidf__sublinear_tf': 0}
0.995 (+/-0.015) for {'clf__C': 1, 'clf__kernel': 'linear', 'svd__n_components': 100, 'tfidf__sublinear_tf': 1}
0.930 (+/-0.030) for {'clf__C': 1, 'clf__kernel': 'rbf', 'svd__n_components': 10, 'tfidf__sublinear_tf': 0}
0.930 (+/-0.030) for 

In [67]:
# Report per-class precision/recall/F1 for the held-out half first, then for
# the training half, using the refit best estimator from the grid search.
for heading, features, y_true in (
    ("Detailed classification report on test set:", X_test, y_test),
    ("Detailed classification report on training set", X_train, y_train),
):
    print(heading)
    print()
    y_pred = clf.predict(features)
    print(classification_report(y_true, y_pred))
    print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       160
      legal       0.99      1.00      1.00       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




## Analysis of Model 2
Given that Gradient Descent and Support Vector Machines are of the same family, it is not surprising that Model 2 results are similar to Model 1. Precision, Recall and the combined F1 score are all averaging 100%.

## Next Steps
I will explore a random forest approach in Model 3.