In [1]:
%matplotlib inline
import pandas as pd
import os
import numpy as np
import seaborn as sns
import glob
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
NEWLINE = '\n'
SKIP_FILES = {'cmds'}


def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="ANSI")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield content
                    


In [3]:
path =r'C:\Users\osutr_000\Documents\Data\Ops'
list_ = []

for value in read_files(path):
    list_.append(value)

ops_df = pd.DataFrame(data = list_)
ops_df['class']="ops"
len(ops_df)

580

In [4]:
path =r'C:\Users\osutr_000\Documents\Data\Legal'
list_ = []

for value in read_files(path):
    list_.append(value)

legal_df = pd.DataFrame(data = list_)
legal_df['class']="legal"
len(legal_df)

209

In [5]:
path =r'C:\Users\osutr_000\Documents\Data\Accounting'
list_ = []

for value in read_files(path):
    list_.append(value)

act_df = pd.DataFrame(data = list_)
act_df['class']="accounting"
len(act_df)

324

In [6]:
merged_df = ops_df.append(legal_df).append(act_df)
len(merged_df)
merged_df.columns = ['text', 'cat']
merged_df.sort_index().reset_index().head()

Unnamed: 0,index,text,cat
0,0,"401 W. 33rd \n\nEdmond, OK 73013 \n\nWWW.MIDCO...",ops
1,0,26938 \n\n\n\n002 \n\n\n\n4/27/16 290.60 290. ...,accounting
2,0,"MIDCON Data Services, LLC. \n\n\n\nThis Data D...",legal
3,1,5/25/16 179.17 179.17 \n\n\n\n\n\n5/27/16 0160...,accounting
4,1,,legal


In [7]:
def make_xy(merged_df, vectorizer=None):
    #Your code here    
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(merged_df.text)
    X = X.tocsc()  # some versions of sklearn return COO format
    y = (merged_df.cat == 'ops').values.astype(np.int)
    return X, y
X, y = make_xy(merged_df)


In [8]:
y.sum()

580

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
classi = MultinomialNB().fit(xtrain, ytrain)
accuracy_test = classi.score(xtest, ytest)
accuracy_train = classi.score(xtrain, ytrain)
print("Accuracy of test set")
print(accuracy_test)
print("Accuracy of training set")
print(accuracy_train)

Accuracy of test set
0.939068100358
Accuracy of training set
0.924460431655


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
def make_xy_v2(merged_df, vectorizer=TfidfVectorizer()):
    if vectorizer is None:
        vectorizer = CountVectorizer()
    X2 = vectorizer.fit_transform(merged_df.text)
    X2 = X.tocsc()  # some versions of sklearn return COO format
    y2 = (merged_df.cat == 'ops').values.astype(np.int)
    return X, y
X2, y2 = make_xy_v2(merged_df)

In [11]:
xtrain, xtest, ytrain, ytest = train_test_split(X2, y2)
classi = MultinomialNB().fit(xtrain, ytrain)
accuracy_test_tfidf = classi.score(xtest, ytest)
accuracy_train_tfidf = classi.score(xtrain, ytrain)

print("Accuracy of test set")
print(accuracy_test_tfidf)
print("Accuracy of training set")
print(accuracy_train_tfidf)

Accuracy of test set
0.94982078853
Accuracy of training set
0.920863309353


In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from __future__ import print_function
from pprint import pprint
from time import time
import logging

In [19]:
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    #'clf__max_iter': (5,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(merged_df.text, merged_df.cat)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__penalty': ('l2', 'elasticnet'),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed:  4.1min finished


done in 249.947s

Best score: 1.000
Best parameters set:
	clf__alpha: 1e-06
	clf__penalty: 'l2'
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__ngram_range: (1, 1)
