## A Multi-class Classification problem

# Model Set-up

In [2]:
%matplotlib inline
import pandas as pd
import os
import numpy as np
import sklearn


In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()




In [4]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [5]:
NEWLINE = '\n'
SKIP_FILES = {'cmds'}


def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="ANSI")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == NEWLINE:
                            past_header = True
                    f.close()
                    content = NEWLINE.join(lines)
                    yield content
                    


# PreProcessing work
Loop through each text files acquired from OCR and scrub the data

We will then zip the list into a DataFrame.




In [6]:
#Training set is pre-classified according to subfolder subfolder
path =r'C:\Users\osutr_000\Documents\Data\Ops'
list_o = []

for text in read_files(path):
    # tokenize the text
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    # Remove stop words known in nltk package
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    # Use word stemmeing to get root meaning
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    # Concatenate words into one string and then add to list_
    final = " ".join(intermediate)
    list_o.append(final)
ops_df = pd.DataFrame(data = list_o)
ops_df['class']="ops"
len(ops_df)

580

In [7]:
path =r'C:\Users\osutr_000\Documents\Data\Legal'
list_l = []

for text in read_files(path):
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_l.append(final)
legal_df = pd.DataFrame(data = list_l)
legal_df['class']="legal"
len(legal_df)

209

In [8]:
path =r'C:\Users\osutr_000\Documents\Data\Accounting'
list_a = []

for text in read_files(path):
    tokenizer = RegexpTokenizer(r'\w+')
    intermediate = tokenizer.tokenize(text)
    stop = stopwords.words('english')
    intermediate = [i for i in intermediate if i not in stop]
    lanste = LancasterStemmer()
    intermediate = [lanste.stem(i) for i in intermediate]
    final = " ".join(intermediate)
    list_a.append(final)
accounting_df = pd.DataFrame(data = list_a)
accounting_df['class']="accounting"
len(accounting_df)

324

In [9]:
merged_df = ops_df.append(legal_df).append(accounting_df)
len(merged_df)

1113

In [10]:
# There are 1113 total training examples:
# 580 Ops
# 209 Legal
# 324 Accounting

In [11]:
merged_df.columns = ['text', 'cat']
merged_df.sort_index().reset_index().head()

Unnamed: 0,index,text,cat
0,0,401 w 33rd edmond ok 73013 www midcondat com 4...,ops
1,0,26938 002 4 27 16 290 60 290 60 5 4 16 26938 d...,accounting
2,0,midcon dat serv llc thi dat destruct agr mad e...,legal
3,1,5 25 16 179 17 179 17 5 27 16 01605 05 republ ...,accounting
4,1,,legal


In [12]:
# You can see how the processed data is now structured.

In [13]:
# Now we will flatten the data into (sample, feature) matrices
X = merged_df.text
y = merged_df.cat

# and then split the dataset into two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Model 1
Here are some of the model features seen in this section:
* CountVectorizer implements both tokenization and occurrence counting in a single class

* TfidfTransformer transforms a count matrix to a normalized tf or tf-idf representation. 
The goal of using tf-idf is to scale down the impact of tokens that occur very frequently in a given corpus 
that are hence empirically less informative than features that occur in a small fraction of the training corpus.

* SGDCClassifier (Stochastic Gradient Decent) has gain popularity in NLP applications for it's scalability.
SGD has a lot of parameters and is sensitive to feature scaling 


In [14]:
# Define a pipeline with a text feature extractor and classifier:

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(max_iter=1000, tol=.0001)),
])

# hyper-parameters
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    #'vect__max_features': (None, 5000, 10000, 50000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    #'clf__max_iter': (5,),
    'clf__alpha': (0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    'clf__n_iter': (10, 50, 80),
}


In [15]:
print("# Tuning hyper-parameters...")
print()

clf1 = GridSearchCV(pipeline, parameters)
clf1.fit(X = X_train, y = y_train)

print()
print("Grid scores on development set:")
print()
means = clf1.cv_results_['mean_test_score']
stds = clf1.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf1.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


















Grid scores on development set:

0.993 (+/-0.014) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 0.5}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 0.75}
0.993 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': True, 'vect__max_df': 1.0}
0.995 (+/-0.015) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.5}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.75}
0.996 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 1.0}
0.998 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.5}
0.998 (+/-0.005) for {'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tf

0.996 (+/-0.010) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.5}
0.995 (+/-0.000) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 0.75}
0.998 (+/-0.005) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l1', 'tfidf__use_idf': False, 'vect__max_df': 1.0}
0.996 (+/-0.010) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.5}
0.998 (+/-0.005) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 0.75}
0.998 (+/-0.005) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l2', 'tfidf__use_idf': True, 'vect__max_df': 1.0}
0.996 (+/-0.010) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__max_df': 0.5}
0.998 (+/-0.005) for {'clf__alpha': 1e-06, 'clf__n_iter': 80, 'tfidf__norm': 'l2', 'tfidf__use_idf

In [16]:
print("Best parameters set found on development set:")
print(clf1.best_params_)
print()
print("Best score on development set:")
print(clf1.best_score_)

Best parameters set found on development set:
{'clf__alpha': 1e-05, 'clf__n_iter': 10, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__max_df': 0.5}

Best score on development set:
1.0


In [17]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf1.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf1.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      0.99      0.99       160
      legal       0.96      1.00      0.98       106
        ops       1.00      0.99      1.00       291

avg / total       0.99      0.99      0.99       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




# Analysis of Model 1
As seen in the detailed classification report, both the precision and recall of the model are near 100%,
and therefore the harmonic mean (f-score) is also 100%.This indicates a near perfect model fit to the dataset. 

Precision is the number of true positives over the number of true positives plus false positives. 
Recall is the number of true positives over the number of true positives plus false negatives

## Model 2
In Model 2 a different vectorizer, HashingVectorizer, is used in the pipeline in order to evaluate CountVectorizer. The HashingVectorizer will convert the text documents to a matrix of token occurrences by using a hashing trick to find the token string name to feature integer index mapping.

In [18]:
# Define a pipeline with a text feature extractor and classifier:

pipeline = Pipeline([
    ('vect', HashingVectorizer()),
    ('clf', SGDClassifier()),
])

# hyper-parameters
parameters = {
    'vect__norm': ('l1','l2',None),
    'vect__alternate_sign': [0,1],
    'clf__max_iter': (5,),
    #'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}


In [19]:
print("# Tuning hyper-parameters...")
print()

clf2 = GridSearchCV(pipeline, parameters)
clf2.fit(X_train, y_train)

print()
print("Grid scores on development set:")
print()
means = clf2.cv_results_['mean_test_score']
stds = clf2.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf2.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


Grid scores on development set:

0.991 (+/-0.010) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': 'l1'}
0.998 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': 'l2'}
0.998 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': None}
0.995 (+/-0.000) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': 'l1'}
0.998 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': 'l2'}
0.993 (+/-0.010) for {'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 1, 'vect__norm': None}
0.991 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'elasticnet', 'vect__alternate_sign': 0, 'vect__norm': 'l1'}
0.998 (+/-0.005) for {'clf__max_iter': 5, 'clf__penalty': 'elasticnet', 'vect__alternate_sign': 0, 'vect__norm': 'l2'}
0.998 (+/-0.005) for {'clf__max_

In [20]:
print("Best parameters set found on development set:")
print(clf2.best_params_)
print()
print("Best score on development set:")
print(clf2.best_score_)

Best parameters set found on development set:
{'clf__max_iter': 5, 'clf__penalty': 'l2', 'vect__alternate_sign': 0, 'vect__norm': 'l2'}

Best score on development set:
0.998201438849


In [21]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf2.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf2.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      0.99      1.00       160
      legal       0.98      1.00      0.99       106
        ops       1.00      1.00      1.00       291

avg / total       1.00      1.00      1.00       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




# Results from Model 2
Model 2 score results are either identical or slightly lower than Model 1. It is difficult to see which vectorizer is more efficient, and further analysis will be needed to in order to confidently say which vectorizer is better. 

One benfit of using the HashingVectorizer is that it scales very well since it does not have memory. However, there is no way to compute the inverse transform (from feature indices to string feature names) which can be a problem when trying to introspect which features are most important to a model. 


# Model 3
In Model 3 I will apply a new classifier, Support Vector Machine (SVM). SVM is one of the oldest AI algorithms, and is the basis for neural networks. SVC does not care about the 'perfect' point, instead it wants the 'ugliest' point that still classifies. You can control the kernel type, gamma variable (y), and slack variable (c) 

In addition, I will resume using the CountVectorizer and TfidfTransformer, which are combined in TfidfVectorizer. Since it will help with feature analysis in the sections to come. 


In [22]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC()),
])

param_grid = dict(tfidf__sublinear_tf=[0,1],
                  tfidf__smooth_idf=[0,1],
                  tfidf__norm =['l1','l2',None],
                  clf__kernel=['linear','rbf'],
                  clf__C=[1, 10, 100]
                 )

In [23]:
print("# Tuning hyper-parameters...")
print()

clf = GridSearchCV(pipe, param_grid)
clf.fit(X_train, y_train)


print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

# Tuning hyper-parameters...


Grid scores on development set:

0.520 (+/-0.003) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.520 (+/-0.003) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}
0.520 (+/-0.003) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.520 (+/-0.003) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 1}
0.989 (+/-0.023) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.995 (+/-0.015) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}
0.989 (+/-0.015) for {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.995 (+/-0.015) fo

0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l1', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 1}
0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}
0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 0}
0.520 (+/-0.003) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 1, 'tfidf__sublinear_tf': 1}
0.914 (+/-0.009) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': None, 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 0}
0.973 (+/-0.009) for {'clf__C': 100, 'clf__kernel': 'rbf', 'tfidf__norm': None, 'tfidf__sm

In [24]:
print("Best parameters set found on development set:")
print(clf.best_params_)
print()
print("Best score on development set:")
print(clf.best_score_)

Best parameters set found on development set:
{'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__norm': 'l2', 'tfidf__smooth_idf': 0, 'tfidf__sublinear_tf': 1}

Best score on development set:
0.994604316547


In [25]:
print("Detailed classification report on test set:")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

print("Detailed classification report on training set")
print()
y_true, y_pred = y_train, clf.predict(X_train)
print(classification_report(y_true, y_pred))
print()

Detailed classification report on test set:

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       160
      legal       0.97      1.00      0.99       106
        ops       1.00      0.99      0.99       291

avg / total       0.99      0.99      0.99       557


Detailed classification report on training set

             precision    recall  f1-score   support

 accounting       1.00      1.00      1.00       164
      legal       1.00      1.00      1.00       103
        ops       1.00      1.00      1.00       289

avg / total       1.00      1.00      1.00       556




## Analysis of Model 3
The SVM classifier is performing well, but slightly less accurate than the SGDC classifier. Precision, Recall and the combined F1 score are all averaging 99%.

## Frequency analysis
What are the most common tokens, and which tokens are influencing the model?

In [26]:
corpus_l = " ".join(list_l)
corpus_l = nlp(corpus_l)

In [27]:
token_text = [token.orth_ for token in corpus_l]
token_pos = [token.pos_ for token in corpus_l]
token_entity_type = [token.ent_type_ for token in corpus_l]
token_shape = [token.shape_ for token in corpus_l]
token_prob = [token.prob for token in corpus_l]

token_df = pd.DataFrame(list(zip(token_text, token_pos, token_entity_type, token_shape, token_prob)),
             columns=['token_text', 'part_of_speech','entity_type', 'token_shape', 'log_probability'])
token_df.head(10)

Unnamed: 0,token_text,part_of_speech,entity_type,token_shape,log_probability
0,midcon,NOUN,,xxxx,-20.0
1,dat,NOUN,,xxx,-20.0
2,serv,NOUN,,xxxx,-20.0
3,llc,NOUN,,xxx,-20.0
4,thi,NOUN,,xxx,-20.0
5,dat,NOUN,,xxx,-20.0
6,destruct,NOUN,,xxxx,-20.0
7,agr,NOUN,,xxx,-20.0
8,mad,ADJ,,xxx,-20.0
9,ent,NOUN,,xxx,-20.0


In [28]:
def get_feature_df(grid_search, features):
    """
    Return the feature names and coefficients from the final classifier of the
    best pipeline found by GridSearchCV. See https://git.io/vPWLI.

    Params
    ------
    grid_search: GridSearchCV object
        A post-fit GridSearchCV object where the estimator is a Pipeline.
    features: list
        initial feature names

    Returns
    -------
    pandas.DataFrame
        Dataframe of feature name and coefficient values
    """
    features = np.array(features)
    pipeline = grid_search.best_estimator_
    for name, transformer in pipeline.steps:
        if name.startswith('select'):
            X_index = np.arange(len(features)).reshape(1, -1)
            indexes = transformer.transform(X_index).tolist()
            features = features[indexes]
    step_name, classifier = pipeline.steps[-1]
    coefficients = classifier.coef_
    feature_df = pd.DataFrame(list(zip(features, coefficients)), columns=['feature', 'coefficient'])
    return feature_df

In [43]:
cv = CountVectorizer()
tfidf = TfidfTransformer()
clf = SGDClassifier()
counts = cv.fit_transform(X_train, y_train)
inverse = tfidf.fit_transform(counts)
clf.fit(inverse, y_train)

#df = get_feature_df(clf1,cv.get_feature_names())
#df



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [31]:
print (len(cv.vocabulary_))
print (cv.get_feature_names())



17274


['00', '000', '0000', '000000001', '000001', '0000010005', '000001310', '000002', '0000030003', '000005819', '000009508', '00001010442', '00001010445', '000010331', '000012172', '000012380', '000012590', '00001630439', '00001690620000600007', '0000175', '0000179', '0000183', '00001910985', '00002', '00002100003', '0000218426', '00002244320001100002', '00002957320001100005', '00003', '00004209070000600005', '0000420907q000600005', '0000443404', '0000443404000', '00004434040000600005', '0000443404000p600005', '00004434040q00600005', '00005', '000051', '00005624730000400010', '00005907960000300007', '00005909130000300007', '00006543290000300007', '00006554140000300007', '00006555540000300007', '00009', '000099253000048326000000055539', '0000lb', '0001', '000105659', '00011', '0001112280', '000116580', '0001172200', '0001172280', '0001441', '0002', '000201l3', '000225l3', '000258l3', '000273l3', '000284253', '000292l3', '000294982', '000295l3', '0003', '000301l3', '000303l3', '000312l3', '

In [44]:
    feature_names = cv.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:10]
    topClass2 = coefs_with_fns[:-(10 + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

Class 1 best: 
(-2.8642400220731195, 'item')
(-2.2997224002322563, 'agr')
(-2.0929150856120784, 'stor')
(-1.9305386225483963, '2017')
(-1.8574002487446613, 'iiii')
(-1.7846928254749626, '0621385')
(-1.7061929400109046, 'forb')
(-1.6899368238785064, 'shal')
(-1.5856396846257172, 'pick')
(-1.5318195904286589, 'ye')
Class 2 best: 
(2.7360559944468541, '00')
(2.4283914784702256, '16')
(2.3386285524832284, 'serv')
(2.0295616797862652, 'windstream')
(1.9213143983659964, 'policy')
(1.8755388756666729, 'tax')
(1.8755257092815334, '2016')
(1.7113486448026585, 'invo')
(1.632431309059365, 'ok')
(1.5329317019567257, 'alarm')


In [52]:
coefs_with_fns[-10:]

[(1.5329317019567257, 'alarm'),
 (1.632431309059365, 'ok'),
 (1.7113486448026585, 'invo'),
 (1.8755257092815334, '2016'),
 (1.8755388756666729, 'tax'),
 (1.9213143983659964, 'policy'),
 (2.0295616797862652, 'windstream'),
 (2.3386285524832284, 'serv'),
 (2.4283914784702256, '16'),
 (2.7360559944468541, '00')]