In [255]:
from pprint import pprint
from time import time
import logging
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### Load 20newsgroups dataset

In [18]:
print("Loading 20 newsgroups dataset for categories:")
pprint(data)

Loading 20 newsgroups dataset for categories:
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [39]:
categories = fetch_20newsgroups(subset='train').target_names
data = fetch_20newsgroups(subset='train', categories = categories)
data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

### Pipeline with SGD Classifier algorithm and Bag of Words, TF-IDF

In [57]:
sgdc_pipeline = Pipeline(
[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
('clf', SGDClassifier())
])

### GridSearchCV for best hyperparameters

In [58]:
sgdc_param_grid = {
'vect__max_df': (0.5, 0.75, 1.0),
'vect__ngram_range': ((1, 1), (1, 2)),
'tfidf__norm': ('l1', 'l2'),
'tfidf__use_idf': (True, False),
'clf__max_iter': (20,),
'clf__penalty': ('l2', 'elasticnet')
}

In [59]:
sgdc_grid = GridSearchCV(sgdc_pipeline, sgdc_param_grid, cv=5, verbose=1, n_jobs=1)

In [60]:
sgdc = sgdc_grid.fit(data.data, data.target)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 44.4min finished


### Best Score

In [61]:
sgdc.best_score_

0.9289376675406815

### Best Parameter

In [62]:
sgdc.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
  ('tfidf', TfidfTransformer()),
  ('clf', SGDClassifier(max_iter=20))],
 'verbose': False,
 'vect': CountVectorizer(max_df=0.75, ngram_range=(1, 2)),
 'tfidf': TfidfTransformer(),
 'clf': SGDClassifier(max_iter=20),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.75,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'clf__alpha': 0.0001,
 'clf__average': False,
 'clf__class_weight': None,
 'clf__early_stopping': False,
 'c

### SGD Classifier best model hyperparameters

In [118]:
sgdc.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.75, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(max_iter=20))])

### Other Algorithms

#### Random Forest Classifier

In [144]:
rfc_pipeline = Pipeline(
[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('rfc', RandomForestClassifier())
])

In [150]:
rfc_param_grid = {
'vect__max_df': (0.5, 0.75, 1.0),
'vect__ngram_range': ((1, 1), (1, 2)),
'tfidf__norm': ('l1', 'l2'),
'tfidf__use_idf': (True, False),
'rfc__n_estimators': (10, 50),
'rfc__max_depth': (5, 10)
}

In [151]:
rfc_grid = GridSearchCV(rfc_pipeline, rfc_param_grid, cv=5, verbose=1, n_jobs=1)

In [152]:
rfc = rfc_grid.fit(data.data, data.target)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 58.7min finished


In [153]:
rfc.best_score_

0.6712039252137079

In [154]:
rfc.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(max_df=0.75)),
  ('tfidf', TfidfTransformer(use_idf=False)),
  ('rfc', RandomForestClassifier(max_depth=10, n_estimators=50))],
 'verbose': False,
 'vect': CountVectorizer(max_df=0.75),
 'tfidf': TfidfTransformer(use_idf=False),
 'rfc': RandomForestClassifier(max_depth=10, n_estimators=50),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.75,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': False,
 'rfc__bootstrap': True,
 'rfc__ccp_alpha': 0.0,
 'rfc__class_weight

In [155]:
rfc.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.75)),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('rfc', RandomForestClassifier(max_depth=10, n_estimators=50))])

#### Decision Tree Classifier

In [164]:
dtc_pipeline = Pipeline(
[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('dtc', DecisionTreeClassifier())
])

In [165]:
dtc_param_grid = {
'vect__max_df': (0.5, 0.75, 1.0),
'vect__ngram_range': ((1, 1), (1, 2)),
'tfidf__norm': ('l1', 'l2'),
'tfidf__use_idf': (True, False),
'dtc__min_samples_split': (1, 5),
'dtc__max_depth': (5, 10)
}

In [166]:
dtc_grid = GridSearchCV(dtc_pipeline, dtc_param_grid, cv=5, verbose=1, n_jobs=1)

In [167]:
dtc = dtc_grid.fit(data.data, data.target)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\SURFACE\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\SURFACE\anaconda3\lib\site-packages\sklearn\pipeline.py", line 335, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\SURFACE\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 894, in fit
    X_idx_sorted=X_idx_sorted)
  File "C:\Users\SURFACE\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 231, in fit
    % self.min_samples_split)
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

Traceback (most recent call last):
  File "C:\Users\SURFACE\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **

In [169]:
dtc.best_score_

0.321196169650312

In [170]:
dtc.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(max_df=0.5)),
  ('tfidf', TfidfTransformer(use_idf=False)),
  ('dtc', DecisionTreeClassifier(max_depth=10, min_samples_split=5))],
 'verbose': False,
 'vect': CountVectorizer(max_df=0.5),
 'tfidf': TfidfTransformer(use_idf=False),
 'dtc': DecisionTreeClassifier(max_depth=10, min_samples_split=5),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.5,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': False,
 'dtc__ccp_alpha': 0.0,
 'dtc__class_weight': None,
 'dtc__crit

In [171]:
dtc.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5)),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('dtc',
                 DecisionTreeClassifier(max_depth=10, min_samples_split=5))])

#### KNearest Neighbors

In [178]:
knn_pipeline = Pipeline(
[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('knn', KNeighborsClassifier())
])

In [179]:
knn_param_grid = {
'vect__max_df': (0.5, 0.75, 1.0),
'vect__ngram_range': ((1, 1), (1, 2)),
'tfidf__norm': ('l1', 'l2'),
'tfidf__use_idf': (True, False),
'knn__n_neighbors': (5, 10),
'knn__weights': ('uniform', 'distance')
}

In [180]:
knn_grid = GridSearchCV(knn_pipeline, knn_param_grid, cv=5, verbose=1, n_jobs=1)

In [181]:
knn = knn_grid.fit(data.data, data.target)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 68.4min finished


In [182]:
knn.best_score_

0.8134173981706248

In [183]:
knn.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(max_df=0.5)),
  ('tfidf', TfidfTransformer()),
  ('knn', KNeighborsClassifier(weights='distance'))],
 'verbose': False,
 'vect': CountVectorizer(max_df=0.5),
 'tfidf': TfidfTransformer(),
 'knn': KNeighborsClassifier(weights='distance'),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.5,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': 

In [184]:
knn.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5)),
                ('tfidf', TfidfTransformer()),
                ('knn', KNeighborsClassifier(weights='distance'))])

#### MultinomialNB

In [186]:
nb_pipeline = Pipeline(
[('vect', CountVectorizer()),
 ('tfidf', TfidfTransformer()),
 ('nb', MultinomialNB())
])

In [187]:
nb_param_grid = {
'vect__max_df': (0.5, 0.75, 1.0),
'vect__ngram_range': ((1, 1), (1, 2)),
'tfidf__norm': ('l1', 'l2'),
'tfidf__use_idf': (True, False),
'nb__alpha': (1.0, 2.0),
'nb__fit_prior': (True, False)
}

In [188]:
nb_grid = GridSearchCV(nb_pipeline, nb_param_grid, cv=5, verbose=1, n_jobs=1)

In [189]:
nb = nb_grid.fit(data.data, data.target)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed: 56.4min finished


In [190]:
nb.best_score_

0.8744032807009935

In [191]:
nb.best_estimator_.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
  ('tfidf', TfidfTransformer()),
  ('nb', MultinomialNB(fit_prior=False))],
 'verbose': False,
 'vect': CountVectorizer(max_df=0.5, ngram_range=(1, 2)),
 'tfidf': TfidfTransformer(),
 'nb': MultinomialNB(fit_prior=False),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.int64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.5,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2),
 'vect__preprocessor': None,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__vocabulary': None,
 'tfidf__norm': 'l2',
 'tfidf__smooth_idf': True,
 'tfidf__sublinear_tf': False,
 'tfidf__use_idf': True,
 'nb__alpha': 1.0,
 'nb__class_prior': None,
 'nb__fit_prior': False}

In [234]:
nb.best_estimator_

Pipeline(steps=[('vect', CountVectorizer(max_df=0.5, ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer()),
                ('nb', MultinomialNB(fit_prior=False))])

### Sample classification using SGD Classifier model

In [204]:
sgdc_vect = CountVectorizer(max_df=0.75, ngram_range=(1, 2))
sgdc_tfidf = TfidfTransformer()

In [205]:
text_vect = sgdc_vect.fit_transform(data.data)
text_tfidf = sgdc_tfidf.fit_transform(text_vect)

In [220]:
sgdc_best = SGDClassifier(max_iter=20).fit(text_vect, data.target)



In [219]:
text_piece = ['It was a 2-door sports car, looked to be from the late 60s']

text_vect1 = sgdc_vect.transform(text_piece)
text_tfidf1 = sgdc_tfidf.transform(text_vect1)

predicted = sgdc_best.predict(text_tfidf1)

for text, category in zip(text_piece, predicted):
    print(f" Text: {text} , Category = {data.target_names[category]}")

 Text: It was a 2-door sports car, looked to be from the late 60s , Category = rec.autos


### Benchmark Models

In [296]:
model = ['SGDClassifier', 'MultinomialNB', 'KNearest Neighbors', 'Random Forest Classifier', ' Decision Tree Classifier']
model = pd.DataFrame(model)

model_feature_extractor = [sgdc.best_estimator_, nb.best_estimator_, knn.best_estimator_, rfc.best_estimator_, dtc.best_estimator_]
model_feature_extractor = np.array(model_feature_extractor)
model_feature_extractor = pd.DataFrame(model_feature_extractor)

model_accuracy = [sgdc.best_score_, nb.best_score_, knn.best_score_, rfc.best_score_, dtc.best_score_]
model_accuracy = pd.DataFrame(model_accuracy)

model_list = pd.concat([model, model_feature_extractor, model_accuracy], axis=1)
model_list.columns = ['Algorithm', 'Feature Extractor Count', 'Feature Extractor TF-IDF', 'Algorithm Parameters', 'Accuracy']

model_list

  """
