In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV

In [2]:
# Fetch the 'train' and 'test' subsets of the 20 newsgroups dataset.
newsgroup_train, newsgroup_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# .data feeds the text pipeline as X; .target is used as the label vector y.
X_train, y_train = newsgroup_train.data, newsgroup_train.target
X_test, y_test = newsgroup_test.data, newsgroup_test.target

In [8]:
# Classifiers to benchmark, keyed by the display name printed with the score.
# Uncomment an entry to include it in the run; every entry ends with a comma
# so that entries can be toggled independently without a SyntaxError.
dict_classifiers = {
#    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, weights='distance'),
#    "Linear SVM": SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42),
#    "Decision Tree": tree.DecisionTreeClassifier(),
    # random_state fixes the MLP's weight initialisation and batch sampling,
    # so re-running the notebook reproduces the reported accuracy.
    "Neural Net": MLPClassifier(alpha=1, random_state=42),
#    "Naive Bayes": MultinomialNB(),
}

In [10]:
%%time
for name, classifier in dict_classifiers.items():
    text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')), ('tfidf', TfidfTransformer()), ('clf', classifier)])
    text_clf = text_clf.fit(X_train, y_train)
    predicted = text_clf.predict(X_test)
    print("Accuracy of "+ name + " : "+str(np.mean(predicted == y_test)))

Accuracy of Neural Net : 0.650690387679
CPU times: user 21min 9s, sys: 5min 48s, total: 26min 58s
Wall time: 21min 25s


In [None]:
# Grid Search 

#KNeighborsClassifier
def knn_param_selection(X, y, nfolds):
    """Grid-search k-nearest-neighbours hyper-parameters on raw text documents.

    The documents are vectorised inside the cross-validated pipeline
    (CountVectorizer -> TfidfTransformer -> KNeighborsClassifier), so the
    function can be called directly with the raw training texts.

    Args:
        X: Iterable of raw text documents (e.g. ``newsgroup_train.data``).
        y: Class labels aligned with ``X``.
        nfolds: Passed to ``GridSearchCV`` as ``cv``.

    Returns:
        dict: The best parameter combination found (``best_params_``). Keys
        carry the ``clf__`` pipeline-step prefix.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier()),
    ])
    # Parameters of a pipeline step are addressed as "<step>__<param>".
    param_grid = {
        'clf__n_neighbors': np.arange(1, 31, 2),  # odd k from 1 to 29
        'clf__metric': ["euclidean", "cityblock"],
    }
    grid_search = GridSearchCV(pipeline, param_grid, cv=nfolds, n_jobs=-1)
    grid_search.fit(X, y)
    return grid_search.best_params_

# SVM
def svc_param_selection(X, y, nfolds):
    """Grid-search text-vectorisation settings for a linear SVM pipeline.

    Builds its own CountVectorizer -> TfidfTransformer -> SGDClassifier
    (hinge loss, i.e. a linear SVM) pipeline instead of relying on whatever
    ``text_clf`` / ``name`` happen to be left in the notebook namespace, and
    searches over the n-gram range and the use of IDF weighting.

    Args:
        X: Iterable of raw text documents.
        y: Class labels aligned with ``X``.
        nfolds: Passed to ``GridSearchCV`` as ``cv``.

    Returns:
        dict: The best parameter combination found (``best_params_``). The
        best score and parameters are also printed.
    """
    label = "Linear SVM"
    svm_pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, random_state=42)),
    ])
    # Parameters of a pipeline step are addressed as "<step>__<param>".
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
    }
    gs_clf = GridSearchCV(svm_pipeline, parameters, cv=nfolds, n_jobs=-1)
    gs_clf.fit(X, y)
    print("Best score of " + label + " : " + str(gs_clf.best_score_))
    print("Best params of " + label + " : " + str(gs_clf.best_params_))
    return gs_clf.best_params_

# Decision Tree
def dt_param_selection(X, y, nfolds):
    """Grid-search the maximum depth of a decision tree on raw text documents.

    The documents are vectorised inside the cross-validated pipeline
    (CountVectorizer -> TfidfTransformer -> DecisionTreeClassifier), so the
    function can be called directly with the raw training texts.

    Args:
        X: Iterable of raw text documents (e.g. ``newsgroup_train.data``).
        y: Class labels aligned with ``X``.
        nfolds: Passed to ``GridSearchCV`` as ``cv``.

    Returns:
        dict: The best parameter combination found (``best_params_``). Keys
        carry the ``clf__`` pipeline-step prefix.
    """
    pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', tree.DecisionTreeClassifier()),
    ])
    # Parameters of a pipeline step are addressed as "<step>__<param>".
    param_grid = {'clf__max_depth': np.arange(3, 10)}  # depths 3..9
    grid_search = GridSearchCV(pipeline, param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
# Run the k-NN hyper-parameter search on the training split with nfolds=10.
knn_param_selection(X=X_train, y=y_train, nfolds=10)