In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import GridSearchCV
from time import time

In [2]:
# Fetch the predefined 'train' and 'test' subsets of the 20 Newsgroups corpus.
newsgroup_train = fetch_20newsgroups(subset='train')
newsgroup_test = fetch_20newsgroups(subset='test')

# Split each subset into its inputs (`.data`) and labels (`.target`).
X_train, y_train = newsgroup_train.data, newsgroup_train.target
X_test, y_test = newsgroup_test.data, newsgroup_test.target

In [3]:
# Candidate classifiers to benchmark, keyed by the label used when reporting
# results. Insertion order is the order in which they are evaluated.
#
# Every estimator that accepts `random_state` is seeded with the same value so
# that re-running the notebook reproduces the same scores; KNeighborsClassifier
# and MultinomialNB take no seed.
dict_classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, weights='distance'),
    "Linear SVM": SGDClassifier(random_state=42),
    "Decision Tree": tree.DecisionTreeClassifier(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Neural Net": MLPClassifier(alpha=1, random_state=42),
}

In [9]:
# Benchmark every candidate: fit on the training split, predict on the test
# split, then report accuracy and the wall-clock duration of each phase.
for name, classifier in dict_classifiers.items():
    # Token counts (English stop words removed) -> TF-IDF weighting -> model.
    pipeline_steps = [
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]
    text_clf = Pipeline(pipeline_steps)

    # Time the fit on its own.
    start_training = time()
    text_clf = text_clf.fit(X_train, y_train)
    end_training = time()

    # Time the prediction on its own.
    start_testing = time()
    predicted = text_clf.predict(X_test)
    end_testing = time()

    # Accuracy is the fraction of test items whose prediction equals the label.
    accuracy = np.mean(predicted == y_test)
    training_seconds = end_training - start_training
    testing_seconds = end_testing - start_testing

    print("Accuracy of "+ name + " : "+str(accuracy))
    print("Training time of "+ name + " : "+str(training_seconds))
    print("Testing time of "+ name + " : "+str(testing_seconds))

Accuracy of Nearest Neighbors : 0.694370685077
Training time of Nearest Neighbors : 4.362587213516235
Testing time of Nearest Neighbors : 13.058164119720459




Accuracy of Linear SVM : 0.84824747743
Training time of Linear SVM : 4.500420093536377
Testing time of Linear SVM : 2.20298171043396


KeyboardInterrupt: 

In [None]:
# Grid Search 

#KNeighborsClassifier
def knn_param_selection(X, y, nfolds):
    """Grid-search k-nearest-neighbours hyperparameters on raw text documents.

    The search runs over a CountVectorizer -> TfidfTransformer ->
    KNeighborsClassifier pipeline, because KNeighborsClassifier cannot be
    fitted on raw strings directly.

    Parameters
    ----------
    X : sequence of str
        Raw text documents to fit on.
    y : array-like
        Class label for each document in ``X``.
    nfolds : int
        Passed to ``GridSearchCV`` as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. Keys carry the ``clf__`` prefix
        of the pipeline step they belong to (``clf__n_neighbors``,
        ``clf__metric``).
    """
    knn_pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', KNeighborsClassifier()),
    ])
    # Odd neighbour counts 1, 3, ..., 29. The 'clf__' prefix routes each
    # parameter to the pipeline step named 'clf'.
    param_grid = {
        'clf__n_neighbors': np.arange(1, 31, 2),
        'clf__metric': ["euclidean", "cityblock"],
    }
    grid_search = GridSearchCV(knn_pipeline, param_grid, cv=nfolds, n_jobs=-1)
    # Fit inline: the previously referenced `search_best_params` helper is not
    # defined in this notebook, so calling it raised NameError.
    grid_search.fit(X, y)
    return grid_search.best_params_

# SVM
def svc_param_selection(X, y, nfolds):
    """Grid-search text-preprocessing options for the linear SVM pipeline.

    Builds its own CountVectorizer -> TfidfTransformer -> SGDClassifier
    pipeline (the estimator this notebook labels "Linear SVM") instead of
    reading whatever pipeline a previous cell happened to leave in the global
    ``text_clf``, and fits on the arguments rather than on the global
    training split.

    Parameters
    ----------
    X : sequence of str
        Raw text documents to fit on.
    y : array-like
        Class label for each document in ``X``.
    nfolds : int
        Passed to ``GridSearchCV`` as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. The best score and parameters
        are also printed.
    """
    svm_pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(random_state=42)),
    ])
    # Compare unigrams against unigrams+bigrams, with and without IDF weighting.
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False)}
    gs_clf = GridSearchCV(svm_pipeline, parameters, cv=nfolds, n_jobs=-1)
    gs_clf.fit(X, y)
    # Fixed label: the global `name` used before held whichever classifier the
    # benchmark loop visited last, not necessarily this one.
    print("Best score of Linear SVM : " + str(gs_clf.best_score_))
    print("Best params of Linear SVM : " + str(gs_clf.best_params_))
    return gs_clf.best_params_

# Decision Tree
def dt_param_selection(X, y, nfolds):
    """Grid-search the maximum depth of a decision tree on raw text documents.

    The search runs over a CountVectorizer -> TfidfTransformer ->
    DecisionTreeClassifier pipeline, because the tree cannot be fitted on raw
    strings directly.

    Parameters
    ----------
    X : sequence of str
        Raw text documents to fit on.
    y : array-like
        Class label for each document in ``X``.
    nfolds : int
        Passed to ``GridSearchCV`` as ``cv`` (number of cross-validation folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, keyed as ``clf__max_depth``.
    """
    dt_pipeline = Pipeline([
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        # Seeded so repeated searches compare the same trees.
        ('clf', tree.DecisionTreeClassifier(random_state=42)),
    ])
    # Depths 3 through 9. The 'clf__' prefix routes the parameter to the
    # pipeline step named 'clf'.
    param_grid = {'clf__max_depth': np.arange(3, 10)}
    grid_search = GridSearchCV(dt_pipeline, param_grid, cv=nfolds)
    # Fit inline: the previously referenced `search_best_params` helper is not
    # defined in this notebook, so calling it raised NameError.
    grid_search.fit(X, y)
    return grid_search.best_params_

In [None]:
# Run the k-NN hyperparameter search on the training split with 10 folds; as
# the cell's last expression, the return value is displayed.
knn_param_selection(X=X_train, y=y_train, nfolds=10)