## Part 3: Tuning and Model Comparison

In [1]:

import pandas as pd
import numpy as np
import time
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

Your result will look like the following.

In [2]:
%%capture
'''
tuning naive bayes...
alpha  score
 0.00  0.146634
 0.02  0.837728
 0.04  0.840571
 0.06  0.839864
 0.08  0.834170
 0.10  0.834167
 0.20  0.804260
 0.30  0.787169
 0.40  0.767946
 0.50  0.755127
 0.60  0.740888
 0.70  0.731638
 0.80  0.720959
 0.90  0.704589
 1.00  0.691061
 1.10  0.683228
 1.20  0.674684
running models...
                Name   Score TrainTime  TestTime
       Random Forest   0.730      1.49      0.06
       Decision Tree   0.608     16.31      0.03
                 kNN   0.798      1.89     24.69
         Naive Bayes   0.750      0.59      0.06
                 SVM   0.807     97.71     34.25
            Logistic   0.864      2.07      0.39
'''

In [3]:
def get_data():
    newsgroups_train = fetch_20newsgroups(subset='train', categories=['alt.atheism', 
    'sci.space', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns'])
    data = newsgroups_train.data
    labels = newsgroups_train.target
    le = LabelEncoder()
    y = le.fit_transform(labels)
    return np.array(data), np.array(y)

In [4]:
def tune_naive_bayes(data, y):
    print("tuning naive bayes...")
    kfold = KFold(5)
    alphas = np.concatenate((np.arange(0, 0.1, 0.02), np.arange(.1, 1.3, 0.1)))
    scores = defaultdict(list)
    for train_index, test_index in kfold.split(data):
        data_train, data_test = data[train_index], data[test_index]
        y_train, y_test = y[train_index], y[test_index]
        tfidf = TfidfVectorizer()
        X_train = tfidf.fit_transform(data_train)
        print(X_train.shape)
        X_test = tfidf.transform(data_test)
        for alpha in alphas:
            nb = MultinomialNB(alpha=alpha)
            nb.fit(X_train, y_train)
            scores[alpha].append(nb.score(X_test, y_test))

    print("alpha  score")
    for alpha in alphas:
        print(" %.2f  %f" % (alpha, np.average(scores[alpha])))

In [5]:
def run_models(data, y):
    data_train, data_test, y_train, y_test = train_test_split(data, y)

    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(data_train).toarray()
    X_test = tfidf.transform(data_test).toarray()

    print("running models...")
    models = [("Random Forest", RandomForestClassifier()),
              ("Decision Tree", DecisionTreeClassifier()),
              ("kNN", KNeighborsClassifier()),  
              ("Naive Bayes", MultinomialNB()),
              ("SVM", OneVsRestClassifier(SVC())),
              ("Logistic", OneVsRestClassifier(LogisticRegression()))]

    print("%20s %7s %9s %9s" % ("Name", "Score", "TrainTime", "TestTime"))

    for name, model in models:
        start = time.time()
        model.fit(X_train, y_train)
        trained = time.time()
        score = model.score(X_test, y_test)
        tested = time.time()

        # Silly stuff to make it print nicely
        print("%20s   %.3f %9s %9s" % (name, score,
                                       str(round(trained - start, 2)),
                                       str(round(tested - trained, 2))))

### Run the whole program

In [37]:
data, y = get_data()
tune_naive_bayes(data, y)
run_models(data, y)

tuning naive bayes...
(858, 20415)
(858, 20019)
(858, 20094)
(859, 20119)
(859, 20207)
alpha  score
 0.00  0.991619
 0.02  0.994414
 0.04  0.994414
 0.06  0.994414
 0.08  0.995344
 0.10  0.995344
 0.20  0.995344
 0.30  0.995344
 0.40  0.995344
 0.50  0.995344
 0.60  0.995344
 0.70  0.995344
 0.80  0.995344
 0.90  0.995344
 1.00  0.995344
 1.10  0.995344
 1.20  0.995344
running models...
                Name   Score TrainTime  TestTime
       Random Forest   0.974      2.31      0.07
       Decision Tree   0.929      0.94      0.01
                 kNN   0.989      1.45      9.18
         Naive Bayes   0.993      0.15      0.01
                 SVM   0.985     19.81      6.17
            Logistic   0.981      0.64      0.01
