# Classifying Project Gutenberg Texts by Author

Our goal is to classify texts from Project Gutenberg by author.

In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn import model_selection
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [2]:
# Load the per-text metadata table and the precomputed feature matrix.
# NOTE: joblib.load unpickles its input, so only point it at files you trust.
author_title_df = pd.read_csv('author_title.csv')
Y = author_title_df.Author  # the Author column is the class label
X = joblib.load('features.pkl').toarray()  # stored object is converted via .toarray()

To ensure that each class (author) has enough training samples, I restrict the dataset to authors with at least 10 works.

In [3]:
# Keep only the texts whose author has at least MIN_WORKS_PER_AUTHOR texts in the dataset.
# The per-author counts are computed once and mapped back onto every row, rather than
# calling Y.value_counts() again for each text.
MIN_WORKS_PER_AUTHOR = 10
works_by_same_author = Y.map(Y.value_counts())
keep = (works_by_same_author >= MIN_WORKS_PER_AUTHOR).values

# Apply the same positional boolean mask to X and Y so their rows stay aligned.
# Unlike X[Y.index], this does not rely on Y's index labels matching X's row positions,
# and re-running the cell is harmless (a second run keeps every remaining row).
# Y keeps its original index labels, which later cells use to look up file names.
X = X[keep]
Y = Y[keep]
print("There are now {} texts by {} authors.".format(Y.count(), Y.nunique()))

There are now 2798 texts by 83 authors.


In [4]:
# Hold out 30% of the texts for testing; the fixed random_state makes the split repeatable.
# NOTE(review): no stratify argument is passed, so per-author proportions may differ
# between the two partitions.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=4
)

In [5]:
%%time

# Baseline model: multinomial naive Bayes constructed with scikit-learn's defaults,
# fit on the training partition.
clf = MultinomialNB()
clf.fit(X_train, Y_train)

CPU times: user 9.4 s, sys: 2.42 s, total: 11.8 s
Wall time: 9.35 s


In [6]:
%%time

print("Train score:", clf.score(X_train, Y_train))
print("Test score:", clf.score(X_test, Y_test))

Train score: 0.370275791624
Test score: 0.258333333333
CPU times: user 11.3 s, sys: 1.14 s, total: 12.4 s
Wall time: 7.73 s


In [7]:
# Candidate smoothing strengths: nine log-spaced alpha values from 1e-7 to 1e1.
params = {'alpha': np.logspace(-7, 1, num=9)}
# Cross-validated grid search over those values, using the library's default CV and scoring.
gridcv = model_selection.GridSearchCV(estimator=clf, param_grid=params)

In [8]:
%%time
# Run the grid search over alpha on the training partition (timed by the cell magic).
gridcv.fit(X_train, Y_train)

CPU times: user 7min 4s, sys: 1min 14s, total: 8min 19s
Wall time: 5min 36s


GridSearchCV(cv=None, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
# Display the estimator the grid search selected as best.
gridcv.best_estimator_

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)

In [10]:
# Rebuild the classifier from the hyperparameters the grid search selected, instead of
# hand-copying the alpha value from the previous cell's output; the copy silently goes
# stale if the grid or the data change.
clf = MultinomialNB(**gridcv.best_params_)
clf.fit(X_train,Y_train)

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)

In [11]:
# Score the re-fit classifier on the held-out partition.
clf.score(X_test,Y_test)

0.90238095238095239

In [12]:
# TF-IDF features restricted to 2-grams (ngram_range=(2,2)), read from the files under txt/
# for the texts that survived the author filter.
# NOTE(review): despite its name, X_3gram holds bigram features, not trigrams.
# NOTE(review): the vectorizer is fit on all selected texts before the train/test split,
# so its document-frequency statistics also see the texts later used for testing.
vectorizer = TfidfVectorizer(
    input='filename',
    encoding='iso-8859-1',
    ngram_range=(2, 2),
    min_df=3,
    max_df=.95,
)
X_3gram = vectorizer.fit_transform('txt/' + author_title_df.Name[Y.index])

In [13]:
# Split the (2,2) n-gram features with the same test fraction and seed used for the
# original feature matrix.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X_3gram, Y, test_size=0.3, random_state=4
)

In [14]:
# Fit naive Bayes on the (2,2) n-gram features.
# NOTE(review): alpha reuses the value picked by the earlier grid search, which was run
# on the features.pkl matrix rather than on these features.
clf = MultinomialNB(alpha=1e-4)
clf.fit(X_train, Y_train)

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)

In [15]:
# Held-out score for the classifier fit on the ngram_range=(2,2) features.
clf.score(X_test,Y_test)

0.92142857142857137

In [16]:
# TF-IDF features over 1-grams and 2-grams together (ngram_range=(1,2)), read from the
# files under txt/ for the texts that survived the author filter.
# NOTE(review): X_2gram therefore holds unigrams as well as bigrams.
# NOTE(review): the vectorizer is fit on all selected texts before the train/test split,
# so its document-frequency statistics also see the texts later used for testing.
vectorizer = TfidfVectorizer(
    input='filename',
    encoding='iso-8859-1',
    ngram_range=(1, 2),
    min_df=3,
    max_df=.95,
)
X_2gram = vectorizer.fit_transform('txt/' + author_title_df.Name[Y.index])

In [17]:
# Split the (1,2) n-gram features with the same test fraction and seed as the earlier splits.
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X_2gram, Y, test_size=0.3, random_state=4
)

In [18]:
# Fit naive Bayes on the (1,2) n-gram features and report its held-out score.
# NOTE(review): alpha reuses the value picked by the earlier grid search, which was run
# on the features.pkl matrix rather than on these features.
clf = MultinomialNB(alpha=1e-4)
clf.fit(X_train, Y_train).score(X_test, Y_test)

0.92142857142857137

In [19]:
# Dimensions of the feature matrix built with ngram_range=(2,2).
X_3gram.shape

(2798, 3613484)

In [20]:
# Dimensions of the feature matrix built with ngram_range=(1,2), for comparison.
X_2gram.shape

(2798, 3760578)