In [87]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV, SelectKBest, chi2
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import TransformerMixin
from sklearn.metrics import log_loss
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [3]:
train = pd.read_csv('train.csv')

In [4]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [5]:
train['word_count'] = train.text.apply(lambda x: len(x.split()))

In [6]:
train.groupby('author').word_count.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
EAP,7900.0,25.442405,18.567706,2.0,12.0,21.0,33.0,267.0
HPL,5635.0,27.799645,14.123252,4.0,18.0,26.0,35.0,147.0
MWS,6044.0,27.417273,23.13444,2.0,15.0,23.0,34.0,861.0


In [7]:
train.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [8]:
def do_CV(data, ngr):
    CV = CountVectorizer(stop_words = "english", ngram_range = ngr )
    cv_data = CV.fit_transform(data)
    df  = pd.DataFrame(cv_data.todense(),
             columns=CV.get_feature_names())
    return df

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['author'], train_size = .7)

In [10]:
print(X_train.shape, X_test.shape)
print (y_train.shape, y_test.shape)

((13705,), (5874,))
((13705,), (5874,))


In [41]:
pipeline = Pipeline([
    ('vect', CountVectorizer( lowercase=True, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('cls', MultinomialNB())
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test)
preds = pipeline.predict_proba(X_test)
print log_loss(y_test, preds)

0.609419717283


In [42]:
pipeline = Pipeline([
    ('vect', CountVectorizer( lowercase=True, stop_words = 'english')),
    ('tfidf', TfidfTransformer()),
    ('cls', LogisticRegression())
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test)
preds = pipeline.predict_proba(X_test)
print log_loss(y_test, preds)

0.650670647807


In [62]:
stop = set(stopwords.words('english') + ['.'])

In [63]:
def preprocess(text):
    try:
        text = text.lower()
        tokens = word_tokenize(text)
        tokens = [t for t in tokens if not t in stop]
        if len(tokens) == 0:
            return None
        else:
            return ' '.join(tokens)
    except:
        return None

In [64]:
train['tokens'] = train.text.apply(preprocess)
data = train[train['tokens'].notnull()]
data.reset_index(inplace=True)
data.drop('index', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [65]:
data.head()

Unnamed: 0,id,text,author,word_count,tokens
0,id26305,"This process, however, afforded me no means of...",EAP,41,"process , however , afforded means ascertainin..."
1,id17569,It never once occurred to me that the fumbling...,HPL,14,never occurred fumbling might mere mistake
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,36,"left hand gold snuff box , , capered hill , cu..."
3,id27763,How lovely is spring As we looked from Windsor...,MWS,34,lovely spring looked windsor terrace sixteen f...
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,27,"finding nothing else , even gold , superintend..."


In [73]:
texts = data.tokens.tolist()
texts_2 = [q for q in [str(z) for z in texts] ]

In [75]:
texts[289], texts[289]

('win ; shall deny grief know secret pour balm soul shall enjoy ravishing delight beholding smile , seeing eyes beam pleasure least gentle love thankfulness',
 'win ; shall deny grief know secret pour balm soul shall enjoy ravishing delight beholding smile , seeing eyes beam pleasure least gentle love thankfulness')

In [82]:
texts = data.tokens.tolist()
y = data.author.tolist()
vectorizer = TfidfVectorizer(min_df=5, max_df=0.8,decode_error = 'replace')
X = vectorizer.fit_transform(texts)

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)

In [86]:
total_features = len(vectorizer.get_feature_names())
print('{} total features prior to selection'.format(total_features))
ch2 = SelectKBest(chi2, k=500)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

8244 total features prior to selection


In [97]:
def run_mod(X_t, y_t, X_te, y_te, classifier):
    classify = classifier
    classify.fit(X_t, y_t)
    print('Accuracy: {} for {}'.format(round(classify.score(X_te, y_te), 3),classifier))
    print (log_loss(y_te,classify.predict_proba(X_test) ))

In [99]:
for c in [ RandomForestClassifier(), MultinomialNB(),LogisticRegression()]:
    run_mod(X_train, y_train, X_test, y_test, c)

Accuracy: 0.63 for RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
3.22326031514
Accuracy: 0.698 for MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.83437025377
Accuracy: 0.706 for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.768411086799


In [None]:
## add some of Evann Smith's stuff
## tokenize and clean using her approaches - keep commas
## try SVC, XGBoost, Logistic, Random Forest, etc.
## try word2vec