In [20]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.base import TransformerMixin

In [4]:
train = pd.read_csv('train.csv')

In [5]:
train.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [6]:
train['word_count'] = train.text.apply(lambda x: len(x.split()))

In [7]:
train.groupby('author').word_count.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
EAP,7900.0,25.442405,18.567706,2.0,12.0,21.0,33.0,267.0
HPL,5635.0,27.799645,14.123252,4.0,18.0,26.0,35.0,147.0
MWS,6044.0,27.417273,23.13444,2.0,15.0,23.0,34.0,861.0


In [8]:
train.author.value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

In [15]:
def do_CV(data, ngr):
    CV = CountVectorizer(stop_words = "english", ngram_range = ngr )
    cv_data = CV.fit_transform(data)
    df  = pd.DataFrame(cv_data.todense(),
             columns=CV.get_feature_names())
    return df

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train['text'], train['author'], train_size = .7)

In [17]:
print(X_train.shape, X_test.shape)
print (y_train.shape, y_test.shape)

((13705,), (5874,))
((13705,), (5874,))


In [34]:
pipeline = Pipeline([
    ('vect', CountVectorizer( lowercase=True, stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('cls', MultinomialNB())
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test)

0.8032005447735785

In [35]:
pipeline = Pipeline([
    ('vect', CountVectorizer( lowercase=True, stop_words = 'english')),
    ('tfidf', TfidfTransformer()),
    ('cls', LogisticRegression())
])
pipeline.fit(X_train, y_train)
pipeline.score(X_test,y_test)

0.78838951310861427

In [8]:
count_vector=do_CV(train['text'], (1,3))

In [9]:
count_vector.shape

(19579, 397066)

In [10]:
count_vector.head()
cv=count_vector.transpose()
## seeing which words appear most

In [None]:
#cv['sum']=cv.sum(axis=1)
cv['sum']= cv.apply(lambda x: sum(x))

In [17]:
cv_1 = cv.sort_values('sum', ascending = False)

In [18]:
cv_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19570,19571,19572,19573,19574,19575,19576,19577,19578,sum
man,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,779
time,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,730
said,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,704
did,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,700
old,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,616


In [19]:
[(word,cv_1.loc[word,'sum']) for word in cv_1.index if cv_1.loc[word,'sum']>50] 

[(u'man', 779),
 (u'time', 730),
 (u'said', 704),
 (u'did', 700),
 (u'old', 616),
 (u'like', 613),
 (u'life', 569),
 (u'night', 566),
 (u'eyes', 540),
 (u'little', 531),
 (u'day', 523),
 (u'great', 512),
 (u'long', 511),
 (u'saw', 502),
 (u'came', 461),
 (u'thought', 442),
 (u'say', 409),
 (u'death', 396),
 (u'mind', 378),
 (u'far', 376),
 (u'heart', 375),
 (u'things', 369),
 (u'shall', 368),
 (u'heard', 366),
 (u'house', 366),
 (u'men', 363),
 (u'thing', 356),
 (u'left', 354),
 (u'years', 346),
 (u'felt', 343),
 (u'place', 340),
 (u'know', 337),
 (u'earth', 337),
 (u'love', 332),
 (u'world', 323),
 (u'light', 320),
 (u'come', 314),
 (u'room', 304),
 (u'way', 304),
 (u'door', 303),
 (u'head', 300),
 (u'let', 298),
 (u'having', 296),
 (u'away', 295),
 (u'words', 292),
 (u'hand', 292),
 (u'nature', 286),
 (u'strange', 283),
 (u'seen', 282),
 (u'make', 280),
 (u'length', 278),
 (u'good', 277),
 (u'friend', 275),
 (u'human', 274),
 (u'knew', 273),
 (u'raymond', 272),
 (u'voice', 271),
 (u'

In [None]:
## add some of Evann Smith's stuff
## tokenize and clean using her approaches - keep commas
## try SVC, XGBoost, Logistic, Random Forest, etc.
## try word2vec