In [None]:
%load_ext autotime

---

### Conventions

| prefix | meaning |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | indices in `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |

---

### Data

In [None]:
import numpy
import pandas

In [None]:
# Read both splits from the local input directory.
data_train = pandas.read_csv('./input/train.csv')
data_test = pandas.read_csv('./input/test.csv')

# Stack train on top of test under a fresh 0..n-1 index so that features
# can be computed once over all rows.
data = pandas.concat((data_train, data_test), ignore_index=True)

In [None]:
# Target column; rows without an author (presumably the test rows) are NaN.
y = data["author"]

# Complementary boolean row masks over `data`: unlabelled vs. labelled rows.
i_test = y.isnull()
i_train = ~i_test

y_train = y[i_train]
y_test = y[i_test]

---

### Features

##### Basic features

In [None]:
import re

In [None]:
def lower_case_words(s):
    """Return all word tokens of `s` after lower-casing it.

    A token is a maximal run of regex word characters, so punctuation and
    whitespace act as separators.
    """
    lowered = s.lower()
    return re.findall(r'\w+', lowered)

In [None]:
def all_words(s):
    """Return all word tokens of `s` with their original casing.

    A token is a maximal run of regex word characters.
    """
    return [match.group(0) for match in re.finditer(r'\w+', s)]

Number of words:

In [None]:
# Word-token count per text, shaped as a single (n_rows, 1) feature column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_words = (
    data["text"]
    .apply(lambda s: len(lower_case_words(s)))
    .to_numpy()
    [:, None]
)

Number of unique words:

In [None]:
# Count of distinct lower-cased word tokens per text, as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(lower_case_words(s))))
    .to_numpy()
    [:, None]
)

Length of text:

In [None]:
# Character length of each text, as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_chars = (
    data["text"]
    .apply(len)
    .to_numpy()
    [:, None]
)

Number of stopwords:

In [None]:
from nltk.corpus import stopwords

In [None]:
# Kept as a set because the stopword-count feature tests membership per token.
english_stopwords = set(stopwords.words("english"))

In [None]:
# Number of lower-cased word tokens per text that are in `english_stopwords`,
# as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_stopwords = (
    data["text"]
    .apply(lambda s: sum(
        w in english_stopwords
        for w in lower_case_words(s)
    ))
    .to_numpy()
    [:, None]
)

Number of punctuation characters:

In [None]:
import string

In [None]:
# Number of characters per text that appear in `string.punctuation`,
# as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_punct = (
    data["text"]
    .apply(lambda s: sum(
        c in string.punctuation
        for c in s
    ))
    .to_numpy()
    [:, None]
)

Number of upper-case words:

In [None]:
# Number of word tokens per text for which `str.isupper()` holds,
# as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_upper_words = (
    data["text"]
    .apply(lambda s: sum(
        w.isupper()
        for w in all_words(s)
    ))
    .to_numpy()
    [:, None]
)

Number of title-case words:

In [None]:
# Number of word tokens per text for which `str.istitle()` holds,
# as an (n_rows, 1) column.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_n_title_words = (
    data["text"]
    .apply(lambda s: sum(
        w.istitle()
        for w in all_words(s)
    ))
    .to_numpy()
    [:, None]
)

Mean length of words:

In [None]:
# Mean token length per text, as an (n_rows, 1) column.
# A text without any word token would make `numpy.mean([])` return NaN (with
# a RuntimeWarning) and put NaN into the feature matrix; `or [0]` maps that
# case to a mean of 0 instead.
# `Series.as_matrix()` was removed in pandas 1.0; `to_numpy()` replaces it.
f_mean_word_length = (
    data["text"]
    .apply(lambda s: numpy.mean(
        [len(w) for w in all_words(s)] or [0]
    ))
    .to_numpy()
    [:, None]
)

All of them:

In [None]:
# Kept as a tuple (not a stacked array) so that further feature blocks can be
# appended with `+` before being handed to `apply_model`.
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

In [None]:
# Join the (n_rows, 1) feature columns side by side into one dense matrix.
f_basic_features = numpy.concatenate(basic_features, axis=1)

##### Tf-Idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF over uni-, bi- and tri-grams with English stop words removed.
t_tfidf = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)

In [None]:
# Fitted on train and test text together, since `data` holds both splits.
f_tfidf = t_tfidf.fit_transform(data["text"])

With SVD:

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Number of SVD components kept from the TF-IDF matrix.
n_comp = 20

In [None]:
# Fixed seed so the ARPACK start vector, and therefore the components,
# are reproducible across kernel restarts.
t_svd = TruncatedSVD(
    n_components=n_comp,
    algorithm="arpack",
    random_state=0,
)

In [None]:
# Dense low-rank projection of the TF-IDF matrix (`n_comp` columns).
f_svd = t_svd.fit_transform(f_tfidf)

##### Counters

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Raw n-gram counts with the same tokenisation settings as the TF-IDF vectorizer.
t_count = CountVectorizer(
    ngram_range=(1, 3),
    stop_words="english",
)

In [None]:
# Fitted on train and test text together, since `data` holds both splits.
f_count = t_count.fit_transform(data["text"])

##### Hashes

TODO.

---

### Application

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Shared 5-fold stratified splitter; `apply_model` passes it to `cross_val_predict`.
cv = StratifiedKFold(n_splits=5)

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
from sklearn.metrics import log_loss

In [None]:
import numpy

In [None]:
import scipy
import scipy.sparse

In [None]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    """Fit `model` on the training rows of `features`; optionally report and predict.

    Parameters
    ----------
    model
        Estimator exposing `fit` and `predict_proba`. It is fitted in place
        on all training rows.
    features
        Sequence of row-aligned feature blocks (dense arrays and/or scipy
        sparse matrices) that are stacked column-wise. Rows are split with
        the notebook-level masks `i_train` / `i_test`.
    evaluate
        If true, print the train log loss and the cross-validated log loss.
    predict
        If true, return class probabilities for every row.

    Returns
    -------
    pandas.DataFrame or None
        With `predict=True`, one row per row of the stacked features:
        cross-validated probabilities for training rows and probabilities of
        the fully fitted model for test rows. Otherwise `None`.
    """
    if any(scipy.sparse.issparse(z) for z in features):
        # `issparse` accepts every sparse format, not only CSR. CSR output is
        # requested explicitly because scipy's hstack may return COO, which
        # does not support row indexing.
        f_all = scipy.sparse.hstack(features, format="csr")
    else:
        f_all = numpy.hstack(features)

    rows_train = numpy.flatnonzero(numpy.asarray(i_train))
    rows_test = numpy.flatnonzero(numpy.asarray(i_test))
    f_train = f_all[rows_train]
    f_test = f_all[rows_test]

    # Cross-validation is only needed for the report or the returned frame.
    if evaluate or predict:
        p_cv = cross_val_predict(model, f_train, y_train, cv=cv, method="predict_proba")

    model.fit(f_train, y_train)

    if evaluate:
        q_cv = log_loss(y_train, p_cv)
        p_train = model.predict_proba(f_train)
        q_train = log_loss(y_train, p_train)

        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()

    if predict:
        p_test = model.predict_proba(f_test)
        # Place predictions by row position rather than concatenating, so the
        # result lines up with `data` even if train and test rows interleave.
        p_full = numpy.empty((f_all.shape[0], p_cv.shape[1]), dtype=p_cv.dtype)
        p_full[rows_train] = p_cv
        p_full[rows_test] = p_test
        return pandas.DataFrame(p_full)

---

### Models

##### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

Basic features:

In [None]:
# Keep the per-row probabilities so they can serve as features for later models.
p_lr_basic = apply_model(
    model=LogisticRegression(max_iter=10),
    features=basic_features,
    predict=True,
)

TF-IDF:

In [None]:
# Report-only run: logistic regression on the sparse TF-IDF matrix.
apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_tfidf, ),
)

##### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

TF-IDF:

In [None]:
# Report-only run: multinomial naive Bayes on the sparse TF-IDF matrix.
apply_model(
    model=MultinomialNB(),
    features=(f_tfidf, ),
)

Counts:

In [None]:
# Report-only run: multinomial naive Bayes on raw n-gram counts.
apply_model(
    model=MultinomialNB(),
    features=(f_count, ),
)

##### XGBoost

In [None]:
from xgboost import XGBClassifier

Basic features:

In [None]:
# Sweep the number of boosting rounds on the basic features and print the
# train / CV log loss for each setting.
for x in (1, 3, 10, 30, 100, 300):
    print(f'{x} estimators:')
    apply_model(model=XGBClassifier(n_estimators=x), features=basic_features)

SVD:

In [None]:
# Same sweep over the number of boosting rounds, on the SVD components.
for x in (1, 3, 10, 30, 100, 300):
    print(f'{x} estimators:')
    apply_model(model=XGBClassifier(n_estimators=x), features=(f_svd, ))

With LR predictions:

In [None]:
# Stack the logistic-regression probabilities onto the basic features.
# The predictions are stored in `p_lr_basic`; the original `p_lr_base`
# was never defined and raised a NameError.
apply_model(
    XGBClassifier(n_estimators=10),
    basic_features + (p_lr_basic, ),
)

---

### Submission

TODO.