In [1]:
%load_ext autotime

---

### Conventions

| prefix | meaning |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | boolean row masks into `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |
| `t_` | transformers |

---

### Data

In [2]:
import pandas

time: 3.8 s


In [3]:
# Read the two CSV splits, then stack train on top of test under a fresh
# 0..N-1 index so that a single frame can feed every feature extractor.
data_train, data_test = (
    pandas.read_csv(csv_path)
    for csv_path in ('./input/train.csv', './input/test.csv')
)
data = pandas.concat((data_train, data_test), ignore_index=True)

time: 533 ms


In [4]:
# Target column of the combined frame.
y = data["author"]

# Boolean row masks over `data`: rows without a target are treated as the
# test split, every other row as the training split.
i_test = y.isnull()
i_train = ~i_test

y_train, y_test = y[i_train], y[i_test]

time: 86.7 ms


---

### Features

##### Basic features

In [5]:
import re

time: 54.6 ms


In [6]:
def words(s):
    """Return, in order, every maximal run of regex word characters in `s`."""
    return [match.group() for match in re.finditer(r'\w+', s)]

time: 84 ms


Number of words:

In [7]:
# Column vector (one row per text) holding the number of word tokens.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_words = (
    data["text"]
    .apply(lambda s: len(words(s)))
    .values
    [:, None]
)

time: 300 ms


Number of unique words:

In [8]:
# Column vector with the number of distinct word tokens per text; the text is
# lower-cased first so that case variants count once.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(words(s.lower()))))
    .values
    [:, None]
)

time: 328 ms


Length of text:

In [9]:
# Column vector with the character length of each text.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_chars = (
    data["text"]
    .apply(len)
    .values
    [:, None]
)

time: 14.7 ms


Number of stopwords:

In [10]:
from nltk.corpus import stopwords

time: 5.49 s


In [11]:
# NLTK's English stopword list, held as a set for fast membership tests.
english_stopwords = set(stopwords.words("english"))

time: 120 ms


In [12]:
# Column vector with the number of tokens (after lower-casing) that appear in
# `english_stopwords`; repeated stopwords are counted every time.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_stopwords = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s.lower())
        if w in english_stopwords
    ))
    .values
    [:, None]
)

time: 480 ms


Number of punctuation characters:

In [13]:
import string

time: 630 µs


In [14]:
# Column vector with the number of characters per text that belong to
# `string.punctuation` (ASCII punctuation only).
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_punct = (
    data["text"]
    .apply(lambda s: sum(
        1
        for c in s
        if c in string.punctuation
    ))
    .values
    [:, None]
)

time: 693 ms


Number of upper-case words:

In [15]:
# Column vector with the number of tokens for which `str.isupper()` is true.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_upper_words = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s)
        if w.isupper()
    ))
    .values
    [:, None]
)

time: 311 ms


Number of title-case words:

In [16]:
# Column vector with the number of tokens for which `str.istitle()` is true.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_n_title_words = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s)
        if w.istitle()
    ))
    .values
    [:, None]
)

time: 299 ms


Mean length of words:

In [17]:
import numpy

time: 607 µs


In [18]:
def _mean_word_length(s):
    """Average token length of `s`; 0.0 when `s` contains no word tokens."""
    lengths = [len(w) for w in words(s)]
    # numpy.mean([]) would yield NaN (plus a RuntimeWarning), and a NaN
    # feature value breaks estimators that do not accept missing values.
    return numpy.mean(lengths) if lengths else 0.0


# Column vector with the mean token length per text.
# `.values` replaces `Series.as_matrix()`, which was removed in pandas 1.0.
f_mean_word_length = (
    data["text"]
    .apply(_mean_word_length)
    .values
    [:, None]
)

time: 1.09 s


All of them:

In [19]:
# All hand-crafted per-text features, kept as a tuple of column blocks so it
# can be passed to `apply_model` or extended with further feature blocks.
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

time: 1.84 ms


In [20]:
# The same features stacked column-wise into a single dense array.
f_basic_features = numpy.hstack(basic_features)

time: 94.4 ms


##### Tf-Idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

time: 149 ms


In [22]:
# TF-IDF over 1- to 3-grams, using scikit-learn's built-in English stop-word list.
t_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))

time: 85.6 ms


In [23]:
# NOTE(review): the vectorizer is fitted on `data`, i.e. on the train and test
# texts together, so vocabulary and IDF weights also reflect the test split.
f_tfidf = t_tfidf.fit_transform(data["text"])

time: 3.56 s


With SVD:

In [24]:
from sklearn.decomposition import TruncatedSVD

time: 417 ms


In [25]:
# Number of SVD components kept from the TF-IDF matrix.
n_components = 20

time: 725 µs


In [26]:
# Truncated SVD with the ARPACK solver, reducing to `n_components` columns.
t_svd = TruncatedSVD(n_components=n_components, algorithm="arpack")

time: 112 ms


In [27]:
# Low-dimensional projection of the TF-IDF matrix (fitted on all rows of `data`).
f_svd = t_svd.fit_transform(f_tfidf)

time: 3.56 s


##### Counters

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

time: 790 µs


In [29]:
# Raw counts over the same 1- to 3-gram / English stop-word setup as `t_tfidf`.
t_count = CountVectorizer(stop_words="english", ngram_range=(1, 3))

time: 96.9 ms


In [30]:
# NOTE(review): as with TF-IDF, the vocabulary is learned from train and test
# texts together because `data` contains both.
f_counts = t_count.fit_transform(data["text"])

time: 3.15 s


---

### Application

In [31]:
from sklearn.model_selection import StratifiedKFold

time: 755 µs


In [32]:
# 5-fold stratified splitter (other settings left at their defaults); it is the
# splitter `apply_model` hands to `cross_val_predict`.
cv = StratifiedKFold(n_splits=5)

time: 109 ms


In [33]:
from sklearn.model_selection import cross_val_predict

time: 178 ms


In [34]:
from sklearn.metrics import log_loss

time: 71.9 ms


In [35]:
import scipy.sparse

time: 95.5 ms


In [36]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    """Cross-validate `model` on the training rows and optionally predict.

    The feature blocks are stacked column-wise, split into train and test
    rows with the notebook-level masks `i_train` / `i_test`, and the model is
    scored against `y_train` using the notebook-level splitter `cv`.

    Parameters
    ----------
    model : estimator exposing `fit` and `predict_proba`
        Fitted in place on all training rows.
    features : iterable of 2-D blocks
        Dense arrays, data frames or scipy sparse matrices, each with one row
        per row of `data`. If any block is sparse the stacked matrix is sparse.
    evaluate : bool, default True
        Print the train log loss and the cross-validated log loss.
    predict : bool, default False
        Return class probabilities for every row of `data`.

    Returns
    -------
    pandas.DataFrame or None
        Only when `predict` is true: one row per row of `data`, one column
        per class. Training rows hold the out-of-fold probabilities from
        `cross_val_predict`; test rows hold the probabilities of the model
        fitted on all training rows. Otherwise nothing is returned.
    """
    # Materialise once so that a generator argument is not exhausted by the
    # sparsity check below.
    features = tuple(features)

    if any(scipy.sparse.issparse(block) for block in features):
        # Request CSR explicitly: `scipy.sparse.hstack` yields COO by default
        # when several blocks are combined, and COO cannot be row-indexed.
        f_all = scipy.sparse.hstack(features, format="csr")
    else:
        f_all = numpy.hstack(features)

    # Integer row positions work for both dense arrays and sparse matrices.
    rows_train = numpy.flatnonzero(numpy.asarray(i_train))
    rows_test = numpy.flatnonzero(numpy.asarray(i_test))
    f_train = f_all[rows_train]
    f_test = f_all[rows_test]

    p_cv = cross_val_predict(
        model,
        f_train,
        y_train,
        cv=cv,
        method="predict_proba",
    )

    model.fit(f_train, y_train)

    if evaluate:
        q_train = log_loss(y_train, model.predict_proba(f_train))
        q_cv = log_loss(y_train, p_cv)
        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()

    if predict:
        p_test = model.predict_proba(f_test)
        # Scatter by row position instead of concatenating, so the result
        # lines up with `data` even if train and test rows are interleaved.
        p_full = numpy.full(
            (f_all.shape[0], p_cv.shape[1]),
            numpy.nan,
            dtype=numpy.result_type(p_cv, p_test),
        )
        p_full[rows_train] = p_cv
        p_full[rows_test] = p_test
        return pandas.DataFrame(p_full)

    return None

time: 95.3 ms


---

### Models

##### Logistic regression

In [37]:
from sklearn.linear_model import LogisticRegression

time: 108 ms


Basic features:

In [38]:
# Logistic regression (capped at 10 solver iterations) on the hand-crafted
# features; the returned probabilities are kept for the stacking step.
p_lr_basic = apply_model(
    LogisticRegression(max_iter=10),
    basic_features,
    predict=True,
)

train log loss = 1.00201
   cv log loss = 1.00285

time: 1.04 s


TF-IDF:

In [39]:
# Same logistic regression setup on the sparse TF-IDF matrix alone.
p_lr_tfidf = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.60284
   cv log loss = 0.83517

time: 8.8 s


Counts:

In [40]:
# Same logistic regression setup on the sparse n-gram count matrix alone.
p_lr_counts = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts, ),
    predict=True,
)

train log loss = 0.11811
   cv log loss = 0.53677

time: 18.2 s


##### Naive Bayes

In [41]:
from sklearn.naive_bayes import MultinomialNB

time: 28.8 ms


TF-IDF:

In [42]:
# Multinomial naive Bayes with default settings on the TF-IDF matrix.
p_nb_tfidf = apply_model(
    MultinomialNB(),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.46620
   cv log loss = 0.84280

time: 719 ms


Counts:

In [43]:
# Multinomial naive Bayes with default settings on the n-gram count matrix.
p_nb_counts = apply_model(
    MultinomialNB(),
    (f_counts, ),
    predict=True,
)

train log loss = 0.03011
   cv log loss = 0.45322

time: 610 ms


##### XGBoost

In [44]:
from xgboost import XGBClassifier

time: 333 ms




Basic features:

In [45]:
# Evaluation only: `predict` keeps its default, so the call just prints the
# train and cross-validated log loss for XGBoost on the hand-crafted features.
apply_model(
    XGBClassifier(n_estimators=100),
    basic_features,
)

train log loss = 0.96825
   cv log loss = 0.99003

time: 2.9 s


SVD:

In [46]:
# Evaluation only: prints the log losses for XGBoost on the SVD components.
apply_model(
    XGBClassifier(n_estimators=100),
    (f_svd, ),
)

train log loss = 0.79868
   cv log loss = 0.83843

time: 9.58 s


Stack all the things:

In [47]:
# Stacked model: XGBoost on the hand-crafted features, the SVD components and
# the class probabilities produced by the logistic regression and naive Bayes
# models above (out-of-fold for training rows, full-fit for test rows).
p_xgb_final = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
        p_lr_basic,
        p_lr_counts,
        p_lr_tfidf,
        p_nb_counts,
        p_nb_tfidf,
    ),
    predict=True,
)

train log loss = 0.34991
   cv log loss = 0.38360

time: 14.7 s


---

### Submission

In [48]:
import os.path

time: 743 µs


In [49]:
# Absolute, symlink-resolved form of `./output` (relative to the current
# working directory), where submission files are written.
submission_path = os.path.realpath('./output')

time: 263 ms


In [50]:
def make_submission(predictions, file_name):
    """Write the test-row probabilities to a CSV file in `submission_path`.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One row per row of `data` and one column per class, as returned by
        `apply_model(..., predict=True)`. The argument is not modified.
    file_name : str
        Name of the output file. Only its base name is used, so the file is
        always created directly inside `submission_path`.

    The file contains the `id` column of `data_test` followed by one
    probability column per class, and no index column.
    """
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(submission_path, exist_ok=True)
    target_path = os.path.join(
        submission_path,
        os.path.basename(file_name),
    )

    predictions = predictions.copy()
    # `predict_proba` orders its columns by sorted class label, whereas
    # `Series.unique()` returns labels in order of first appearance; sort so
    # the header always matches the probability columns.
    predictions.columns = sorted(y_train.unique())

    pandas.concat(
        (
            data_test["id"].reset_index(drop=True),
            predictions[i_test].reset_index(drop=True),
        ),
        axis=1,
    ).to_csv(target_path, index=False)

time: 199 ms


In [51]:
# Export the stacked XGBoost probabilities as the submission file.
make_submission(p_xgb_final, "xgb_final.csv")

time: 176 ms
