In [1]:
%load_ext autotime

---

### Conventions

| prefix | meaning |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | boolean row masks selecting subsets of `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |
| `t_` | transformers |

---

### Data

In [2]:
import pandas

time: 326 ms


In [3]:
# Read the labelled and unlabelled splits, then stack them (train rows first)
# under a fresh 0..n-1 index so every feature is built once for both splits.
data_train, data_test = map(
    pandas.read_csv,
    ('./input/train.csv', './input/test.csv'),
)
data = pandas.concat((data_train, data_test), ignore_index=True)

time: 70.7 ms


In [4]:
# Rows whose `author` is missing are treated as the test split;
# every other row belongs to the training split.
y = data["author"]

i_test = y.isnull()
i_train = ~i_test

y_train, y_test = y[i_train], y[i_test]

time: 99.3 ms


---

### Features

##### Basic features

In [5]:
import re

time: 102 ms


In [6]:
def words(s):
    """Return, in order of appearance, every maximal run of regex word characters in ``s``."""
    token_pattern = re.compile(r'\w+')
    return token_pattern.findall(s)

time: 101 ms


Number of words:

In [7]:
# Word count per text, shaped (n_rows, 1) so it can be hstack-ed with other features.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
f_n_words = (
    data["text"]
    .apply(lambda s: len(words(s)))
    .values
    [:, None]
)

time: 302 ms


Number of unique words:

In [8]:
# Number of distinct (case-insensitive) words per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(words(s.lower()))))
    .values
    [:, None]
)

time: 311 ms


Length of text:

In [9]:
# Character count per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `len` is passed directly instead of a wrapper lambda.
f_n_chars = (
    data["text"]
    .apply(len)
    .values
    [:, None]
)

time: 14.3 ms


Number of stopwords:

In [10]:
from nltk.corpus import stopwords

time: 477 ms


In [11]:
# Stored as a set because it is only used for membership tests when counting stop words.
english_stopwords = set(stopwords.words("english"))

time: 3.96 ms


In [12]:
# Number of stop-word occurrences per text (case-insensitive), shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; counting avoids building a throwaway list.
f_n_stopwords = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s.lower())
        if w in english_stopwords
    ))
    .values
    [:, None]
)

time: 442 ms


Number of punctuation characters:

In [13]:
import string

time: 670 µs


In [14]:
# Number of ASCII punctuation characters per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; counting avoids building a throwaway list.
f_n_punct = (
    data["text"]
    .apply(lambda s: sum(
        1
        for c in s
        if c in string.punctuation
    ))
    .values
    [:, None]
)

time: 415 ms


Number of upper-case words:

In [15]:
# Number of words for which `str.isupper()` holds, per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; counting avoids building a throwaway list.
f_n_upper_words = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s)
        if w.isupper()
    ))
    .values
    [:, None]
)

time: 313 ms


Number of title-case words:

In [16]:
# Number of words for which `str.istitle()` holds, per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; counting avoids building a throwaway list.
f_n_title_words = (
    data["text"]
    .apply(lambda s: sum(
        1
        for w in words(s)
        if w.istitle()
    ))
    .values
    [:, None]
)

time: 325 ms


Mean length of words:

In [17]:
import numpy

time: 1.31 ms


In [18]:
# Mean word length per text, shaped (n_rows, 1).
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
# A text without any word falls back to `[0]`, so the feature is 0.0 instead of
# the NaN (plus RuntimeWarning) that `numpy.mean([])` would produce.
f_mean_word_length = (
    data["text"]
    .apply(lambda s: numpy.mean(
        [len(w) for w in words(s)] or [0]
    ))
    .values
    [:, None]
)

time: 1.08 s


All of them:

In [19]:
# Tuple of the single-column feature arrays built above. Kept as a tuple so it
# can be passed to `apply_model` as-is or extended with `+ (...)` when stacking.
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

time: 2.6 ms


In [20]:
# All basic features side by side: one row per text, one column per feature.
f_basic_features = numpy.hstack(basic_features)

time: 81.9 ms


##### Tf-Idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

time: 166 ms


In [22]:
# Word-level TF-IDF over 1- to 3-grams, using the vectorizer's built-in English stop-word list.
t_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))

time: 113 ms


In [23]:
# Fitted on `data`, i.e. on the train and test texts together.
f_tfidf = t_tfidf.fit_transform(data["text"])

time: 3.21 s


With SVD:

In [24]:
from sklearn.decomposition import TruncatedSVD

time: 8.15 ms


In [25]:
# Number of SVD components kept; shared by the word-level and character-level decompositions.
n_components = 20

time: 105 ms


In [26]:
# Fixed `random_state` so the decomposition (and every model stacked on top of
# `f_svd`) is reproducible on Restart & Run All.
t_svd = TruncatedSVD(
    n_components=n_components,
    algorithm="arpack",
    random_state=0,
)

time: 173 ms


In [27]:
# Dense, `n_components`-column projection of the word-level TF-IDF matrix.
f_svd = t_svd.fit_transform(f_tfidf)

time: 2.91 s


##### Counters

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

time: 835 µs


In [29]:
# Raw word-level 1- to 3-gram counts, with the same settings as the TF-IDF vectorizer `t_tfidf`.
t_count = CountVectorizer(stop_words="english", ngram_range=(1, 3))

time: 130 ms


In [30]:
# Fitted on `data`, i.e. on the train and test texts together.
f_counts = t_count.fit_transform(data["text"])

time: 3.18 s


##### Character-level features

In [31]:
# Character-level TF-IDF over n-grams of length 1 to 7, fitted on the train and
# test texts together.
t_tfidf_c = TfidfVectorizer(analyzer="char", ngram_range=(1, 7))
f_tfidf_c = t_tfidf_c.fit_transform(data["text"])

time: 25.5 s


In [32]:
# Projection of the character-level TF-IDF matrix onto `n_components` columns.
# Fixed `random_state` so the decomposition (and every model stacked on top of
# `f_svd_c`) is reproducible on Restart & Run All.
t_svd_c = TruncatedSVD(
    n_components=n_components,
    algorithm="arpack",
    random_state=0,
)
f_svd_c = t_svd_c.fit_transform(f_tfidf_c)

time: 26 s


In [33]:
# Raw character-level counts over n-grams of length 1 to 7, fitted on the train
# and test texts together.
t_count_c = CountVectorizer(analyzer="char", ngram_range=(1, 7))
f_counts_c = t_count_c.fit_transform(data["text"])

time: 22.9 s


---

### Application

In [34]:
from sklearn.model_selection import StratifiedKFold

time: 1.14 ms


In [35]:
# Single 5-fold stratified splitter used by `apply_model` for every model.
cv = StratifiedKFold(n_splits=5)

time: 100 ms


In [36]:
from sklearn.model_selection import cross_val_predict

time: 188 ms


In [37]:
from sklearn.metrics import log_loss

time: 97.3 ms


In [38]:
import scipy.sparse

time: 89.8 ms


In [39]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    """Cross-validate ``model`` on the training rows and optionally predict all rows.

    The feature blocks are stacked column-wise, split into training and test
    rows with the notebook-level masks ``i_train`` / ``i_test``, and scored
    against ``y_train`` with the notebook-level splitter ``cv``.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
        Fitted in place on all training rows as a side effect.
    features : sequence of array-likes and/or scipy sparse matrices
        Blocks with one row per row of ``data``. If any block is sparse, the
        stacked matrix is sparse (CSR); otherwise it is a dense array.
    evaluate : bool
        Print the train and cross-validated log loss.
    predict : bool
        Return class probabilities for every row.

    Returns
    -------
    pandas.DataFrame or None
        With ``predict=True``: one row per stacked feature row, in the same
        order. Training rows hold out-of-fold probabilities; test rows hold
        probabilities from the model fitted on all training rows.
        Otherwise ``None``.
    """
    if any(scipy.sparse.issparse(block) for block in features):
        # Request CSR explicitly: a mixed sparse/dense stack otherwise comes
        # back as COO, which does not support the row indexing used below.
        f_all = scipy.sparse.hstack(features, format="csr")
    else:
        f_all = numpy.hstack(features)

    rows_train = numpy.flatnonzero(numpy.asarray(i_train))
    rows_test = numpy.flatnonzero(numpy.asarray(i_test))
    f_train = f_all[rows_train]
    f_test = f_all[rows_test]

    p_cv = cross_val_predict(
        model,
        f_train,
        y_train,
        cv=cv,
        method="predict_proba",
    )
    q_cv = log_loss(y_train, p_cv)

    model.fit(f_train, y_train)

    if evaluate:
        # Train-set predictions are only needed for the report.
        p_train = model.predict_proba(f_train)
        q_train = log_loss(y_train, p_train)
        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()

    if predict:
        p_test = model.predict_proba(f_test)
        # Place each block at its own row positions rather than concatenating,
        # so the result lines up with the feature rows even when training rows
        # do not all precede test rows.
        p_full = numpy.empty(
            (f_all.shape[0], p_cv.shape[1]),
            dtype=numpy.result_type(p_cv, p_test),
        )
        p_full[rows_train] = p_cv
        p_full[rows_test] = p_test
        return pandas.DataFrame(p_full)

    return None

time: 91.6 ms


---

### Models

##### Logistic regression

In [40]:
from sklearn.linear_model import LogisticRegression

time: 118 ms


Basic features:

In [41]:
# Logistic regression on the basic features; predictions are kept for stacking.
# NOTE(review): max_iter=10 is small and may stop the solver before convergence — confirm it is intentional.
p_lr_basic = apply_model(
    model=LogisticRegression(max_iter=10),
    features=basic_features,
    predict=True,
)

train log loss = 1.00201
   cv log loss = 1.00285

time: 911 ms


TF-IDF:

In [42]:
# Logistic regression on word-level TF-IDF; predictions are kept for stacking.
p_lr_tfidf = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_tfidf, ),
    predict=True,
)

train log loss = 0.60284
   cv log loss = 0.83517

time: 8.37 s


SVD:

In [43]:
# Logistic regression on the SVD projection of word-level TF-IDF; predictions are kept for stacking.
p_lr_svd = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_svd, ),
    predict=True,
)

train log loss = 0.97553
   cv log loss = 0.98460

time: 934 ms


Counts:

In [44]:
# Logistic regression on word-level n-gram counts; predictions are kept for stacking.
p_lr_counts = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_counts, ),
    predict=True,
)

train log loss = 0.11811
   cv log loss = 0.53677

time: 18.3 s


Character:

In [45]:
# Logistic regression on character-level TF-IDF; predictions are kept for stacking.
p_lr_tfidf_c = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_tfidf_c, ),
    predict=True,
)

train log loss = 0.47451
   cv log loss = 0.62743

time: 1min 16s


In [46]:
# Logistic regression on the SVD projection of character-level TF-IDF; predictions are kept for stacking.
p_lr_svd_c = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_svd_c, ),
    predict=True,
)

train log loss = 0.86968
   cv log loss = 0.87547

time: 1.08 s


In [47]:
# Logistic regression on character-level n-gram counts; predictions are kept for stacking.
p_lr_counts_c = apply_model(
    model=LogisticRegression(max_iter=10),
    features=(f_counts_c, ),
    predict=True,
)

train log loss = 0.27036
   cv log loss = 0.46097

time: 2min 2s


##### Naive Bayes

In [48]:
from sklearn.naive_bayes import MultinomialNB

time: 1.74 ms


TF-IDF:

In [49]:
# Multinomial naive Bayes on word-level TF-IDF; predictions are kept for stacking.
p_nb_tfidf = apply_model(
    model=MultinomialNB(),
    features=(f_tfidf, ),
    predict=True,
)

train log loss = 0.46620
   cv log loss = 0.84280

time: 692 ms


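***Note:*** the SVD features cannot be used with naive Bayes: `MultinomialNB` requires non-negative feature values, and SVD components can be negative.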

Counts:

In [50]:
# Multinomial naive Bayes on word-level n-gram counts; predictions are kept for stacking.
p_nb_counts = apply_model(
    model=MultinomialNB(),
    features=(f_counts, ),
    predict=True,
)

train log loss = 0.03011
   cv log loss = 0.45322

time: 568 ms


Character:

In [51]:
# Multinomial naive Bayes on character-level TF-IDF; predictions are kept for stacking.
p_nb_tfidf_c = apply_model(
    model=MultinomialNB(),
    features=(f_tfidf_c, ),
    predict=True,
)

train log loss = 0.65026
   cv log loss = 0.91851

time: 4.42 s


In [52]:
# Multinomial naive Bayes on character-level n-gram counts; predictions are kept for stacking.
# In the recorded run its cv log loss (3.77831) is far worse than its train log loss (0.96735).
p_nb_counts_c = apply_model(
    model=MultinomialNB(),
    features=(f_counts_c, ),
    predict=True,
)

train log loss = 0.96735
   cv log loss = 3.77831

time: 4.61 s


##### XGBoost

In [53]:
from xgboost import XGBClassifier

time: 13.3 ms




Basic features:

In [54]:
# Gradient-boosted trees on the basic features; evaluation only, no predictions are kept.
apply_model(
    model=XGBClassifier(n_estimators=100),
    features=basic_features,
)

train log loss = 0.96825
   cv log loss = 0.99003

time: 2.78 s


SVD:

In [55]:
# Gradient-boosted trees on the word-level SVD features; evaluation only, no predictions are kept.
apply_model(
    model=XGBClassifier(n_estimators=100),
    features=(f_svd, ),
)

train log loss = 0.79868
   cv log loss = 0.83843

time: 8.96 s


Stack all the things:

In [56]:
# First stack: basic features, word-level SVD, and the word-level predictions of
# the logistic-regression and naive-Bayes models, fed to gradient-boosted trees.
p_xgb_final = apply_model(
    model=XGBClassifier(n_estimators=100),
    features=(
        *basic_features,
        f_svd,
        p_lr_basic,
        p_lr_counts,
        p_lr_tfidf,
        p_nb_counts,
        p_nb_tfidf,
    ),
    predict=True,
)

train log loss = 0.34991
   cv log loss = 0.38360

time: 14.8 s


Stack all the things with LR:

In [57]:
# Same stacked inputs as `p_xgb_final`, with logistic regression as the
# second-level model; evaluation only, no predictions are kept.
apply_model(
    model=LogisticRegression(max_iter=10),
    features=(
        *basic_features,
        f_svd,
        p_lr_basic,
        p_lr_counts,
        p_lr_tfidf,
        p_nb_counts,
        p_nb_tfidf,
    ),
)

train log loss = 0.50498
   cv log loss = 0.55637

time: 2.27 s


More to stack:

In [58]:
# Larger stack, without the character-count naive-Bayes predictions (`p_nb_counts_c`).
# The trailing `k) a/b` notes look like the train / cv log loss (x 1e5) seen
# after adding that feature as the k-th increment: the last active entry,
# 28518/32198, matches this cell's printed 0.28518 / 0.32198.
# The feature order is left as is because it fixes the column order seen by the model.
p_xgb_2_wo_nbcc_32198 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
        f_svd_c,  # 1) 33758/37680

        p_lr_basic,
        p_lr_tfidf,
        p_lr_svd,  # 2) 33735/37664
        p_lr_counts,

        p_lr_tfidf_c,  # 3) 29181/32814
        p_lr_svd_c,  # 4) 28924/32609
        p_lr_counts_c,  # 5) 28541/32236

        p_nb_tfidf,
        p_nb_counts,

        p_nb_tfidf_c,  # 6) 28518/32198
#         p_nb_counts_c,  # 7) 28961/32188
    ),
    predict=True,
)

train log loss = 0.28518
   cv log loss = 0.32198

time: 25.4 s


Public leaderboard (PL) score: 0.31830.

In [59]:
# Larger stack including every first-level prediction, `p_nb_counts_c` as well.
# The trailing `k) a/b` notes look like the train / cv log loss (x 1e5) seen
# after adding that feature as the k-th increment: the last entry,
# 28961/32188, matches this cell's printed 0.28961 / 0.32188.
# The feature order is left as is because it fixes the column order seen by the model.
p_xgb_2_w_nbcc_32188 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
        f_svd_c,  # 1) 33758/37680

        p_lr_basic,
        p_lr_tfidf,
        p_lr_svd,  # 2) 33735/37664
        p_lr_counts,

        p_lr_tfidf_c,  # 3) 29181/32814
        p_lr_svd_c,  # 4) 28924/32609
        p_lr_counts_c,  # 5) 28541/32236

        p_nb_tfidf,
        p_nb_counts,

        p_nb_tfidf_c,  # 6) 28518/32198
        p_nb_counts_c,  # 7) 28961/32188
    ),
    predict=True,
)

train log loss = 0.28961
   cv log loss = 0.32188

time: 26.2 s


Public leaderboard (PL) score: 0.32272.

In [60]:
# Larger stack without either character-level naive-Bayes prediction
# (`p_nb_tfidf_c`, `p_nb_counts_c`).
# The trailing `k) a/b` notes look like the train / cv log loss (x 1e5) seen
# after adding that feature as the k-th increment: entry 5, 28541/32236,
# matches this cell's printed 0.28541 / 0.32236.
# The feature order is left as is because it fixes the column order seen by the model.
p_xgb_2_wo_two_last_32236 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
        f_svd_c,  # 1) 33758/37680

        p_lr_basic,
        p_lr_tfidf,
        p_lr_svd,  # 2) 33735/37664
        p_lr_counts,

        p_lr_tfidf_c,  # 3) 29181/32814
        p_lr_svd_c,  # 4) 28924/32609
        p_lr_counts_c,  # 5) 28541/32236

        p_nb_tfidf,
        p_nb_counts,

#         p_nb_tfidf_c,  # 6) 28518/32198
#         p_nb_counts_c,  # 7) 28961/32188
    ),
    predict=True,
)

train log loss = 0.28541
   cv log loss = 0.32236

time: 26.9 s


Public leaderboard (PL) score: 0.31832.

---

### Submission

In [61]:
import os.path

time: 661 µs


In [62]:
# Directory that `make_submission` writes into, resolved once to a canonical path.
submission_path = os.path.realpath('./output')

time: 142 ms


In [63]:
def make_submission(predictions, file_name):
    """Write the test-row predictions as a CSV file under ``submission_path``.

    Parameters
    ----------
    predictions : pandas.DataFrame
        One row per row of ``data`` and one column per class, as returned by
        ``apply_model(..., predict=True)``. The argument itself is not
        modified; a copy is relabelled.
    file_name : str
        Name of the output file. Only its base name is used, so the file
        always lands directly inside ``submission_path``.

    The output has the ``id`` column of ``data_test`` followed by one
    probability column per author label.
    """
    # Create the target directory on first use instead of failing in to_csv.
    os.makedirs(submission_path, exist_ok=True)
    file_name = os.path.join(
        submission_path,
        os.path.basename(file_name),
    )
    predictions = predictions.copy()
    # predict_proba columns follow the estimator's sorted class labels, so the
    # names must be sorted too. `unique()` alone returns labels in order of
    # first appearance, which would mislabel columns if they were not sorted.
    predictions.columns = sorted(y_train.unique())
    pandas.concat(
        (
            data_test["id"].reset_index(drop=True),
            predictions[i_test].reset_index(drop=True),
        ),
        axis=1,
    ).to_csv(file_name, index=False)

time: 161 ms


In [64]:
# Submission from the first stacked model.
make_submission(
    predictions=p_xgb_final,
    file_name="xgb_final.csv",
)

time: 144 ms


Best:

In [65]:
# Submission from the larger stack without `p_nb_counts_c`.
make_submission(
    predictions=p_xgb_2_wo_nbcc_32198,
    file_name="p_xgb_2_wo_nbcc_32198.csv",
)

time: 151 ms


In [66]:
# Submission from the larger stack that includes `p_nb_counts_c`.
make_submission(
    predictions=p_xgb_2_w_nbcc_32188,
    file_name="p_xgb_2_w_nbcc_32188.csv",
)

time: 121 ms


Almost the same as the best:

In [67]:
# Submission from the larger stack without either character-level naive-Bayes prediction.
make_submission(
    predictions=p_xgb_2_wo_two_last_32236,
    file_name="p_xgb_2_wo_two_last_32236.csv",
)

time: 116 ms
