In [1]:
%load_ext autotime

---

### Conventions

| entity | semantic |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | indices in `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |
| `t_` | transformers |

---

### Data

In [2]:
import pandas

time: 3.34 s


In [3]:
data_train = pandas.read_csv('./input/train.csv')
data_test = pandas.read_csv('./input/test.csv')
data = pandas.concat([data_train, data_test], ignore_index=True)

time: 637 ms


In [4]:
y = data.author

i_train = ~y.isnull()
i_test = y.isnull()

y_train = y[i_train]
y_test = y[i_test]

time: 25.4 ms


---

### Features

##### Basic features

In [5]:
import re

time: 121 ms


In [6]:
def words(s):
    return re.findall(r'\w+', s)

time: 82.7 ms


Number of words:

In [7]:
f_n_words = (
    data["text"]
    .apply(lambda s: len(words(s)))
    .as_matrix()
    [:, None]
)

time: 338 ms


Number of unique words:

In [8]:
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(words(s.lower()))))
    .as_matrix()
    [:, None]
)

time: 332 ms


Length of text:

In [9]:
f_n_chars = (
    data["text"]
    .apply(lambda s: len(s))
    .as_matrix()
    [:, None]
)

time: 16.3 ms


Number of stopwords:

In [10]:
from nltk.corpus import stopwords

time: 5.29 s


In [11]:
english_stopwords = set(stopwords.words("english"))

time: 474 ms


In [12]:
f_n_stopwords = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s.lower())
        if w in english_stopwords
    ]))
    .as_matrix()
    [:, None]
)

time: 458 ms


Number of punctuation characters:

In [13]:
import string

time: 983 µs


In [14]:
f_n_punct = (
    data["text"]
    .apply(lambda s: len([
        c
        for c in s
        if c in string.punctuation
    ]))
    .as_matrix()
    [:, None]
)

time: 459 ms


Number of upper-case words:

In [15]:
f_n_upper_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.isupper()
    ]))
    .as_matrix()
    [:, None]
)

time: 325 ms


Number of title-case words:

In [16]:
f_n_title_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.istitle()
    ]))
    .as_matrix()
    [:, None]
)

time: 343 ms


Mean length of words:

In [17]:
import numpy

time: 633 µs


In [18]:
f_mean_word_length = (
    data["text"]
    .apply(lambda s: numpy.mean([
        len(w)
        for w in words(s)
    ]))
    .as_matrix()
    [:, None]
)

time: 1.1 s


All of them:

In [19]:
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

time: 3.12 ms


In [20]:
f_basic_features = numpy.hstack(basic_features)

time: 74.6 ms


##### Tf-Idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

time: 71.2 ms


In [22]:
t_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))

time: 82.8 ms


In [23]:
f_tfidf = t_tfidf.fit_transform(data["text"])

time: 3.51 s


With SVD:

In [24]:
from sklearn.decomposition import TruncatedSVD

time: 650 ms


In [25]:
n_components = 20

time: 689 µs


In [26]:
t_svd = TruncatedSVD(n_components=n_components, algorithm="arpack")

time: 169 ms


In [27]:
f_svd = t_svd.fit_transform(f_tfidf)

time: 4.98 s


##### Counters

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

time: 761 µs


In [29]:
t_count = CountVectorizer(stop_words="english", ngram_range=(1, 3))

time: 151 ms


In [30]:
f_counts = t_count.fit_transform(data["text"])

time: 3.51 s


In [31]:
t_svdc = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc = t_svdc.fit_transform(f_counts.astype(float))

time: 2.64 s


##### NLTK stems

In [32]:
import nltk
from nltk.stem.porter import PorterStemmer

time: 1.4 ms


In [33]:
stemmer = PorterStemmer()

time: 161 ms


In [34]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

time: 82.8 ms


In [35]:
def tokenize_s(text):
    tokens = nltk.word_tokenize(text.lower())
    stems = stem_tokens(tokens, stemmer)
    return stems

time: 95.9 ms


In [36]:
t_tfidf_s = TfidfVectorizer(
    tokenizer=tokenize_s,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_s = t_tfidf_s.fit_transform(data["text"])

time: 24.8 s


In [37]:
t_svd_s = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_s = t_svd_s.fit_transform(f_tfidf_s)

time: 3.92 s


In [39]:
t_count_s = CountVectorizer(
    tokenizer=tokenize_s,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_s = t_count_s.fit_transform(data["text"])

time: 26.1 s


In [40]:
t_svdc_s = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_s = t_svdc_s.fit_transform(f_counts_s.astype(float))

time: 3.55 s


##### NLTK lemmas

In [41]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

time: 1.97 ms


In [42]:
lemmatizer = WordNetLemmatizer()

time: 113 ms


In [43]:
def tokenize_l(text):
    return [
        lemmatizer.lemmatize(i, j[0].lower())
        if j[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(i)
        for i, j in pos_tag(
            word_tokenize(text.lower())
        )
    ]

time: 109 ms


In [44]:
t_tfidf_l = TfidfVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_l = t_tfidf_l.fit_transform(data["text"])

time: 59.2 s


In [45]:
t_svd_l = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_l = t_svd_l.fit_transform(f_tfidf_l)

time: 3.74 s


In [46]:
t_count_l = CountVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_l = t_count_l.fit_transform(data["text"])

time: 54.4 s


In [47]:
t_svdc_l = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_l = t_svdc_l.fit_transform(f_counts_l.astype(float))

time: 3.19 s


##### Character-level features

In [48]:
t_tfidf_c = TfidfVectorizer(analyzer="char", ngram_range=(1, 7))
f_tfidf_c = t_tfidf_c.fit_transform(data["text"])

time: 31.4 s


In [49]:
t_svd_c = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_c = t_svd_c.fit_transform(f_tfidf_c)

time: 29.4 s


In [50]:
t_count_c = CountVectorizer(analyzer="char", ngram_range=(1, 7))
f_counts_c = t_count_c.fit_transform(data["text"])

time: 35.7 s


In [51]:
t_svdc_c = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_c = t_svdc_c.fit_transform(f_counts_c.astype(float))

time: 25.1 s


---

### Application

In [52]:
from sklearn.model_selection import StratifiedKFold

time: 758 µs


In [53]:
cv = StratifiedKFold(n_splits=5)

time: 367 ms


In [54]:
from sklearn.model_selection import cross_val_predict

time: 84.7 ms


In [55]:
from sklearn.metrics import log_loss

time: 84.2 ms


In [56]:
import scipy.sparse

time: 73.1 ms


In [57]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    if any(map(
        lambda z: type(z) in [
            scipy.sparse.csr_matrix,
            scipy.sparse.csc_matrix,
            scipy.sparse.coo_matrix,
        ],
        features,
    )):
        hstack = scipy.sparse.hstack
    else:
        hstack = numpy.hstack
    
    f_all = hstack(features)
    f_train = f_all[numpy.nonzero(i_train)]
    f_test = f_all[numpy.nonzero(i_test)]
    
    p_cv = cross_val_predict(
        model,
        f_train,
        y_train,
        cv=cv,
        method="predict_proba",
    )
    q_cv = log_loss(y_train, p_cv)
    
    model.fit(f_train, y_train)
    
    p_train = model.predict_proba(f_train)
    q_train = log_loss(y_train, p_train)
    
    if evaluate:
        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()
    
    if predict:
        p_test = model.predict_proba(f_test)
        p_full = numpy.concatenate((p_cv, p_test), axis=0)
        return pandas.DataFrame(p_full)

time: 85.1 ms


---

### Models

##### Logistic regression

In [58]:
from sklearn.linear_model import LogisticRegression

time: 84.6 ms


Basic features:

In [104]:
p_lr_basic = apply_model(
    LogisticRegression(max_iter=10),
    basic_features,
    predict=True,
)

train log loss = 1.00201
   cv log loss = 1.00285

time: 892 ms


In [105]:
po_lr_basic = apply_model(
    LogisticRegression(max_iter=100, penalty='l2', C=1e3),
    basic_features,
    predict=True,
)

train log loss = 1.00128
   cv log loss = 1.00213

time: 1.37 s


TF-IDF:

In [106]:
p_lr_tfidf = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.60284
   cv log loss = 0.83517

time: 8.77 s


In [107]:
po_lr_tfidf = apply_model(
    LogisticRegression(C=300, penalty="l2"),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.01544
   cv log loss = 0.51201

time: 20.2 s


SVD:

In [108]:
p_lr_svd = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd, ),
    predict=True,
)

train log loss = 0.97553
   cv log loss = 0.98460

time: 1.03 s


In [109]:
po_lr_svd = apply_model(
    LogisticRegression(C=1e6, penalty="l2"),
    (f_svd, ),
    predict=True,
)

train log loss = 0.91699
   cv log loss = 0.92039

time: 1.58 s


Counts:

In [62]:
p_lr_counts = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts, ),
    predict=True,
)

train log loss = 0.11811
   cv log loss = 0.53677

time: 24 s


In [114]:
po_lr_counts = apply_model(
    LogisticRegression(C=3, penalty="l2"),
    (f_counts, ),
    predict=True,
)

train log loss = 0.05867
   cv log loss = 0.53203

time: 24.3 s


In [63]:
p_lr_svdc = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc, ),
    predict=True,
)

train log loss = 0.98263
   cv log loss = 0.98512

time: 2.07 s


In [113]:
po_lr_svdc = apply_model(
    LogisticRegression(C=1e6, penalty="l2"),
    (f_svdc, ),
    predict=True,
)

train log loss = 0.98163
   cv log loss = 0.98401

time: 1.63 s


Character:

In [64]:
p_lr_tfidf_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.47451
   cv log loss = 0.62743

time: 1min 57s


In [None]:
po_lr_tfidf_c = apply_model(
    LogisticRegression(C=100),
    (f_tfidf_c, ),
    predict=True,
)

In [65]:
p_lr_svd_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_c, ),
    predict=True,
)

train log loss = 0.86968
   cv log loss = 0.87547

time: 1.56 s


In [None]:
po_lr_svd_c = apply_model(
    LogisticRegression(C=1e6),
    (f_svd_c, ),
    predict=True,
)

In [66]:
p_lr_counts_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.27036
   cv log loss = 0.46097

time: 2min 52s


In [122]:
po_lr_counts_c = apply_model(
    LogisticRegression(C=0.1),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.06079
   cv log loss = 0.43166

time: 6min 52s


In [67]:
p_lr_svdc_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_c, ),
    predict=True,
)

train log loss = 0.93961
   cv log loss = 0.94175

time: 2 s


In [None]:
for x in [1e6, 1e5, 1e4, 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2, 1e-3]:
    print(x)
    for y in ['l1', 'l2']:
        print(y)
        apply_model(
            LogisticRegression(C=x, penalty=y),
            (f_svdc_c, ),
        )

NLTK stems:

In [68]:
p_lr_tfidf_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.56389
   cv log loss = 0.77756

time: 14.4 s


In [69]:
p_lr_svd_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_s, ),
    predict=True,
)

train log loss = 0.89632
   cv log loss = 0.90756

time: 1.62 s


In [70]:
p_lr_counts_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.08449
   cv log loss = 0.49929

time: 46.8 s


In [71]:
p_lr_svdc_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_s, ),
    predict=True,
)

train log loss = 0.94342
   cv log loss = 0.94601

time: 3.44 s


NLTK lemmas:

In [72]:
p_lr_tfidf_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.57171
   cv log loss = 0.78810

time: 13.9 s


In [73]:
p_lr_svd_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_l, ),
    predict=True,
)

train log loss = 0.91920
   cv log loss = 0.92882

time: 1.61 s


In [74]:
p_lr_counts_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.09439
   cv log loss = 0.51219

time: 36.5 s


In [75]:
p_lr_svdc_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_l, ),
    predict=True,
)

train log loss = 0.91874
   cv log loss = 0.92122

time: 2.93 s


##### Calibrated

In [76]:
from sklearn.calibration import CalibratedClassifierCV

time: 164 ms


##### Naive Bayes

In [77]:
from sklearn.naive_bayes import MultinomialNB

time: 41.2 ms


TF-IDF:

In [78]:
p_nb_tfidf = apply_model(
    MultinomialNB(),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.46620
   cv log loss = 0.84280

time: 786 ms


***Note:*** SVD - does not work for NB.

Counts:

In [79]:
p_nb_counts = apply_model(
    MultinomialNB(),
    (f_counts, ),
    predict=True,
)

train log loss = 0.03011
   cv log loss = 0.45322

time: 705 ms


Character:

In [80]:
p_nb_tfidf_c = apply_model(
    MultinomialNB(),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.65026
   cv log loss = 0.91851

time: 6.14 s


In [81]:
p_nb_counts_c = apply_model(
    MultinomialNB(),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.96735
   cv log loss = 3.77831

time: 6.46 s


NLTK stems:

In [82]:
p_nb_tfidf_s = apply_model(
    MultinomialNB(),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.44457
   cv log loss = 0.80402

time: 865 ms


In [83]:
p_nb_counts_s = apply_model(
    MultinomialNB(),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.02554
   cv log loss = 0.64781

time: 856 ms


NLTK lemmas:

In [84]:
p_nb_tfidf_l = apply_model(
    MultinomialNB(),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.44638
   cv log loss = 0.80741

time: 773 ms


In [85]:
p_nb_counts_l = apply_model(
    MultinomialNB(),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.03101
   cv log loss = 0.62511

time: 792 ms


##### XGBoost

In [86]:
from xgboost import XGBClassifier

time: 477 ms




Basic features:

In [87]:
apply_model(
    XGBClassifier(n_estimators=100),
    basic_features,
)

train log loss = 0.96825
   cv log loss = 0.99003

time: 48.4 s


SVD:

In [88]:
apply_model(
    XGBClassifier(n_estimators=100),
    (f_svd, ),
)

train log loss = 0.79868
   cv log loss = 0.83843

time: 53.9 s


##### Stacking

In [89]:
p_xgb_4 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
    predict=True,
)

train log loss = 0.26724
   cv log loss = 0.30961

time: 3min 11s


Calibrated:

In [90]:
p_ci_xgb_4 = apply_model(
    CalibratedClassifierCV(
        XGBClassifier(n_estimators=100),
        method="isotonic",
    ),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
    predict=True,
)

train log loss = 0.26171
   cv log loss = 0.30990

time: 6min 47s


---

### Submission

In [91]:
import os.path

time: 932 µs


In [92]:
submission_path = os.path.realpath('./output')

time: 118 ms


In [93]:
def make_submission(predictions, file_name):
    file_name = os.path.join(
        submission_path,
        os.path.basename(file_name),
    )
    predictions = predictions.copy()
    predictions.columns = y_train.unique()
    pandas.concat(
        (
            data_test["id"].reset_index(drop=True),
            predictions[i_test].reset_index(drop=True),
        ),
        axis=1,
    ).to_csv(file_name, index=False)

time: 84.1 ms
