In [1]:
%load_ext autotime

---

### Conventions

| entity | semantic |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | indices in `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |
| `t_` | transformers |

---

### Data

In [2]:
import pandas

time: 355 ms


In [3]:
data_train = pandas.read_csv('./input/train.csv')
data_test = pandas.read_csv('./input/test.csv')
data = pandas.concat([data_train, data_test], ignore_index=True)

time: 61.9 ms


In [4]:
y = data.author

i_train = ~y.isnull()
i_test = y.isnull()

y_train = y[i_train]
y_test = y[i_test]

time: 44.8 ms


---

### Features

##### Basic features

In [5]:
import re

time: 88.3 ms


In [6]:
def words(s):
    return re.findall(r'\w+', s)

time: 81.4 ms


Number of words:

In [7]:
f_n_words = (
    data["text"]
    .apply(lambda s: len(words(s)))
    .as_matrix()
    [:, None]
)

time: 391 ms


Number of unique words:

In [8]:
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(words(s.lower()))))
    .as_matrix()
    [:, None]
)

time: 324 ms


Length of text:

In [9]:
f_n_chars = (
    data["text"]
    .apply(lambda s: len(s))
    .as_matrix()
    [:, None]
)

time: 14.4 ms


Number of stopwords:

In [10]:
from nltk.corpus import stopwords

time: 521 ms


In [11]:
english_stopwords = set(stopwords.words("english"))

time: 5.13 ms


In [12]:
f_n_stopwords = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s.lower())
        if w in english_stopwords
    ]))
    .as_matrix()
    [:, None]
)

time: 426 ms


Number of punctuation characters:

In [13]:
import string

time: 943 µs


In [14]:
f_n_punct = (
    data["text"]
    .apply(lambda s: len([
        c
        for c in s
        if c in string.punctuation
    ]))
    .as_matrix()
    [:, None]
)

time: 427 ms


Number of upper-case words:

In [15]:
f_n_upper_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.isupper()
    ]))
    .as_matrix()
    [:, None]
)

time: 306 ms


Number of title-case words:

In [16]:
f_n_title_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.istitle()
    ]))
    .as_matrix()
    [:, None]
)

time: 331 ms


Mean length of words:

In [17]:
import numpy

time: 678 µs


In [18]:
f_mean_word_length = (
    data["text"]
    .apply(lambda s: numpy.mean([
        len(w)
        for w in words(s)
    ]))
    .as_matrix()
    [:, None]
)

time: 1.08 s


All of them:

In [19]:
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

time: 1.83 ms


In [20]:
f_basic_features = numpy.hstack(basic_features)

time: 108 ms


##### Tf-Idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

time: 159 ms


In [22]:
t_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))

time: 94.6 ms


In [23]:
f_tfidf = t_tfidf.fit_transform(data["text"])

time: 3.34 s


With SVD:

In [24]:
from sklearn.decomposition import TruncatedSVD

time: 8.36 ms


In [25]:
n_components = 20

time: 110 ms


In [26]:
t_svd = TruncatedSVD(n_components=n_components, algorithm="arpack")

time: 150 ms


In [27]:
f_svd = t_svd.fit_transform(f_tfidf)

time: 2.95 s


##### Counters

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

time: 756 µs


In [29]:
t_count = CountVectorizer(stop_words="english", ngram_range=(1, 3))

time: 113 ms


In [30]:
f_counts = t_count.fit_transform(data["text"])

time: 3.2 s


##### NLTK stems

In [80]:
import nltk
from nltk.stem.porter import PorterStemmer

time: 998 µs


In [81]:
stemmer = PorterStemmer()

time: 118 ms


In [82]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

time: 84.8 ms


In [156]:
def tokenize_s(text):
    tokens = nltk.word_tokenize(text.lower())
    stems = stem_tokens(tokens, stemmer)
    return stems

time: 2.06 ms


In [158]:
t_tfidf_s = TfidfVectorizer(
    tokenizer=tokenize_s,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_s = t_tfidf_s.fit_transform(data["text"])

time: 23.1 s


In [159]:
t_svd_s = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_s = t_svd_s.fit_transform(f_tfidf_s)

time: 2.77 s


In [160]:
t_count_s = CountVectorizer(
    tokenizer=tokenize,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_s = t_count_s.fit_transform(data["text"])

time: 22.4 s


##### NLTK lemmas

In [143]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

time: 1.25 ms


In [144]:
lemmatizer = WordNetLemmatizer()

time: 1.06 ms


In [153]:
def tokenize_l(text):
    return [
        lemmatizer.lemmatize(i, j[0].lower())
        if j[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(i)
        for i, j in pos_tag(
            word_tokenize(text.lower())
        )
    ]

time: 3.59 ms


In [161]:
t_tfidf_l = TfidfVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_l = t_tfidf_l.fit_transform(data["text"])

time: 49.6 s


In [162]:
t_svd_l = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_l = t_svd_l.fit_transform(f_tfidf_l)

time: 2.53 s


In [163]:
t_count_l = CountVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_l = t_count_l.fit_transform(data["text"])

time: 48.9 s


##### Character-level features

In [31]:
t_tfidf_c = TfidfVectorizer(analyzer="char", ngram_range=(1, 7))
f_tfidf_c = t_tfidf_c.fit_transform(data["text"])

time: 25.2 s


In [32]:
t_svd_c = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_c = t_svd_c.fit_transform(f_tfidf_c)

time: 25.9 s


In [33]:
t_count_c = CountVectorizer(analyzer="char", ngram_range=(1, 7))
f_counts_c = t_count_c.fit_transform(data["text"])

time: 22.9 s


---

### Application

In [34]:
from sklearn.model_selection import StratifiedKFold

time: 1.12 ms


In [35]:
cv = StratifiedKFold(n_splits=5)

time: 123 ms


In [36]:
from sklearn.model_selection import cross_val_predict

time: 126 ms


In [37]:
from sklearn.metrics import log_loss

time: 83.2 ms


In [38]:
import scipy.sparse

time: 83.7 ms


In [39]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    if any(map(
        lambda z: type(z) is scipy.sparse.csr_matrix,
        features,
    )):
        hstack = scipy.sparse.hstack
    else:
        hstack = numpy.hstack
    
    f_all = hstack(features)
    f_train = f_all[numpy.nonzero(i_train)]
    f_test = f_all[numpy.nonzero(i_test)]
    
    p_cv = cross_val_predict(
        model,
        f_train,
        y_train,
        cv=cv,
        method="predict_proba",
    )
    q_cv = log_loss(y_train, p_cv)
    
    model.fit(f_train, y_train)
    
    p_train = model.predict_proba(f_train)
    q_train = log_loss(y_train, p_train)
    
    if evaluate:
        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()
    
    if predict:
        p_test = model.predict_proba(f_test)
        p_full = numpy.concatenate((p_cv, p_test), axis=0)
        return pandas.DataFrame(p_full)

time: 82.4 ms


---

### Models

##### Logistic regression

In [40]:
from sklearn.linear_model import LogisticRegression

time: 84 ms


Basic features:

In [41]:
p_lr_basic = apply_model(
    LogisticRegression(max_iter=10),
    basic_features,
    predict=True,
)

train log loss = 1.00201
   cv log loss = 1.00285

time: 910 ms


TF-IDF:

In [42]:
p_lr_tfidf = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.60284
   cv log loss = 0.83517

time: 9.11 s


SVD:

In [43]:
p_lr_svd = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd, ),
    predict=True,
)

train log loss = 0.97553
   cv log loss = 0.98460

time: 1.04 s


Counts:

In [44]:
p_lr_counts = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts, ),
    predict=True,
)

train log loss = 0.11811
   cv log loss = 0.53677

time: 18.7 s


Character:

In [45]:
p_lr_tfidf_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.47451
   cv log loss = 0.62743

time: 1min 18s


In [46]:
p_lr_svd_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_c, ),
    predict=True,
)

train log loss = 0.86968
   cv log loss = 0.87547

time: 1.09 s


In [47]:
p_lr_counts_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.27036
   cv log loss = 0.46097

time: 2min 5s


NLTK stems:

In [87]:
p_lr_tfidf_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.56389
   cv log loss = 0.77756

time: 11.3 s


In [88]:
p_lr_svd_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_s, ),
    predict=True,
)

train log loss = 0.89632
   cv log loss = 0.90756

time: 1.06 s


In [89]:
p_lr_counts_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.08449
   cv log loss = 0.49929

time: 35.6 s


NLTK lemmas:

In [165]:
p_lr_tfidf_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.57171
   cv log loss = 0.78810

time: 10.1 s


In [166]:
p_lr_svd_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_l, ),
    predict=True,
)

train log loss = 0.91920
   cv log loss = 0.92882

time: 1.13 s


In [167]:
p_lr_counts_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.09439
   cv log loss = 0.51219

time: 27.9 s


##### Calibrated

In [68]:
from sklearn.calibration import CalibratedClassifierCV

time: 788 µs


##### Naive Bayes

In [48]:
from sklearn.naive_bayes import MultinomialNB

time: 1.7 ms


TF-IDF:

In [49]:
p_nb_tfidf = apply_model(
    MultinomialNB(),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.46620
   cv log loss = 0.84280

time: 708 ms


***Note:*** SVD - does not work for NB.

Counts:

In [50]:
p_nb_counts = apply_model(
    MultinomialNB(),
    (f_counts, ),
    predict=True,
)

train log loss = 0.03011
   cv log loss = 0.45322

time: 586 ms


Character:

In [51]:
p_nb_tfidf_c = apply_model(
    MultinomialNB(),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.65026
   cv log loss = 0.91851

time: 4.51 s


In [52]:
p_nb_counts_c = apply_model(
    MultinomialNB(),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.96735
   cv log loss = 3.77831

time: 4.78 s


NLTK stems:

In [90]:
p_nb_tfidf_s = apply_model(
    MultinomialNB(),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.44457
   cv log loss = 0.80402

time: 699 ms


In [91]:
p_nb_counts_s = apply_model(
    MultinomialNB(),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.02554
   cv log loss = 0.64781

time: 700 ms


NLTK lemmas:

In [168]:
p_nb_tfidf_l = apply_model(
    MultinomialNB(),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.44638
   cv log loss = 0.80741

time: 661 ms


In [169]:
p_nb_counts_l = apply_model(
    MultinomialNB(),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.03101
   cv log loss = 0.62511

time: 646 ms


##### XGBoost

In [53]:
from xgboost import XGBClassifier

time: 13.1 ms




Basic features:

In [54]:
apply_model(
    XGBClassifier(n_estimators=100),
    basic_features,
)

train log loss = 0.96825
   cv log loss = 0.99003

time: 2.82 s


SVD:

In [55]:
apply_model(
    XGBClassifier(n_estimators=100),
    (f_svd, ),
)

train log loss = 0.79868
   cv log loss = 0.83843

time: 10.4 s


Stack all the things:

In [56]:
p_xgb_final = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
        p_lr_basic,
        p_lr_counts,
        p_lr_tfidf,
        p_nb_counts,
        p_nb_tfidf,
    ),
    predict=True,
)

train log loss = 0.34991
   cv log loss = 0.38360

time: 17.2 s


Stack all the things with LR:

In [57]:
apply_model(
    LogisticRegression(max_iter=10),
    basic_features + (
        f_svd,
        p_lr_basic,
        p_lr_counts,
        p_lr_tfidf,
        p_nb_counts,
        p_nb_tfidf,
    ),
)

train log loss = 0.50498
   cv log loss = 0.55637

time: 2.24 s


More to stack:

In [113]:
p_xgb_3 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,  # -10) 31183 x
#         f_svd_c,  # -11) 31100
        f_svd_s,  # 1) 32109
        
        p_lr_basic,  # -3) 31164 x
        
        p_lr_tfidf,  # -4) 31153 x
        p_lr_svd,  # -5) 31235 x
#         p_lr_counts,  # -6) 31108
        
        p_lr_tfidf_c,  # -7) 31202 x
        p_lr_svd_c,  # -8) 31117 x
        p_lr_counts_c,  # -9) 32103 x
        
        p_lr_tfidf_s,  # 2) 32044
#         p_lr_svd_s,  # 3) 32068 x
        p_lr_counts_s,  # 4) 31965
        
#         p_nb_tfidf,  # -1) 31134
        p_nb_counts,  # -2) 32615 x
        
        # these two features removed from best model
        p_nb_tfidf_c,  # 7) 31348
        p_nb_counts_c,  # 8) 31184
        
        p_nb_tfidf_s,  # 5) 31777
        p_nb_counts_s,  # 6) 31369
    ),
#     predict=True,
)

train log loss = 0.27102
   cv log loss = 0.31100

time: 30.3 s


In [185]:
p_xgb_3 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,  # 1) 31094 +
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,  # -2) 31119 x
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,  # 2) 31109 x
        p_lr_svd_l,  # 3) 31093
#         p_lr_counts_l,  # 4) 31126 x
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,  # 5) 31099 x
#         p_nb_counts_l,  # 6) 31146 x
    ),
    predict=True,
)

train log loss = 0.26964
   cv log loss = 0.31093

time: 38.4 s


Calibrated:

In [186]:
p_ci_xgb_3 = apply_model(
    CalibratedClassifierCV(
        XGBClassifier(n_estimators=100),
        method="isotonic",
    ),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,  # 1) 31094 +
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,  # -2) 31119 x
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,  # 2) 31109 x
        p_lr_svd_l,  # 3) 31093
#         p_lr_counts_l,  # 4) 31126 x
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,  # 5) 31099 x
#         p_nb_counts_l,  # 6) 31146 x
    ),
    predict=True,
)

train log loss = 0.26513
   cv log loss = 0.31193

time: 1min 16s


---

### Submission

In [61]:
import os.path

time: 712 µs


In [62]:
submission_path = os.path.realpath('./output')

time: 149 ms


In [63]:
def make_submission(predictions, file_name):
    file_name = os.path.join(
        submission_path,
        os.path.basename(file_name),
    )
    predictions = predictions.copy()
    predictions.columns = y_train.unique()
    pandas.concat(
        (
            data_test["id"].reset_index(drop=True),
            predictions[i_test].reset_index(drop=True),
        ),
        axis=1,
    ).to_csv(file_name, index=False)

time: 119 ms


In [187]:
make_submission(p_xgb_3, "p_xgb_3.csv")

time: 44.8 ms


In [188]:
make_submission(p_ci_xgb_3, "p_ci_xgb_3.csv")

time: 142 ms
