In [1]:
%load_ext autotime

---

### Conventions

| entity | semantic |
|-|-|
| `data_` | dataframes containing both raw data and targets |
| `i_` | indices in `data_` objects |
| `y_` | targets |
| `f_` | features |
| `p_` | predictions of models (incl. cross-validated) |
| `q_` | quality metrics (log loss) |
| `t_` | transformers |

---

### Data

In [2]:
import pandas

time: 412 ms


In [3]:
data_train = pandas.read_csv('./input/train.csv')
data_test = pandas.read_csv('./input/test.csv')
data = pandas.concat([data_train, data_test], ignore_index=True)

time: 78.3 ms


In [4]:
y = data.author

i_train = ~y.isnull()
i_test = y.isnull()

y_train = y[i_train]
y_test = y[i_test]

time: 77.7 ms


---

### Features

##### Basic features

In [5]:
import re

time: 91.1 ms


In [6]:
def words(s):
    return re.findall(r'\w+', s)

time: 84.7 ms


Number of words:

In [7]:
f_n_words = (
    data["text"]
    .apply(lambda s: len(words(s)))
    .as_matrix()
    [:, None]
)

time: 328 ms


Number of unique words:

In [8]:
f_n_unique_words = (
    data["text"]
    .apply(lambda s: len(set(words(s.lower()))))
    .as_matrix()
    [:, None]
)

time: 352 ms


Length of text:

In [9]:
f_n_chars = (
    data["text"]
    .apply(lambda s: len(s))
    .as_matrix()
    [:, None]
)

time: 17.8 ms


Number of stopwords:

In [10]:
from nltk.corpus import stopwords

time: 742 ms


In [11]:
english_stopwords = set(stopwords.words("english"))

time: 3.92 ms


In [12]:
f_n_stopwords = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s.lower())
        if w in english_stopwords
    ]))
    .as_matrix()
    [:, None]
)

time: 419 ms


Number of punctuation characters:

In [13]:
import string

time: 740 µs


In [14]:
f_n_punct = (
    data["text"]
    .apply(lambda s: len([
        c
        for c in s
        if c in string.punctuation
    ]))
    .as_matrix()
    [:, None]
)

time: 437 ms


Number of upper-case words:

In [15]:
f_n_upper_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.isupper()
    ]))
    .as_matrix()
    [:, None]
)

time: 340 ms


Number of title-case words:

In [16]:
f_n_title_words = (
    data["text"]
    .apply(lambda s: len([
        w
        for w in words(s)
        if w.istitle()
    ]))
    .as_matrix()
    [:, None]
)

time: 350 ms


Mean length of words:

In [17]:
import numpy

time: 706 µs


In [18]:
f_mean_word_length = (
    data["text"]
    .apply(lambda s: numpy.mean([
        len(w)
        for w in words(s)
    ]))
    .as_matrix()
    [:, None]
)

time: 1.17 s


All of them:

In [19]:
basic_features = (
    f_n_words,
    f_n_unique_words,
    f_n_chars,
    f_n_stopwords,
    f_n_punct,
    f_n_upper_words,
    f_n_title_words,
    f_mean_word_length,
)

time: 3.04 ms


In [20]:
f_basic_features = numpy.hstack(basic_features)

time: 101 ms


##### Tf-Idf

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

time: 162 ms


In [22]:
t_tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))

time: 85.3 ms


In [23]:
f_tfidf = t_tfidf.fit_transform(data["text"])

time: 4.09 s


With SVD:

In [24]:
from sklearn.decomposition import TruncatedSVD

time: 9.07 ms


In [25]:
n_components = 20

time: 117 ms


In [26]:
t_svd = TruncatedSVD(n_components=n_components, algorithm="arpack")

time: 140 ms


In [27]:
f_svd = t_svd.fit_transform(f_tfidf)

time: 4.55 s


##### Counters

In [28]:
from sklearn.feature_extraction.text import CountVectorizer

time: 980 µs


In [29]:
t_count = CountVectorizer(stop_words="english", ngram_range=(1, 3))

time: 138 ms


In [30]:
f_counts = t_count.fit_transform(data["text"])

time: 3.97 s


In [31]:
t_svdc = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc = t_svdc.fit_transform(f_counts.astype(float))

time: 3.2 s


##### NLTK stems

In [32]:
import nltk
from nltk.stem.porter import PorterStemmer

time: 1.05 ms


In [33]:
stemmer = PorterStemmer()

time: 121 ms


In [34]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

time: 93.8 ms


In [35]:
def tokenize_s(text):
    tokens = nltk.word_tokenize(text.lower())
    stems = stem_tokens(tokens, stemmer)
    return stems

time: 83.6 ms


In [36]:
t_tfidf_s = TfidfVectorizer(
    tokenizer=tokenize_s,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_s = t_tfidf_s.fit_transform(data["text"])

time: 27 s


In [37]:
t_svd_s = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_s = t_svd_s.fit_transform(f_tfidf_s)

time: 4.27 s


In [38]:
t_count_s = CountVectorizer(
    tokenizer=tokenize_s,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_s = t_count_s.fit_transform(data["text"])

time: 27.7 s


In [39]:
t_svdc_s = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_s = t_svdc_s.fit_transform(f_counts_s.astype(float))

time: 4.73 s


##### NLTK lemmas

In [40]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

time: 2.12 ms


In [41]:
lemmatizer = WordNetLemmatizer()

time: 116 ms


In [42]:
def tokenize_l(text):
    return [
        lemmatizer.lemmatize(i, j[0].lower())
        if j[0].lower() in ['a', 'n', 'v']
        else lemmatizer.lemmatize(i)
        for i, j in pos_tag(
            word_tokenize(text.lower())
        )
    ]

time: 106 ms


In [43]:
t_tfidf_l = TfidfVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_tfidf_l = t_tfidf_l.fit_transform(data["text"])

time: 1min 7s


In [44]:
t_svd_l = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_l = t_svd_l.fit_transform(f_tfidf_l)

time: 3.8 s


In [45]:
t_count_l = CountVectorizer(
    tokenizer=tokenize_l,
    stop_words="english",
    ngram_range=(1, 3),
)
f_counts_l = t_count_l.fit_transform(data["text"])

time: 1min 2s


In [46]:
t_svdc_l = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_l = t_svdc_l.fit_transform(f_counts_l.astype(float))

time: 4.28 s


##### Character-level features

In [47]:
t_tfidf_c = TfidfVectorizer(analyzer="char", ngram_range=(1, 7))
f_tfidf_c = t_tfidf_c.fit_transform(data["text"])

time: 33.9 s


In [48]:
t_svd_c = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svd_c = t_svd_c.fit_transform(f_tfidf_c)

time: 35.6 s


In [49]:
t_count_c = CountVectorizer(analyzer="char", ngram_range=(1, 7))
f_counts_c = t_count_c.fit_transform(data["text"])

time: 32.1 s


In [50]:
t_svdc_c = TruncatedSVD(n_components=n_components, algorithm="arpack")
f_svdc_c = t_svdc_c.fit_transform(f_counts_c.astype(float))

time: 28.5 s


##### Word2vec

In [51]:
import tqdm

time: 4.03 ms


In [52]:
!ls -alhS /home/mityajj/nlp_models/

total 16G
-rw-rw-r--  1 mityajj mityajj 5.3G Oct 24  2015 glove.840B.300d.txt
-rw-rw-r--  1 mityajj mityajj 4.7G Oct 25  2015 glove.42B.300d.txt
-rw-rw-r--  1 mityajj mityajj 2.0G Aug 14  2014 glove.twitter.27B.200d.txt
-rw-rw-r--  1 mityajj mityajj 990M Aug 27  2014 glove.6B.300d.txt
-rw-rw-r--  1 mityajj mityajj 975M Aug 14  2014 glove.twitter.27B.100d.txt
-rw-rw-r--  1 mityajj mityajj 662M Aug  5  2014 glove.6B.200d.txt
-rw-rw-r--  1 mityajj mityajj 488M Aug 14  2014 glove.twitter.27B.50d.txt
-rw-rw-r--  1 mityajj mityajj 332M Aug  5  2014 glove.6B.100d.txt
-r--r--r--  1 mityajj mityajj 246M Aug 14  2014 glove.twitter.27B.25d.txt
-rw-rw-r--  1 mityajj mityajj 164M Aug  5  2014 glove.6B.50d.txt
drwxr-xr-x  3 mityajj mityajj 4.0K Dec  4 23:51 .
drwx------ 42 mityajj mityajj 4.0K Dec  4 23:04 ..
drwxr-xr-x  2 mityajj mityajj 4.0K Dec  4 23:51 packed
time: 278 ms


In [53]:
!wc -l /home/mityajj/nlp_models/*.txt

    1917494 /home/mityajj/nlp_models/glove.42B.300d.txt
     400000 /home/mityajj/nlp_models/glove.6B.100d.txt
     400000 /home/mityajj/nlp_models/glove.6B.200d.txt
     400000 /home/mityajj/nlp_models/glove.6B.300d.txt
     400000 /home/mityajj/nlp_models/glove.6B.50d.txt
    2196017 /home/mityajj/nlp_models/glove.840B.300d.txt
    1193514 /home/mityajj/nlp_models/glove.twitter.27B.100d.txt
    1193514 /home/mityajj/nlp_models/glove.twitter.27B.200d.txt
    1193514 /home/mityajj/nlp_models/glove.twitter.27B.25d.txt
    1193514 /home/mityajj/nlp_models/glove.twitter.27B.50d.txt
   10487567 total
time: 28.3 s


In [54]:
fn_glove_6B_50d = "/home/mityajj/nlp_models/glove.6B.50d.txt"  # 400k
fn_glove_6B_100d = "/home/mityajj/nlp_models/glove.6B.100d.txt"  # 400k
fn_glove_27B_50d = "/home/mityajj/nlp_models/glove.twitter.27B.50d.txt"  # 1193514
fn_glove_42B_50d = "/home/mityajj/nlp_models/glove.42B.300d.txt" # 1917494
fn_glove_840B_300d = "/home/mityajj/nlp_models/glove.840B.300d.txt"  # 2196017

time: 3.77 ms


In [55]:
def load_w2v(file_name, total=None):
    with open(file_name, "r") as f:
        return {
            line.split()[0]: numpy.array([float(x) for x in line.split()[1:]])
            for line in tqdm.tqdm(f, total=total)
        }

time: 109 ms


In [56]:
w2v_6B_50d = load_w2v(fn_glove_6B_50d, total=400000)

100%|██████████| 400000/400000 [00:09<00:00, 43065.13it/s]

time: 9.51 s





In [57]:
w2v_6B_100d = load_w2v(fn_glove_6B_100d, total=400000)

100%|██████████| 400000/400000 [00:18<00:00, 21710.26it/s]

time: 18.4 s





In [58]:
def w2v_mean(w2v):
    def w2v_mean_impl(t):
        vectors = [
            w2v[w]
            for w in word_tokenize(t.lower())
            if w in w2v
        ]
        if vectors:
            return numpy.mean(vectors, axis=0)
        else:
            return numpy.zeros_like(next(iter(w2v.items()))[1])
    return w2v_mean_impl

time: 6.47 ms


In [59]:
w2v_6B_50d_mean = w2v_mean(w2v_6B_50d)
w2v_6B_100d_mean = w2v_mean(w2v_6B_100d)

time: 105 ms


In [60]:
f_w2v_6B_50d_mean = numpy.vstack(data.text.apply(w2v_6B_50d_mean).as_matrix())

time: 8.85 s


In [61]:
f_w2v_6B_100d_mean = numpy.vstack(data.text.apply(w2v_6B_100d_mean).as_matrix())

time: 8.63 s


---

### Application

In [62]:
from sklearn.model_selection import StratifiedKFold

time: 794 µs


In [63]:
cv = StratifiedKFold(n_splits=5)

time: 125 ms


In [64]:
from sklearn.model_selection import cross_val_predict

time: 140 ms


In [65]:
from sklearn.metrics import log_loss

time: 95.7 ms


In [66]:
import scipy.sparse

time: 84.5 ms


In [67]:
def apply_model(
    model,
    features,
    evaluate=True,
    predict=False,
):
    if any(map(
        lambda z: type(z) in [
            scipy.sparse.csr_matrix,
            scipy.sparse.csc_matrix,
            scipy.sparse.coo_matrix,
        ],
        features,
    )):
        hstack = scipy.sparse.hstack
    else:
        hstack = numpy.hstack
    
    f_all = hstack(features)
    f_train = f_all[numpy.nonzero(i_train)]
    f_test = f_all[numpy.nonzero(i_test)]
    
    p_cv = cross_val_predict(
        model,
        f_train,
        y_train,
        cv=cv,
        method="predict_proba",
    )
    q_cv = log_loss(y_train, p_cv)
    
    model.fit(f_train, y_train)
    
    p_train = model.predict_proba(f_train)
    q_train = log_loss(y_train, p_train)
    
    if evaluate:
        print(f"train log loss = {q_train:.5f}")
        print(f"   cv log loss = {q_cv:.5f}")
        print()
    
    if predict:
        p_test = model.predict_proba(f_test)
        p_full = numpy.concatenate((p_cv, p_test), axis=0)
        return pandas.DataFrame(p_full)

time: 74.4 ms


---

### Models

##### Logistic regression

In [68]:
from sklearn.linear_model import LogisticRegression

time: 129 ms


Basic features:

In [69]:
p_lr_basic = apply_model(
    LogisticRegression(max_iter=10),
    basic_features,
    predict=True,
)

train log loss = 1.00201
   cv log loss = 1.00285

time: 2.56 s


Word2vec:

In [101]:
p_lr_w2v_6B_50d_mean = apply_model(
    LogisticRegression(max_iter=10),
    (f_w2v_6B_50d_mean, ),
    predict=True,
)

train log loss = 0.83138
   cv log loss = 0.83611

time: 6.08 s


In [102]:
p_lr_w2v_6B_100d_mean = apply_model(
    LogisticRegression(max_iter=10),
    (f_w2v_6B_100d_mean, ),
    predict=True,
)

train log loss = 0.77558
   cv log loss = 0.78560

time: 9.04 s


TF-IDF:

In [72]:
p_lr_tfidf = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.60284
   cv log loss = 0.83517

time: 10.4 s


SVD:

In [73]:
p_lr_svd = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd, ),
    predict=True,
)

train log loss = 0.97553
   cv log loss = 0.98460

time: 1.64 s


Counts:

In [74]:
p_lr_counts = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts, ),
    predict=True,
)

train log loss = 0.11811
   cv log loss = 0.53677

time: 22.7 s


In [75]:
p_lr_svdc = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc, ),
    predict=True,
)

train log loss = 0.98263
   cv log loss = 0.98512

time: 2.43 s


Character:

In [76]:
p_lr_tfidf_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.47451
   cv log loss = 0.62743

time: 1min 45s


In [77]:
p_lr_svd_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_c, ),
    predict=True,
)

train log loss = 0.86968
   cv log loss = 0.87547

time: 2.22 s


In [78]:
p_lr_counts_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.27036
   cv log loss = 0.46097

time: 2min 53s


In [79]:
p_lr_svdc_c = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_c, ),
    predict=True,
)

train log loss = 0.93961
   cv log loss = 0.94175

time: 2.93 s


NLTK stems:

In [80]:
p_lr_tfidf_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.56389
   cv log loss = 0.77756

time: 14.5 s


In [81]:
p_lr_svd_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_s, ),
    predict=True,
)

train log loss = 0.89632
   cv log loss = 0.90756

time: 2.28 s


In [82]:
p_lr_counts_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.08449
   cv log loss = 0.49929

time: 47.4 s


In [83]:
p_lr_svdc_s = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_s, ),
    predict=True,
)

train log loss = 0.94342
   cv log loss = 0.94601

time: 4.92 s


NLTK lemmas:

In [84]:
p_lr_tfidf_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.57171
   cv log loss = 0.78810

time: 13.4 s


In [85]:
p_lr_svd_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_svd_l, ),
    predict=True,
)

train log loss = 0.91920
   cv log loss = 0.92882

time: 2.17 s


In [86]:
p_lr_counts_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.09439
   cv log loss = 0.51219

time: 37 s


In [87]:
p_lr_svdc_l = apply_model(
    LogisticRegression(max_iter=10),
    (f_svdc_l, ),
    predict=True,
)

train log loss = 0.91874
   cv log loss = 0.92122

time: 4.38 s


##### Calibrated

In [88]:
from sklearn.calibration import CalibratedClassifierCV

time: 3.18 ms


##### Naive Bayes

In [89]:
from sklearn.naive_bayes import MultinomialNB

time: 112 ms


TF-IDF:

In [90]:
p_nb_tfidf = apply_model(
    MultinomialNB(),
    (f_tfidf, ),
    predict=True,
)

train log loss = 0.46620
   cv log loss = 0.84280

time: 826 ms


***Note:*** SVD - does not work for NB.

Counts:

In [91]:
p_nb_counts = apply_model(
    MultinomialNB(),
    (f_counts, ),
    predict=True,
)

train log loss = 0.03011
   cv log loss = 0.45322

time: 711 ms


Character:

In [92]:
p_nb_tfidf_c = apply_model(
    MultinomialNB(),
    (f_tfidf_c, ),
    predict=True,
)

train log loss = 0.65026
   cv log loss = 0.91851

time: 6.75 s


In [93]:
p_nb_counts_c = apply_model(
    MultinomialNB(),
    (f_counts_c, ),
    predict=True,
)

train log loss = 0.96735
   cv log loss = 3.77831

time: 7.18 s


NLTK stems:

In [94]:
p_nb_tfidf_s = apply_model(
    MultinomialNB(),
    (f_tfidf_s, ),
    predict=True,
)

train log loss = 0.44457
   cv log loss = 0.80402

time: 812 ms


In [95]:
p_nb_counts_s = apply_model(
    MultinomialNB(),
    (f_counts_s, ),
    predict=True,
)

train log loss = 0.02554
   cv log loss = 0.64781

time: 834 ms


NLTK lemmas:

In [96]:
p_nb_tfidf_l = apply_model(
    MultinomialNB(),
    (f_tfidf_l, ),
    predict=True,
)

train log loss = 0.44638
   cv log loss = 0.80741

time: 776 ms


In [97]:
p_nb_counts_l = apply_model(
    MultinomialNB(),
    (f_counts_l, ),
    predict=True,
)

train log loss = 0.03101
   cv log loss = 0.62511

time: 786 ms


##### XGBoost

In [98]:
from xgboost import XGBClassifier

time: 17.4 ms




Basic features:

In [99]:
apply_model(
    XGBClassifier(n_estimators=100),
    basic_features,
)

train log loss = 0.96825
   cv log loss = 0.99003

time: 1min 59s


SVD:

In [100]:
apply_model(
    XGBClassifier(n_estimators=100),
    (f_svd, ),
)

train log loss = 0.79868
   cv log loss = 0.83843

time: 2min 16s


Word2vec:

In [103]:
apply_model(
    XGBClassifier(n_estimators=100),
    (f_w2v_6B_50d_mean, ),
)

train log loss = 0.79076
   cv log loss = 0.84265

time: 1min 29s


In [104]:
apply_model(
    XGBClassifier(n_estimators=100),
    (f_w2v_6B_100d_mean, ),
)

train log loss = 0.74976
   cv log loss = 0.80907

time: 46.3 s


##### Stacking

In [105]:
p_xgb_6 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
#     predict=True,
)

train log loss = 0.26724
   cv log loss = 0.30961

time: 3min 17s


In [116]:
p_xgb_6 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_w2v_6B_50d_mean,
        
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
    predict=True,
)

train log loss = 0.26585
   cv log loss = 0.30774

time: 3min 31s


In [109]:
p_xgb_6 = apply_model(
    XGBClassifier(n_estimators=100),
    basic_features + (
        f_w2v_6B_100d_mean,
        
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
#     predict=True,
)

train log loss = 0.26429
   cv log loss = 0.30779

time: 3min 54s


Calibrated:

In [None]:
p_ci_xgb_4 = apply_model(
    CalibratedClassifierCV(
        XGBClassifier(n_estimators=100),
        method="isotonic",
    ),
    basic_features + (
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
    predict=True,
)

In [110]:
p_ci_xgb_6 = apply_model(
    CalibratedClassifierCV(
        XGBClassifier(n_estimators=100),
        method="isotonic",
    ),
    basic_features + (
        f_w2v_6B_50d_mean,
        
        f_svd,
#         f_svd_c,
        f_svd_s,
        f_svd_l,
        
        # none - 31093
        # 1st - 31082
        # 2nd - 31065
        # all - 30999
        
        # SVDs
#         f_svdc,  # 1) 30981
        f_svdc_c,  # 2) 31009
        f_svdc_l,  # 3) 31048
        f_svdc_s,  # 4) 30984
        
        # SVD-trained
        p_lr_svdc,  # 5) 31091
        p_lr_svdc_c,  # 6) 31009
        p_lr_svdc_l,  # 7) 30997
        p_lr_svdc_s,  # 8) 30981
        
        p_lr_basic,
        
        p_lr_tfidf,
        p_lr_svd,
#         p_lr_counts,
        
        p_lr_tfidf_c,
        p_lr_svd_c,
        p_lr_counts_c,
        
        p_lr_tfidf_s,
#         p_lr_svd_s,
        p_lr_counts_s,
        
#         p_lr_tfidf_l,
        p_lr_svd_l,
#         p_lr_counts_l,
        
#         p_nb_tfidf,
        p_nb_counts,
        
        # these two features removed from best model
        p_nb_tfidf_c,
        p_nb_counts_c,
        
        p_nb_tfidf_s,
        p_nb_counts_s,
        
#         p_nb_tfidf_l,
#         p_nb_counts_l,
    ),
    predict=True,
)

train log loss = 0.25934
   cv log loss = 0.30740

time: 9min 4s


---

### Submission

In [112]:
import os.path

time: 1.22 ms


In [113]:
submission_path = os.path.realpath('./output')

time: 131 ms


In [114]:
def make_submission(predictions, file_name):
    file_name = os.path.join(
        submission_path,
        os.path.basename(file_name),
    )
    predictions = predictions.copy()
    predictions.columns = y_train.unique()
    pandas.concat(
        (
            data_test["id"].reset_index(drop=True),
            predictions[i_test].reset_index(drop=True),
        ),
        axis=1,
    ).to_csv(file_name, index=False)

time: 95.6 ms


In [117]:
make_submission(p_xgb_6, "p_xgb_6.csv")

time: 51.8 ms


In [115]:
make_submission(p_ci_xgb_6, "p_ci_xgb_6.csv")

time: 143 ms
