In [2]:
import pandas as pd
import numpy as np
import scipy
import re

from collections import Counter

from scipy.stats import norm

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import mpl_toolkits.mplot3d as mplt3d
from mpl_toolkits.mplot3d import Axes3D 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

%matplotlib notebook
%load_ext autoreload
%autoreload 2

# shouldn't be enabled when using interactive 3D plots
# %pylab inline
# pylab.rcParams['figure.figsize'] = (10, 7)


### Cleaning & preparing the data

In [3]:
# naming chosen for same-length, to look pretty
kicked = pd.read_csv('data/dismissed_complete.csv')
stayed = pd.read_csv('data/nodismissed_complete.csv')

In [3]:
kicked.head()

Unnamed: 0.1,Unnamed: 0,Author,Title paper,Journal,Year,Keywords,labels
0,0,AU=AYDIN AYDIN,"[Widespread involvement of hepatic, renal and ...",[TURKISH JOURNAL OF PEDIATRICS],[2007],"[mycotic aneurysm, visceral arteries, mesenter...",1.0
1,1,AU=AYDIN AYDIN,[Temporal Analysis of Finger-Tapping Test in I...,[ACTA PHYSIOLOGICA],[2015],,1.0
2,2,AU=AYDIN AYDIN,[UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS],[ACTA CLINICA CROATICA],[2011],"[Epididymis, Undiffrentiated carcinomas, Metas...",1.0
3,3,AU=AYDIN AYDIN,[Undifferentiated Primary Carcinoma of the Epi...,[UHOD-ULUSLARARASI HEMATOLOJI-ONKOLOJI DERGISI],[2009],"[Epididymis, Undifferentiated carcinoma, Adjuv...",1.0
4,4,AU=GİLGİL ERDAL,[Internal iliac artery pseudoaneurysm - An unu...,[AMERICAN JOURNAL OF PHYSICAL MEDICINE & REHAB...,[2008],"[lumbosacral plexopathy, sciatica, pseudoaneur...",1.0


In [4]:
stayed.head()

Unnamed: 0.1,Unnamed: 0,Author,Title paper,Journal,Year,Keywords,labels
0,0,AU=AU,[ASYMMETRIC LOADING OF AN EXTERNALLY CRACKED E...,[THEORETICAL AND APPLIED FRACTURE MECHANICS],[1990],,0.0
1,1,AU=AU,[HIGH-CAPACITY OIL-WATER SEPARATOR PERFORMANCE...,[ENVIRONMENTAL ENGINEERING : PROCEEDINGS OF TH...,[1990],,0.0
2,2,AU=AU,[NUMERICAL MODELING OF BRIDGE FLAWS IN FIBER R...,[LOCALIZED DAMAGE COMPUTER-AIDED ASSESSMENT AN...,[1990],,0.0
3,3,AU=AU,[INHIBIN - A MARKER OF SERTOLI-CELL FUNCTION],[HUMAN REPRODUCTION /],[1988],,0.0
4,4,AU=AU,[THE MEASUREMENT OF BULK AND SURFACE RECOMBINA...,[CONFERENCE RECORD OF THE TWENTIETH IEEE PHOTO...,[1988],,0.0


In [5]:
kicked.shape, stayed.shape

((12301, 7), (24859, 7))

In [6]:
# Just an experiment to try to learn on journal names, not on article names
# kicked['Title paper'] = kicked['Journal']
# stayed['Title paper'] = stayed['Journal']

In [4]:
kicked1 = kicked[['Author','Title paper', 'labels']]
stayed1 = stayed[['Author','Title paper', 'labels']]
stayed1 = stayed1.sample(frac=(1.0 * kicked.shape[0])/stayed.shape[0]) # random_state = 0

In [8]:
# make arrays equal in size
kicked1.shape, stayed1.shape

((12301, 3), (12301, 3))

In [5]:
# check some basic invariants on the input data, all should return True
print(kicked1['Author'].apply(lambda x: x[:3] == 'AU=').all())
print(stayed1['Author'].apply(lambda x: x[:3] == 'AU=').all())
print(kicked1['labels'].apply(lambda x: x == 1).all())
print(stayed1['labels'].apply(lambda x: x == 0).all())
print(kicked1['Title paper'].apply(lambda x: x[0] == '[' and x[-1] == ']').all())
print(stayed1['Title paper'].apply(lambda x: x[0] == '[' and x[-1] == ']').all())

True
True
True
True
True
True


In [6]:
df0 = pd.concat([kicked1, stayed1])

In [11]:
df0.head()

Unnamed: 0,Author,Title paper,labels
0,AU=AYDIN AYDIN,"[Widespread involvement of hepatic, renal and ...",1.0
1,AU=AYDIN AYDIN,[Temporal Analysis of Finger-Tapping Test in I...,1.0
2,AU=AYDIN AYDIN,[UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS],1.0
3,AU=AYDIN AYDIN,[Undifferentiated Primary Carcinoma of the Epi...,1.0
4,AU=GİLGİL ERDAL,[Internal iliac artery pseudoaneurysm - An unu...,1.0


In [7]:
df1 = df0.copy()
df1['Author'] = df0['Author'].apply(lambda x: x[3:])
df1['Label'] = df0['labels'].apply(lambda x: int(x))
df1['Title paper'] = df0['Title paper'].apply(lambda s: s[1:][:-1])
df1 = df1.drop(columns=['labels'])
df1.head()

Unnamed: 0,Author,Title paper,Label
0,AYDIN AYDIN,"Widespread involvement of hepatic, renal and m...",1
1,AYDIN AYDIN,Temporal Analysis of Finger-Tapping Test in In...,1
2,AYDIN AYDIN,UNDIFFERENTIATED CARCINOMA OF THE EPIDIDYMIS,1
3,AYDIN AYDIN,Undifferentiated Primary Carcinoma of the Epid...,1
4,GİLGİL ERDAL,Internal iliac artery pseudoaneurysm - An unus...,1


In [8]:
# remove trash author names (whose length < 6)
print('Rows to remove:')
print(df1[df1['Author'].apply(lambda s : len(s) < 6)]['Author'])
df2 = df1[df1['Author'].apply(lambda s : len(s) >= 6)]

Rows to remove:
7764      ACM
22552    SGEM
7766      ACM
22548    SGEM
10549    ASEE
4          AU
22550    SGEM
10548    ASEE
3          AU
7767      ACM
Name: Author, dtype: object


In [9]:
def isEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return False
    else:
        return True

In [10]:
# In fact we will do classification on "Title paper", so further working on it
print('Shape of not English words: ', df2[df2['Title paper'].apply(lambda s : not isEnglish(s))].shape)
df3 = df2.copy()
df3['Title paper'] = df2['Title paper'].apply(
    lambda s : s.lower()
).apply(
    lambda s : re.sub(r"[.,/()?:'%\";\[\]!\{\}><]", "", s) # delete all not-letters
).apply(
    lambda s : re.sub(r"[- + = @ & * # |]", " ", s) # substitute defis with spaces
).apply(
    lambda s : re.sub(r"\d", " ", s) # substitute numbers with spaces
).apply(
    lambda s : re.sub(r"\W\w{1,2}\W", " ", s) # naive removal of super-short words
).apply(
    lambda s : re.sub(r"\s+", " ", s) # substitute multiple spaces with one
)
df3 = df3[df3['Title paper'].apply(
    lambda s: s != 'untitled' and s != 'editorial' # drop some common but not-interesting names
)]

Shape of not English words:  (0, 3)


In [11]:
# try to find strange symbols in "Title paper" and print them 
symbols = df3['Title paper'].apply(
    lambda s: ''.join(c for c in s if not c.isalpha() and c != ' ')
)
print(symbols[symbols.apply(lambda s: s != '')])

Series([], Name: Title paper, dtype: object)


In [12]:
# okay, now in df3 in "Title paper" we have clean sentences, great, analysis should work

df4 = df3.drop(columns=['Author'])
df4.head()

Unnamed: 0,Title paper,Label
0,widespread involvement hepatic renal and mesen...,1
1,temporal analysis finger tapping test individu...,1
2,undifferentiated carcinoma the epididymis,1
3,undifferentiated primary carcinoma the epididy...,1
4,internal iliac artery pseudoaneurysm unusual c...,1


In [18]:
# the interesting observation is that now we have a dataset of (title, label), where label is
# if the author of the article was fired or not. Such dataset may be biased, because for one author
# there can be a lot of different articles. Thus many points are produced with single observation

# Let's try also with another dataset, where we will also have (conc_title, label), where conc_title
# will stay for all the titles of one author, being concatenated

In [19]:
df3.head()

Unnamed: 0,Author,Title paper,Label
0,AYDIN AYDIN,widespread involvement hepatic renal and mesen...,1
1,AYDIN AYDIN,temporal analysis finger tapping test individu...,1
2,AYDIN AYDIN,undifferentiated carcinoma the epididymis,1
3,AYDIN AYDIN,undifferentiated primary carcinoma the epididy...,1
4,GİLGİL ERDAL,internal iliac artery pseudoaneurysm unusual c...,1


In [13]:
titles_num = df4.shape[0]
kicked_titles_num = df4[df4['Label'] == 1].shape[0]
stayed_titles_num = df4[df4['Label'] == 0].shape[0]
print('For first dataset we have ' + str(titles_num) + ' titles, from them ' + str(kicked_titles_num) + ' kicked and ' + str(stayed_titles_num) + ' stayed')

# authors_num = len()

For first dataset we have 24590 titles, from them 12301 kicked and 12289 stayed


In [14]:
titles_per_author = {} # author -> article
labels_per_author = {} # author -> label

for i, r in df3.iterrows():
    author = r['Author']
    title = r['Title paper']
    label = int(r['Label'])
    
    titles_per_author[author] = titles_per_author.get(author, '') + ' ' + title # do concatenation
    labels_per_author[author] = label

kicked_cnt = 0
for k, v in labels_per_author.items():
    if v == 1: kicked_cnt+= 1
        
print('After aggregation we got ' + str(len(titles_per_author)) + ' authors, from which ' + str(kicked_cnt) + ' were kicked and ' + str(len(titles_per_author) - kicked_cnt) + ' not')
print('Taking prefix of required size for not-kicked')

authors = []
titles = []
labels = []
stayed_limit = kicked_cnt

for k, v in titles_per_author.items():
    if labels_per_author[k] == 0:
        if stayed_limit > 0: stayed_limit -= 1
        else: continue
    
    authors.append(k)
    titles.append(re.sub(r"\s+", " ", v))    
    labels.append(labels_per_author[k])
    
# aggregated DataFrame
adf4 = pd.DataFrame(data={'Title paper' : titles, 'Label' : labels}) # columns names are ugly, but for backwards-compatibility
print('Got aggregated dataset of size ' + str(len(authors)))


After aggregation we got 8094 authors, from which 2757 were kicked and 5337 not
Taking prefix of required size for not-kicked
Got aggregated dataset of size 5514


In [53]:
print('Dataset size: ' + str(adf4.shape[0]) + ' unique titles in dataset: ' + str(len(set(adf4['Title paper']))))

Dataset size: 5514 unique titles in dataset: 5479


In [57]:
len(list(filter(lambda x: x[1] == 2, Counter(adf4['Title paper']).items())))

31

In [58]:
len(list(filter(lambda x: x[1] > 2, Counter(adf4['Title paper']).items())))

2

In [15]:
labels_per_titles = {}
same_duplicate = 0
diff_duplicate = 0

# we assume, that noone meets 3 times, which +- correct
for i, r in adf4.iterrows():
    title = r['Title paper']
    label = int(r['Label'])
    if title in labels_per_titles:
        if labels_per_titles[title] == label:
            same_duplicate += 1
        else:
            diff_duplicate += 1
    else:
        labels_per_titles[title] = label

In [62]:
same_duplicate, diff_duplicate

(27, 8)

In [16]:
# to clean the data, let's throw away all the duplicates at all, both same and diff, everyone, who meets > 1 times
counter = Counter(adf4['Title paper'])

adf5 = adf4[
    adf4['Title paper'].apply(
        lambda title: counter[title] == 1
    )
]

print('New dataset size after duplicates removal is ' + str(adf5.shape[0]))

New dataset size after duplicates removal is 5471


### Naive Bayes classification

In [67]:
# df = df4
df = adf5

data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

X_train = data_train['Title paper']
y_train = data_train['Label']

X_test = data_test['Title paper']
y_test = data_test['Label']

X_train.shape, X_test.shape

((3812,), (1634,))

In [68]:
# Let's do the following model
# Features are unique words
# Samples are titles

# 1) Naive : for every sample we have binary value for every word (present / absent)
# 2) sklearn.CountVectorizer : counting
# 3) sklearn.TfidfVectorizer : with usual counting more weight is given to longer sentences, that's not really
#                               fair, TF-IDF (term frequency _times_ inverse document frequency) also gives
#                               every sample a weight for each present word, but in more sophisticated way

# We are doing (3) classificator

In [69]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
print('Train shape: ', X_train_tfidf.shape)

X_test_tfidf = tfidf_vect.transform(X_test)
print('Test  shape: ', X_test_tfidf.shape)

Train shape:  (3812, 18801)
Test  shape:  (1634, 18801)


In [70]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [71]:
np.mean(clf.predict(X_test_tfidf) == y_test)

0.572827417380661

### word2vec experiments + catboost

In [72]:
# Okay, we had a look at NB combined with TF-IDF
# Let's now work with word2vec. We need to handle sentences, thus we have 2 options:
# 1) Do simple averaging of all the words in sentence
# 2) Do TF-IDF weighting of every word in a sentence and then addition
# Then for classification we use catboost (ie gradient boosting ie combination of decision trees)

# We will try both

In [19]:
import gensim

# Load Google's pretrained model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [20]:
# df = df4
df = adf5

data_train, data_test = train_test_split(df, test_size=0.5) # random_state = 0

X_train = data_train['Title paper']
y_train = data_train['Label']

X_test = data_test['Title paper']
y_test = data_test['Label']

print('Train size: ' + str(X_train.shape[0]) + ' vs test size: ' + str(X_test.shape[0]))

Train size: 2735 vs test size: 2736


In [75]:
df.head(n = 50)

Unnamed: 0,Label,Title paper
0,1,the relation general anxiety levels anxiety w...
1,1,comparison sugammadex and neostigmine atropin...
2,1,protective effect curcumin carbapenem resista...
3,0,paratesticular tumors and clinicopathologic a...
4,0,re based determination ureteral stone volume ...
5,0,comparison and validation scoring systems a c...
6,0,acute myocardial infarction improving ventric...
7,0,continuous passive motion adhesive capsulitis...
8,0,commercial and ethnic uses satureja sivri kek...
9,0,evaluation the bond strength different adhesi...


In [21]:
def get_mean_w2v_embeddings(titles):
    embs = []
    for title in titles:
        title_emb = np.zeros(300)
        words = title.split(' ')
        for w in words:
            if w in word2vec_model:
                scalar = 1.
#                 scalar = 1. / len(words)
                
                vector = word2vec_model[w]
                
                title_emb += scalar * vector
        embs.append(title_emb)
    return embs

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Series is provided, indexation is made with iloc
def get_tfidf_w2v_embeddings(titles):
    tfidf_vect = TfidfVectorizer()
    
    titles_tfidf_matrix = tfidf_vect.fit_transform(titles)
    # have matrix, where rows are titles and cols are words from vocabulary
    tfidf_words_indices = {word : index for (word, index) in tfidf_vect.vocabulary_.items()}
    
    embs = []
    for i in range(len(titles)):
        title = titles.iloc[i]
        words = title.split(' ')
        
        # make sparse matrix row a dict:
        matrix_row = titles_tfidf_matrix[i]
        matrix_row_dict = {}
        indices = matrix_row.indices
        data    = matrix_row.data
        for i in range(len(data)):
            matrix_row_dict[indices[i]] = data[i]
        
        title_emb = np.zeros(300)
        for w in words:
            if w in word2vec_model:
                vector = word2vec_model[w]
                
                if w in tfidf_words_indices:
                    word_index = tfidf_words_indices[w]
                    scalar = matrix_row_dict.get(word_index, 0)
                else:
                    scalar = 1. / len(words) # take scalar as in mean
#                     scalar = 1.
                
                title_emb += scalar * vector
                
        embs.append(title_emb)
    return embs

In [23]:
if True:
    X_train_embs = get_mean_w2v_embeddings(X_train)
    X_test_embs  = get_mean_w2v_embeddings(X_test)
else:
    X_train_embs = get_tfidf_w2v_embeddings(X_train)
    X_test_embs  = get_tfidf_w2v_embeddings(X_test)

In [25]:
from catboost import CatBoostClassifier

cbc_model = CatBoostClassifier(iterations=20, learning_rate=0.01, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

0:	learn: 0.6895761	total: 166ms	remaining: 3.16s
1:	learn: 0.6863437	total: 260ms	remaining: 2.34s
2:	learn: 0.6828959	total: 354ms	remaining: 2.01s
3:	learn: 0.6792450	total: 450ms	remaining: 1.8s
4:	learn: 0.6757360	total: 541ms	remaining: 1.62s
5:	learn: 0.6726710	total: 631ms	remaining: 1.47s
6:	learn: 0.6691413	total: 728ms	remaining: 1.35s
7:	learn: 0.6659759	total: 824ms	remaining: 1.24s
8:	learn: 0.6629188	total: 920ms	remaining: 1.12s
9:	learn: 0.6600135	total: 1.02s	remaining: 1.02s
10:	learn: 0.6569584	total: 1.11s	remaining: 913ms
11:	learn: 0.6539126	total: 1.23s	remaining: 818ms
12:	learn: 0.6510188	total: 1.33s	remaining: 717ms
13:	learn: 0.6481151	total: 1.42s	remaining: 609ms
14:	learn: 0.6453650	total: 1.52s	remaining: 506ms
15:	learn: 0.6428302	total: 1.61s	remaining: 403ms
16:	learn: 0.6399679	total: 1.71s	remaining: 301ms
17:	learn: 0.6374751	total: 1.8s	remaining: 200ms
18:	learn: 0.6346658	total: 1.91s	remaining: 101ms
19:	learn: 0.6316207	total: 2.01s	remaining

0.7368421052631579

In [95]:
cbc_model = CatBoostClassifier(iterations=20, learning_rate=0.1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)model = CatBoostClassifier(iterations=20, learning_rate=0.1, depth=6, loss_function='Logloss')
cbc_

#Fast text classifier 

#Which words (key words) --> interpretation  

#Cluster vextors -> wich words represent well each cathegory? (sum up all words in one cathegory, 
·# which words are close to the resutl)

0:	learn: 0.6488069	total: 186ms	remaining: 3.54s
1:	learn: 0.6178566	total: 362ms	remaining: 3.25s
2:	learn: 0.5906073	total: 551ms	remaining: 3.12s
3:	learn: 0.5659806	total: 768ms	remaining: 3.07s
4:	learn: 0.5457881	total: 1.02s	remaining: 3.07s
5:	learn: 0.5305273	total: 1.25s	remaining: 2.91s
6:	learn: 0.5183062	total: 1.46s	remaining: 2.72s
7:	learn: 0.5052012	total: 1.66s	remaining: 2.48s
8:	learn: 0.4938492	total: 1.85s	remaining: 2.26s
9:	learn: 0.4824432	total: 2.03s	remaining: 2.03s
10:	learn: 0.4730168	total: 2.25s	remaining: 1.84s
11:	learn: 0.4639888	total: 2.43s	remaining: 1.62s
12:	learn: 0.4575706	total: 2.63s	remaining: 1.41s
13:	learn: 0.4524765	total: 2.8s	remaining: 1.2s
14:	learn: 0.4479556	total: 3s	remaining: 999ms
15:	learn: 0.4416208	total: 3.19s	remaining: 798ms
16:	learn: 0.4361867	total: 3.41s	remaining: 601ms
17:	learn: 0.4308438	total: 3.6s	remaining: 399ms
18:	learn: 0.4252953	total: 3.78s	remaining: 199ms
19:	learn: 0.4210432	total: 3.96s	remaining: 0u

0.8047735618115055

In [96]:
cbc_model = CatBoostClassifier(iterations=20, learning_rate=1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

0:	learn: 0.4879080	total: 189ms	remaining: 3.59s
1:	learn: 0.4557720	total: 367ms	remaining: 3.3s
2:	learn: 0.4416080	total: 553ms	remaining: 3.13s
3:	learn: 0.4268077	total: 774ms	remaining: 3.1s
4:	learn: 0.4209252	total: 956ms	remaining: 2.87s
5:	learn: 0.4108938	total: 1.14s	remaining: 2.66s
6:	learn: 0.4003444	total: 1.35s	remaining: 2.51s
7:	learn: 0.3917893	total: 1.58s	remaining: 2.37s
8:	learn: 0.3860356	total: 1.79s	remaining: 2.19s
9:	learn: 0.3837082	total: 1.98s	remaining: 1.98s
10:	learn: 0.3680311	total: 2.16s	remaining: 1.77s
11:	learn: 0.3602340	total: 2.33s	remaining: 1.55s
12:	learn: 0.3500398	total: 2.54s	remaining: 1.37s
13:	learn: 0.3467383	total: 2.72s	remaining: 1.17s
14:	learn: 0.3326018	total: 2.9s	remaining: 967ms
15:	learn: 0.3282389	total: 3.08s	remaining: 770ms
16:	learn: 0.3276505	total: 3.28s	remaining: 580ms
17:	learn: 0.3215752	total: 3.46s	remaining: 384ms
18:	learn: 0.3166910	total: 3.64s	remaining: 192ms
19:	learn: 0.3103794	total: 3.8s	remaining: 

0.7772337821297429

### sent2vec + catboost

In [None]:
import sent2vec # epfl-made 
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('./data/pretrained_models/wiki_unigrams.bin') # 600 features

In [98]:
# df = df4
df = adf5

data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

X_train = data_train['Title paper']
y_train = data_train['Label']

X_test = data_test['Title paper']
y_test = data_test['Label']

X_train.shape, X_test.shape

((3812,), (1634,))

In [99]:
# emb = model.embed_sentence(X_train.values[0])
X_train_embs = sent2vec_model.embed_sentences(X_train.values)
X_test_embs  = sent2vec_model.embed_sentences(X_test.values)

In [100]:
len(X_train_embs), type(X_train_embs[0]), len(X_train_embs[0]), type(X_train_embs[0][0])

(3812, numpy.ndarray, 600, numpy.float32)

In [101]:
# okay, now we have 600-dim vectors for every sentence

In [102]:
model = CatBoostClassifier(iterations=20, learning_rate=1e-2, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6921994	total: 353ms	remaining: 6.7s
1:	learn: 0.6913647	total: 713ms	remaining: 6.41s
2:	learn: 0.6905139	total: 1.09s	remaining: 6.2s
3:	learn: 0.6895561	total: 1.43s	remaining: 5.73s
4:	learn: 0.6886990	total: 1.81s	remaining: 5.44s
5:	learn: 0.6876661	total: 2.17s	remaining: 5.07s
6:	learn: 0.6869585	total: 2.52s	remaining: 4.68s
7:	learn: 0.6860869	total: 2.87s	remaining: 4.31s
8:	learn: 0.6853812	total: 3.22s	remaining: 3.94s
9:	learn: 0.6844824	total: 3.58s	remaining: 3.58s
10:	learn: 0.6837446	total: 3.92s	remaining: 3.21s
11:	learn: 0.6828041	total: 4.28s	remaining: 2.86s
12:	learn: 0.6819210	total: 4.63s	remaining: 2.49s
13:	learn: 0.6811049	total: 4.99s	remaining: 2.14s
14:	learn: 0.6801637	total: 5.32s	remaining: 1.77s
15:	learn: 0.6793613	total: 5.67s	remaining: 1.42s
16:	learn: 0.6784833	total: 6.02s	remaining: 1.06s
17:	learn: 0.6776294	total: 6.38s	remaining: 708ms
18:	learn: 0.6768629	total: 6.73s	remaining: 354ms
19:	learn: 0.6761846	total: 7.08s	remaining

0.6976744186046512

In [103]:
model = CatBoostClassifier(iterations=20, learning_rate=1e-1, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6841627	total: 327ms	remaining: 6.21s
1:	learn: 0.6771905	total: 697ms	remaining: 6.28s
2:	learn: 0.6681140	total: 1.05s	remaining: 5.97s
3:	learn: 0.6613314	total: 1.4s	remaining: 5.59s
4:	learn: 0.6548034	total: 1.74s	remaining: 5.23s
5:	learn: 0.6469348	total: 2.09s	remaining: 4.88s
6:	learn: 0.6404247	total: 2.44s	remaining: 4.53s
7:	learn: 0.6344413	total: 2.79s	remaining: 4.18s
8:	learn: 0.6296052	total: 3.15s	remaining: 3.84s
9:	learn: 0.6247514	total: 3.51s	remaining: 3.51s
10:	learn: 0.6205616	total: 3.86s	remaining: 3.16s
11:	learn: 0.6145091	total: 4.22s	remaining: 2.81s
12:	learn: 0.6105035	total: 4.57s	remaining: 2.46s
13:	learn: 0.6061513	total: 4.93s	remaining: 2.11s
14:	learn: 0.6006972	total: 5.27s	remaining: 1.75s
15:	learn: 0.5974907	total: 5.61s	remaining: 1.4s
16:	learn: 0.5939123	total: 5.95s	remaining: 1.05s
17:	learn: 0.5892206	total: 6.31s	remaining: 701ms
18:	learn: 0.5849957	total: 6.66s	remaining: 351ms
19:	learn: 0.5812133	total: 7.01s	remaining

0.7074663402692778

In [104]:
model = CatBoostClassifier(iterations=20, learning_rate=1, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6477382	total: 345ms	remaining: 6.55s
1:	learn: 0.6258880	total: 772ms	remaining: 6.95s
2:	learn: 0.6064956	total: 1.16s	remaining: 6.59s
3:	learn: 0.5822261	total: 1.52s	remaining: 6.07s
4:	learn: 0.5660781	total: 1.86s	remaining: 5.58s
5:	learn: 0.5494321	total: 2.2s	remaining: 5.13s
6:	learn: 0.5370378	total: 2.54s	remaining: 4.71s
7:	learn: 0.5209341	total: 2.9s	remaining: 4.34s
8:	learn: 0.5080571	total: 3.25s	remaining: 3.97s
9:	learn: 0.4923026	total: 3.62s	remaining: 3.62s
10:	learn: 0.4865990	total: 3.93s	remaining: 3.22s
11:	learn: 0.4693939	total: 4.29s	remaining: 2.86s
12:	learn: 0.4669019	total: 4.59s	remaining: 2.47s
13:	learn: 0.4524012	total: 4.94s	remaining: 2.12s
14:	learn: 0.4453808	total: 5.28s	remaining: 1.76s
15:	learn: 0.4347320	total: 5.65s	remaining: 1.41s
16:	learn: 0.4196183	total: 6s	remaining: 1.06s
17:	learn: 0.4137137	total: 6.35s	remaining: 706ms
18:	learn: 0.4097078	total: 6.68s	remaining: 352ms
19:	learn: 0.3878707	total: 7.05s	remaining: 0

0.6719706242350061

In [105]:
model = CatBoostClassifier(iterations=20, learning_rate=10, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


0:	learn: 1.5727963	total: 328ms	remaining: 6.22s
1:	learn: 536.3227302	total: 676ms	remaining: 6.09s
2:	learn: nan	total: 699ms	remaining: 3.96s


Training has stopped (degenerate solution on iteration 2, probably too small l2-regularization, try to increase it)


0.5416156670746634