In [15]:
import pandas as pd
import numpy as np
import scipy
import re

from collections import Counter

from scipy.stats import norm

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import mpl_toolkits.mplot3d as mplt3d
from mpl_toolkits.mplot3d import Axes3D 

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB

%matplotlib notebook
%load_ext autoreload
%autoreload 2

# shouldn't be enabled when using interactive 3D plots
# %pylab inline
# pylab.rcParams['figure.figsize'] = (10, 7)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
from utils import *

### Cleaning & preparing the data

In [17]:
# naming chosen for same-length, to look pretty
kicked = pd.read_csv('../data/DISMISSED_final.csv')
stayed = pd.read_csv('../data/UNDISMISSED_final.csv')

In [18]:
kicked.head(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Query,Authors,organisation,abstract,author,co-authors,headings,keywords,publish_date,pubtype,source,title,labels
0,0,0,AU=SARAC CEMAL,SARAC CEMAL,MARMARA UNIV,"In this study, it was aimed to determine the v...",SARAC CEMAL,['SARAC CEMAL'],['Social Sciences'],['Turkish Language and Literature Teacher Trai...,2011-06-01,Journal,KURAM VE UYGULAMADA EGITIM BILIMLERI,"A Proposal of ""Applied Social Activities"" Modu...",1.0
1,1,939,AU=KOSE IBRAHIM,KOSE IBRAHIM,ADIYAMAN UNIV,We assessed the benefit of bone morphogenic pr...,KOSE IBRAHIM,"['KOPARAL MAHMUT', 'KOSE IBRAHIM', 'ATALAY YUS...",['Science & Technology'],"['Bone healing', 'experimental surgery', 'low-...",2016-01-01,Journal,INTERNATIONAL JOURNAL OF CLINICAL AND EXPERIME...,The effects of recombinant human bone morphoge...,1.0
2,2,940,AU=KOSE IBRAHIM,KOSE IBRAHIM,ADIYAMAN UNIV,This study was supported by the Scientific Res...,KOSE IBRAHIM,"['ATALAY YUSUF', 'BOZKURT MEHMET FATIH', 'GONU...",['Science & Technology'],"['amlodipine', 'calcium channel blockers', 'pl...",2015-01-01,Journal,DRUG DESIGN DEVELOPMENT AND THERAPY,The effects of amlodipine and platelet rich pl...,1.0


In [19]:
stayed.head(n=3)

Unnamed: 0.1,Unnamed: 0,abstract,author,co-authors,headings,keywords,organisation,publish_date,pubtype,source,title,labels
0,0,Previous studies have reported a relationship ...,'ISINGOR MELEK',"['KURT ERHAN', 'GULER OZKAN', 'OZBULUT OMER', ...","['Social Sciences', 'Science & Technology']","['suicide', 'ghrelin', 'leptin', 'cholesterol'...",['BAKIRKOY RES & TRAINING HOSP PSYCHIAT & NEUR...,2008-01-01,Journal,JOURNAL OF PSYCHOPHYSIOLOGY,Evaluation of Serum Ghrelin and Leptin Levels ...,0.0
1,1,,'YILMAZ MESUT',"['OZARAS RESAT', 'YILMAZ MESUT', 'METE BIRGUL'...",['Science & Technology'],"['INFECTION', 'ALT']","['CERRAHPASA MED FAC UNIV ISTANBUL AKSARAY', '...",2008-12-01,Journal,DIGESTIVE DISEASES AND SCIENCES,Recognizing Acute Hepatitis C in Hemodialysis ...,0.0
2,2,"Pseudomonas menclocina, a Gram-negative non-fe...",'YILMAZ MESUT',"['MERT ALI', 'YILMAZ MESUT', 'OZARAS RESAT', '...",['Science & Technology'],['INFECTIVE ENDOCARDITIS'],"['UNIV ISTANBUL', 'SIYAMI ERSEK THORAC & CARDI...",2007-01-01,Journal,SCANDINAVIAN JOURNAL OF INFECTIOUS DISEASES,Native valve endocarditis due to Pseudomonas m...,0.0


In [20]:
kicked.shape, stayed.shape

((28830, 15), (79979, 12))

In [21]:
kicked1 = kicked[['author','title', 'labels']]
stayed1 = stayed[['author','title', 'labels']]

stayed1 = stayed1.sample(frac=(1.0 * kicked.shape[0])/stayed.shape[0]) # random_state = 0

# made arrays equal in size
kicked1.shape, stayed1.shape

((28830, 3), (28830, 3))

In [22]:
# check some basic invariants on the input data, all should return True
print(kicked1['labels'].apply(lambda x: x == 1).all())
print(stayed1['labels'].apply(lambda x: x == 0).all())

True
True


In [24]:
df0 = pd.concat([kicked1, stayed1])
df0['labels'] = df0['labels'].apply(lambda x: int(x))

df1 = get_cleaned_dataset(df0)
df1.head()

Unnamed: 0,author,title,labels
0,SARAC CEMAL,a proposal applied social activities module fo...,1
1,KOSE IBRAHIM,the effects recombinant human bone morphogenic...,1
2,KOSE IBRAHIM,the effects amlodipine and platelet rich plasm...,1
3,MALKOC SIDDIK,a patient and family centered care approach or...,1
4,MALKOC SIDDIK,treatment post orthodontic white spot lesions ...,1


### Get (title, label) dataset

In [27]:
df2 = get_title_label_dataset(df1)
df2.head()

Unnamed: 0,title,labels
0,a proposal applied social activities module fo...,1
1,the effects recombinant human bone morphogenic...,1
2,the effects amlodipine and platelet rich plasm...,1
3,a patient and family centered care approach or...,1
4,treatment post orthodontic white spot lesions ...,1


### Get (concatenated-titles-per-author, label) dataset

In [28]:
# the interesting observation is that now we have a dataset of (title, label), where label is
# if the author of the article was fired or not. Such dataset may be biased, because for one author
# there can be a lot of different articles. Thus many points are produced with single observation

# Let's try also with another dataset, where we will also have (conc_title, label), where conc_title
# will stay for all the titles of one author, being concatenated

In [40]:
titles_num = df2.shape[0]
kicked_titles_num = df2[df2['labels'] == 1].shape[0]
stayed_titles_num = df2[df2['labels'] == 0].shape[0]
print('For first dataset we have ' + str(titles_num) + ' titles, from them ' + str(kicked_titles_num) + ' kicked and ' + str(stayed_titles_num) + ' stayed')

For first dataset we have 57629 titles, from them 28830 kicked and 28799 stayed


In [45]:
# adf = aggregated data frame
adf2 = get_cleaned_concatenated_titles_label_dataset(df1)
adf2.head()

After aggregation we got 4784 authors, from which 2083 were kicked and 2701 not
Taking prefix of required size for not-kicked
Got aggregated dataset of size 4166
New dataset size after duplicates removal is 4139


Unnamed: 0,labels,title
0,0,investigating aluminum sheet wrinkling during...
1,0,sulcal and gyral anatomy the orbitofrontal co...
2,0,a non contact method for part based process p...
3,1,cement thickness inlay restorations made lith...
4,0,chemical composition and biological activity ...


In [46]:
# important! Now all classification can be made on one of two dataframes, choice is made here!
if True:
    df = adf2
else:
    df = df2

### Naive Bayes classification

In [47]:
data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

X_train = data_train['title']
y_train = data_train['labels']

X_test = data_test['title']
y_test = data_test['labels']

X_train.shape, X_test.shape

((2897,), (1242,))

In [48]:
# Let's do the following model
# Features are unique words
# Samples are titles

# 1) Naive : for every sample we have binary value for every word (present / absent)
# 2) sklearn.CountVectorizer : counting
# 3) sklearn.TfidfVectorizer : with usual counting more weight is given to longer sentences, that's not really
#                               fair, TF-IDF (term frequency _times_ inverse document frequency) also gives
#                               every sample a weight for each present word, but in more sophisticated way

# We are doing (3) classificator

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
print('Train shape: ', X_train_tfidf.shape)

X_test_tfidf = tfidf_vect.transform(X_test)
print('Test  shape: ', X_test_tfidf.shape)

Train shape:  (2897, 26798)
Test  shape:  (1242, 26798)


In [50]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [51]:
np.mean(clf.predict(X_test_tfidf) == y_test)

0.5933977455716586

### word2vec experiments + catboost

In [52]:
# Okay, we had a look at NB combined with TF-IDF
# Let's now work with word2vec. We need to handle sentences, thus we have 2 options:
# 1) Do simple averaging of all the words in sentence
# 2) Do TF-IDF weighting of every word in a sentence and then addition
# Then for classification we use catboost (ie gradient boosting ie combination of decision trees)

# We will try both

In [53]:
import gensim

# Load Google's pretrained model
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('../data/pretrained_models/GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [76]:
data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

X_train = data_train['title']
y_train = data_train['labels']

X_test = data_test['title']
y_test = data_test['labels']

print('Train size: ' + str(X_train.shape[0]) + ' vs test size: ' + str(X_test.shape[0]))

Train size: 2897 vs test size: 1242


In [64]:
df.head(n = 10)

Unnamed: 0,labels,title
0,0,investigating aluminum sheet wrinkling during...
1,0,sulcal and gyral anatomy the orbitofrontal co...
2,0,a non contact method for part based process p...
3,1,cement thickness inlay restorations made lith...
4,0,chemical composition and biological activity ...
5,1,effect nacl salt the permeability base clay l...
6,0,correlates clozapine use after first episode ...
7,0,investigation the effects alpha lipoic acid a...
8,0,an unexpected long term complication genital ...
9,0,urolithiasis infants evaluation risk factors ...


In [56]:
def get_mean_w2v_embeddings(titles):
    embs = []
    for title in titles:
        title_emb = np.zeros(300)
        words = title.split(' ')
        for w in words:
            if w in word2vec_model:
                scalar = 1.
#                 scalar = 1. / len(words)
                
                vector = word2vec_model[w]
                
                title_emb += scalar * vector
        embs.append(title_emb)
    return embs

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Series is provided, indexation is made with iloc
def get_tfidf_w2v_embeddings(titles):
    tfidf_vect = TfidfVectorizer()
    
    titles_tfidf_matrix = tfidf_vect.fit_transform(titles)
    # have matrix, where rows are titles and cols are words from vocabulary
    tfidf_words_indices = {word : index for (word, index) in tfidf_vect.vocabulary_.items()}
    
    embs = []
    for i in range(len(titles)):
        title = titles.iloc[i]
        words = title.split(' ')
        
        # make sparse matrix row a dict:
        matrix_row = titles_tfidf_matrix[i]
        matrix_row_dict = {}
        indices = matrix_row.indices
        data    = matrix_row.data
        for i in range(len(data)):
            matrix_row_dict[indices[i]] = data[i]
        
        title_emb = np.zeros(300)
        for w in words:
            if w in word2vec_model:
                vector = word2vec_model[w]
                
                if w in tfidf_words_indices:
                    word_index = tfidf_words_indices[w]
                    scalar = matrix_row_dict.get(word_index, 0)
                else:
                    scalar = 1. / len(words) # take scalar as in mean
#                     scalar = 1.
                
                title_emb += scalar * vector
                
        embs.append(title_emb)
    return embs

In [77]:
if True:
    X_train_embs = get_mean_w2v_embeddings(X_train)
    X_test_embs  = get_mean_w2v_embeddings(X_test)
else:
    X_train_embs = get_tfidf_w2v_embeddings(X_train)
    X_test_embs  = get_tfidf_w2v_embeddings(X_test)

In [78]:
from catboost import CatBoostClassifier

cbc_model = CatBoostClassifier(iterations=20, learning_rate=0.01, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

0:	learn: 0.6918035	total: 174ms	remaining: 3.3s
1:	learn: 0.6903760	total: 345ms	remaining: 3.1s
2:	learn: 0.6891240	total: 575ms	remaining: 3.26s
3:	learn: 0.6878366	total: 821ms	remaining: 3.28s
4:	learn: 0.6868379	total: 1.02s	remaining: 3.07s
5:	learn: 0.6855826	total: 1.24s	remaining: 2.89s
6:	learn: 0.6844164	total: 1.46s	remaining: 2.71s
7:	learn: 0.6830770	total: 1.65s	remaining: 2.48s
8:	learn: 0.6818916	total: 1.84s	remaining: 2.24s
9:	learn: 0.6807051	total: 2s	remaining: 2s
10:	learn: 0.6797081	total: 2.2s	remaining: 1.8s
11:	learn: 0.6788609	total: 2.43s	remaining: 1.62s
12:	learn: 0.6777693	total: 2.61s	remaining: 1.41s
13:	learn: 0.6765518	total: 2.78s	remaining: 1.19s
14:	learn: 0.6757515	total: 2.98s	remaining: 994ms
15:	learn: 0.6747088	total: 3.21s	remaining: 803ms
16:	learn: 0.6736187	total: 3.42s	remaining: 604ms
17:	learn: 0.6726837	total: 3.63s	remaining: 403ms
18:	learn: 0.6715939	total: 3.84s	remaining: 202ms
19:	learn: 0.6706568	total: 4.03s	remaining: 0us


0.6497584541062802

In [79]:
cbc_model = CatBoostClassifier(iterations=20, learning_rate=0.1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

0:	learn: 0.6803691	total: 164ms	remaining: 3.12s
1:	learn: 0.6711555	total: 330ms	remaining: 2.97s
2:	learn: 0.6636811	total: 536ms	remaining: 3.04s
3:	learn: 0.6556796	total: 742ms	remaining: 2.97s
4:	learn: 0.6475585	total: 917ms	remaining: 2.75s
5:	learn: 0.6395406	total: 1.1s	remaining: 2.58s
6:	learn: 0.6324571	total: 1.28s	remaining: 2.38s
7:	learn: 0.6279194	total: 1.46s	remaining: 2.19s
8:	learn: 0.6224003	total: 1.63s	remaining: 1.99s
9:	learn: 0.6171276	total: 1.83s	remaining: 1.83s
10:	learn: 0.6131675	total: 2.02s	remaining: 1.65s
11:	learn: 0.6097541	total: 2.19s	remaining: 1.46s
12:	learn: 0.6052413	total: 2.37s	remaining: 1.28s
13:	learn: 0.6017060	total: 2.55s	remaining: 1.09s
14:	learn: 0.5977467	total: 2.71s	remaining: 902ms
15:	learn: 0.5940677	total: 2.88s	remaining: 719ms
16:	learn: 0.5916500	total: 3.04s	remaining: 536ms
17:	learn: 0.5895611	total: 3.22s	remaining: 357ms
18:	learn: 0.5875943	total: 3.38s	remaining: 178ms
19:	learn: 0.5842895	total: 3.55s	remainin

0.6578099838969405

In [80]:
cbc_model = CatBoostClassifier(iterations=20, learning_rate=1, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

0:	learn: 0.6278073	total: 181ms	remaining: 3.43s
1:	learn: 0.6123651	total: 351ms	remaining: 3.16s
2:	learn: 0.5951767	total: 534ms	remaining: 3.03s
3:	learn: 0.5836713	total: 741ms	remaining: 2.96s
4:	learn: 0.5712709	total: 936ms	remaining: 2.81s
5:	learn: 0.5658822	total: 1.14s	remaining: 2.66s
6:	learn: 0.5465612	total: 1.37s	remaining: 2.55s
7:	learn: 0.5387028	total: 1.58s	remaining: 2.36s
8:	learn: 0.5354754	total: 1.83s	remaining: 2.23s
9:	learn: 0.5219828	total: 2.03s	remaining: 2.03s
10:	learn: 0.5071981	total: 2.21s	remaining: 1.81s
11:	learn: 0.5051707	total: 2.38s	remaining: 1.59s
12:	learn: 0.4917885	total: 2.56s	remaining: 1.38s
13:	learn: 0.4808361	total: 2.77s	remaining: 1.19s
14:	learn: 0.4734535	total: 2.93s	remaining: 978ms
15:	learn: 0.4650555	total: 3.1s	remaining: 776ms
16:	learn: 0.4639828	total: 3.26s	remaining: 575ms
17:	learn: 0.4574351	total: 3.44s	remaining: 382ms
18:	learn: 0.4489720	total: 3.61s	remaining: 190ms
19:	learn: 0.4412223	total: 3.77s	remainin

0.6159420289855072

In [81]:
cbc_model = CatBoostClassifier(iterations=20, learning_rate=10, depth=6, loss_function='Logloss')
cbc_model.fit(X_train_embs, y_train)
preds_class = cbc_model.predict(X_test_embs)

np.mean(preds_class == y_test)

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


0:	learn: 1.9837736	total: 189ms	remaining: 3.58s
1:	learn: 65.1416292	total: 352ms	remaining: 3.17s
2:	learn: 525.7721073	total: 541ms	remaining: 3.06s
3:	learn: nan	total: 551ms	remaining: 2.2s


Training has stopped (degenerate solution on iteration 3, probably too small l2-regularization, try to increase it)


0.5249597423510467

### sent2vec + catboost

In [67]:
import sent2vec # epfl-made 
sent2vec_model = sent2vec.Sent2vecModel()
sent2vec_model.load_model('../data/pretrained_models/wiki_unigrams.bin') # 600 features

In [68]:
data_train, data_test = train_test_split(df, test_size=0.3) # random_state = 0

X_train = data_train['title']
y_train = data_train['labels']

X_test = data_test['title']
y_test = data_test['labels']

X_train.shape, X_test.shape

((2897,), (1242,))

In [69]:
# emb = model.embed_sentence(X_train.values[0])
X_train_embs = sent2vec_model.embed_sentences(X_train.values)
X_test_embs  = sent2vec_model.embed_sentences(X_test.values)

In [70]:
len(X_train_embs), type(X_train_embs[0]), len(X_train_embs[0]), type(X_train_embs[0][0])

(2897, numpy.ndarray, 600, numpy.float32)

In [71]:
# okay, now we have 600-dim vectors for every sentence

In [72]:
model = CatBoostClassifier(iterations=20, learning_rate=1e-2, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6926772	total: 444ms	remaining: 8.44s
1:	learn: 0.6921521	total: 809ms	remaining: 7.28s
2:	learn: 0.6917532	total: 1.16s	remaining: 6.59s
3:	learn: 0.6911088	total: 1.5s	remaining: 5.99s
4:	learn: 0.6905577	total: 1.84s	remaining: 5.53s
5:	learn: 0.6899111	total: 2.19s	remaining: 5.11s
6:	learn: 0.6891931	total: 2.54s	remaining: 4.71s
7:	learn: 0.6886826	total: 2.87s	remaining: 4.3s
8:	learn: 0.6880764	total: 3.21s	remaining: 3.92s
9:	learn: 0.6875701	total: 3.57s	remaining: 3.57s
10:	learn: 0.6870024	total: 3.92s	remaining: 3.21s
11:	learn: 0.6865093	total: 4.27s	remaining: 2.85s
12:	learn: 0.6859601	total: 4.64s	remaining: 2.5s
13:	learn: 0.6853783	total: 5.01s	remaining: 2.15s
14:	learn: 0.6850455	total: 5.36s	remaining: 1.79s
15:	learn: 0.6846253	total: 5.72s	remaining: 1.43s
16:	learn: 0.6842575	total: 6.04s	remaining: 1.07s
17:	learn: 0.6837380	total: 6.41s	remaining: 712ms
18:	learn: 0.6831759	total: 6.77s	remaining: 356ms
19:	learn: 0.6826389	total: 7.13s	remaining:

0.5797101449275363

In [73]:
model = CatBoostClassifier(iterations=20, learning_rate=1e-1, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6886654	total: 335ms	remaining: 6.37s
1:	learn: 0.6839143	total: 727ms	remaining: 6.54s
2:	learn: 0.6789297	total: 1.09s	remaining: 6.2s
3:	learn: 0.6733863	total: 1.45s	remaining: 5.82s
4:	learn: 0.6684119	total: 1.86s	remaining: 5.58s
5:	learn: 0.6638178	total: 2.24s	remaining: 5.23s
6:	learn: 0.6597300	total: 2.64s	remaining: 4.91s
7:	learn: 0.6559614	total: 3.01s	remaining: 4.52s
8:	learn: 0.6530037	total: 3.35s	remaining: 4.1s
9:	learn: 0.6492160	total: 3.72s	remaining: 3.72s
10:	learn: 0.6460629	total: 4.07s	remaining: 3.33s
11:	learn: 0.6431014	total: 4.44s	remaining: 2.96s
12:	learn: 0.6399529	total: 4.77s	remaining: 2.57s
13:	learn: 0.6368744	total: 5.13s	remaining: 2.2s
14:	learn: 0.6348356	total: 5.45s	remaining: 1.81s
15:	learn: 0.6324356	total: 5.8s	remaining: 1.45s
16:	learn: 0.6310915	total: 6.14s	remaining: 1.08s
17:	learn: 0.6288093	total: 6.49s	remaining: 722ms
18:	learn: 0.6263107	total: 6.84s	remaining: 360ms
19:	learn: 0.6227640	total: 7.21s	remaining: 

0.605475040257649

In [74]:
model = CatBoostClassifier(iterations=20, learning_rate=1, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

0:	learn: 0.6694659	total: 333ms	remaining: 6.33s
1:	learn: 0.6557130	total: 686ms	remaining: 6.17s
2:	learn: 0.6433759	total: 1.01s	remaining: 5.75s
3:	learn: 0.6240297	total: 1.4s	remaining: 5.6s
4:	learn: 0.6161339	total: 1.72s	remaining: 5.16s
5:	learn: 0.6058691	total: 2.11s	remaining: 4.92s
6:	learn: 0.5933744	total: 2.51s	remaining: 4.66s
7:	learn: 0.5853006	total: 2.96s	remaining: 4.45s
8:	learn: 0.5708500	total: 3.41s	remaining: 4.17s
9:	learn: 0.5587854	total: 3.82s	remaining: 3.82s
10:	learn: 0.5494930	total: 4.18s	remaining: 3.42s
11:	learn: 0.5320407	total: 4.55s	remaining: 3.03s
12:	learn: 0.5228499	total: 4.88s	remaining: 2.63s
13:	learn: 0.5072924	total: 5.23s	remaining: 2.24s
14:	learn: 0.4919035	total: 5.58s	remaining: 1.86s
15:	learn: 0.4764114	total: 5.95s	remaining: 1.49s
16:	learn: 0.4626371	total: 6.31s	remaining: 1.11s
17:	learn: 0.4515207	total: 6.67s	remaining: 742ms
18:	learn: 0.4397122	total: 7.01s	remaining: 369ms
19:	learn: 0.4383143	total: 7.33s	remaining

0.6111111111111112

In [75]:
model = CatBoostClassifier(iterations=20, learning_rate=10, depth=6, loss_function='Logloss')
model.fit(X_train_embs, y_train)
preds_class = model.predict(X_test_embs)
np.mean(preds_class == y_test)

learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


0:	learn: 1.3848086	total: 346ms	remaining: 6.57s
1:	learn: 24.3764645	total: 703ms	remaining: 6.33s
2:	learn: 251.7797294	total: 1.06s	remaining: 6.01s
3:	learn: nan	total: 1.08s	remaining: 4.31s


Training has stopped (degenerate solution on iteration 3, probably too small l2-regularization, try to increase it)


0.5257648953301127