In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os, sys
import json
import pickle
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

from datetime import datetime
from itertools import product

from tqdm import tqdm
tqdm.pandas()

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier as xgb

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline, make_union

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import chi2, SelectKBest, f_classif, f_regression

import pymorphy2
from gensim.models import Word2Vec

In [83]:
# NOTE(review): ad-hoc arithmetic; the result is only displayed and is not
# referenced by any other cell visible in this notebook.
(np.mean([3,5,5,4,5,5,5,5,3,2,3,2,4,2,5,5,4,5,3,5,4,3,4]) * 3 + 10) / 5

4.373913043478261

In [5]:
# Notebook display settings: never truncate columns, print at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

In [6]:
def tokenizer(list_words):
    """Identity tokenizer: return the already-tokenized document unchanged.

    Passed as ``tokenizer=`` to the CountVectorizer instances in this notebook,
    whose documents are lists of tokens rather than raw strings.
    """
    return list_words

morph = pymorphy2.MorphAnalyzer()


def _lemmatized_tokens(text, longer_than, morph):
    """Split ``text`` on non-letter runs, lemmatize Cyrillic tokens, drop short ones."""
    tokens = re.split(r'[^a-zа-яё]+', text.lower())
    # Only tokens that start with a Cyrillic letter go through the morphological analyzer.
    tokens = [
        morph.parse(token)[0].normal_form if re.match(r'[а-яё]', token) else token
        for token in tokens
    ]
    # The length filter runs after lemmatization, exactly as in the original code.
    return [token for token in tokens if len(token) > longer_than]


def get_parsed_url(url, morph=morph):
    """Split a visited URL into a cleaned domain and lemmatized word tokens.

    Parameters
    ----------
    url : str
        Raw URL as found in the visit log.
    morph : pymorphy2.MorphAnalyzer
        Analyzer used to lemmatize Cyrillic tokens (defaults to the module-level one).

    Returns
    -------
    tuple
        ``(domain, domain_simple, path)`` where ``domain`` is the lower-cased netloc
        with a leading ``www.``, any ``&referrer=`` and the characters ``& { } $``
        removed; ``domain_simple`` is the list of domain tokens longer than one
        character; ``path`` is the list of URL-path tokens longer than two characters.
    """
    # Collapse repeated/mixed scheme prefixes and referrer wrappers into a single 'http://'.
    # Raw strings keep the patterns identical while avoiding invalid-escape warnings.
    url = re.sub(r'(http(s)*://)+', 'http://', url)
    url = re.sub(r'(http://&referrer=http://)+', 'http://', url)

    parsed_url = urlparse(unquote(url.strip()))

    domain = re.sub(r'(^www\.)|(&referrer=)|[\&\{\}\$]', '', parsed_url.netloc.lower())

    domain_simple = _lemmatized_tokens(domain, 1, morph)
    path = _lemmatized_tokens(parsed_url.path, 2, morph)

    return domain, domain_simple, path

In [7]:
# Run mode: True reads the labelled dump from disk and (re)fits every artifact;
# False reads rows from stdin and relies on the pickled artifacts.
TRAIN = True
# Seed passed to the stochastic estimators (e.g. the LDA model).
RANDOM_STATE = 42

# Чтение данных

In [9]:
# Load the raw table: the labelled dump when training, TSV rows from stdin otherwise.

if not TRAIN:
    columns = ['gender', 'age', 'uid', 'user_json']

    df = pd.read_table(sys.stdin, sep='\t', header=None, names=columns)
    # `uid == uid` is True for every non-null uid, so all scored rows are flagged as test.
    df['is_test'] = df.eval('uid == uid')
else:
    df = pd.read_csv('../../../data/gender_age_dataset.txt', sep='\t')
    # A row is a test row when both labels are the '-' placeholder.
    df['is_test'] = df['gender'].eq('-') & df['age'].eq('-')


# Decode the JSON payload and keep only its list of visits.
df['user_json_parsed'] = (
    df['user_json']
    .progress_apply(json.loads)
    .apply(lambda payload: payload['visits'])
)

100%|██████████████████████████████████████████████████████████████████████████| 41138/41138 [00:07<00:00, 5607.93it/s]


In [11]:
# Preview the loaded frame, including the parsed visits column.
df.head()

Unnamed: 0,gender,age,uid,user_json,is_test,user_json_parsed
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000...",False,[{'url': 'http://zebra-zoya.ru/200028-chehol-o...
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,"{""visits"": [{""url"": ""http://sweetrading.ru/?p=...",False,"[{'url': 'http://sweetrading.ru/?p=900', 'time..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,"{""visits"": [{""url"": ""http://ru.oriflame.com/pr...",False,[{'url': 'http://ru.oriflame.com/products/prod...
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,"{""visits"": [{""url"": ""http://translate-tattoo.r...",False,[{'url': 'http://translate-tattoo.ru/font-sele...
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,"{""visits"": [{""url"": ""https://mail.rambler.ru/#...",False,"[{'url': 'https://mail.rambler.ru/#/folder/', ..."


# Кодирование таргета

In [12]:
# Build (training) or restore (inference) the label encodings:
#   dict_uid2*_id  uid -> integer class code
#   dict_id2* / dict_*2id  code <-> original label
#   gender_age  every (gender code, age code) pair with a joint class id
if TRAIN:
    gender = df.query('~is_test').set_index('uid')['gender'].astype('category').copy()
    age = df.query('~is_test').set_index('uid')['age'].astype('category').copy()

    dict_uid2gender_id = gender.cat.codes.to_dict()
    dict_uid2age_id = age.cat.codes.to_dict()

    dict_id2gender = dict(enumerate(gender.cat.categories))
    dict_id2age = dict(enumerate(age.cat.categories))

    dict_gender2id = {cat: i for i, cat in enumerate(gender.cat.categories)}
    dict_age2id = {cat: i for i, cat in enumerate(age.cat.categories)}

    gender_age = pd.DataFrame(
        list(product(dict_id2gender.keys(), dict_id2age.keys())),
        columns=['gender', 'age']
    )
    gender_age['gender_age'] = range(gender_age.shape[0])

    # Context managers guarantee each pickle file is flushed and closed.
    for artifact_name, artifact in [
        ('dict_id2gender', dict_id2gender),
        ('dict_id2age', dict_id2age),
        ('dict_gender2id', dict_gender2id),
        ('dict_age2id', dict_age2id),
        ('gender_age', gender_age),
    ]:
        with open(f"pickle/{artifact_name}.pickle", "wb") as fh:
            pickle.dump(artifact, fh)

else:
    # NOTE: pickle.load executes arbitrary code; only load artifacts produced by this notebook.
    with open("pickle/dict_id2gender.pickle", 'rb') as fh:
        dict_id2gender = pickle.load(fh)
    with open("pickle/dict_id2age.pickle", 'rb') as fh:
        dict_id2age = pickle.load(fh)
    with open("pickle/dict_gender2id.pickle", 'rb') as fh:
        dict_gender2id = pickle.load(fh)
    with open("pickle/dict_age2id.pickle", 'rb') as fh:
        dict_age2id = pickle.load(fh)
    with open("pickle/gender_age.pickle", 'rb') as fh:
        gender_age = pickle.load(fh)

# Обработка данных

In [51]:
%%time

# explode & parse url & parse timestamp

# One row per visit: repeat each uid once per visit and flatten the visit dicts.
df_explode = pd.DataFrame({
    'uid': df['uid'].repeat(df['user_json_parsed'].str.len()),
    'user_json_parsed': np.concatenate(df['user_json_parsed'].values)
})
df_explode = df_explode.merge(df.drop('user_json_parsed', axis=1), on='uid')

df_explode['url'] = df_explode['user_json_parsed'].progress_apply(lambda x: x['url'])

# parse time (timestamps are in milliseconds, interpreted as UTC)
ts = df_explode['user_json_parsed'].progress_apply(lambda x: datetime.utcfromtimestamp(int(x['timestamp'])/1000))
df_explode['ts'] = pd.to_datetime(ts)
df_explode['date'] = df_explode['ts'].dt.date
hour_of_day = df_explode['ts'].dt.hour
# Four 6-hour buckets: 0 = 00-05, 1 = 06-11, 2 = 12-17, 3 = 18-23.
df_explode['hour'] = (hour_of_day*4//24).astype(str)
# dayofweek is Monday=0 ... Saturday=5, Sunday=6, so the weekend is `>= 5`
# (the previous `> 5` flagged Sunday only and counted Saturday as a working day).
mask_weekends = df_explode['ts'].dt.dayofweek >= 5
df_explode['weekends'] = mask_weekends.astype(int).astype(str)
df_explode['weekdays'] = (~mask_weekends).astype(int).astype(str)
df_explode['worktime'] = ((9 <= hour_of_day) & (hour_of_day < 18) & ~mask_weekends).astype(int).astype(str)

# drop duplicated: keep one visit per (user, url, day, 6-hour bucket)
mask_duplicated = df_explode.duplicated(subset=['uid', 'url', 'date', 'hour'], keep='first')
# .copy() detaches the filtered frame so the column assignments below do not
# operate on a slice of the original (SettingWithCopyWarning).
df_explode = df_explode[~mask_duplicated].copy()

# parse url
parsed_url = df_explode['url'].progress_apply(get_parsed_url)
df_explode['domain'] = parsed_url.apply(lambda x: x[0])
df_explode['domain_simple'] = parsed_url.apply(lambda x: x[1])
df_explode['path'] = parsed_url.apply(lambda x: x[2])

# save
if TRAIN:
    with open("pickle/df_explode.pickle", "wb") as fh:
        pickle.dump(df_explode, fh)

100%|████████████████████████████████████████████████████████████████████| 5829507/5829507 [00:06<00:00, 852175.03it/s]
100%|████████████████████████████████████████████████████████████████████| 5829507/5829507 [00:08<00:00, 696107.96it/s]
100%|█████████████████████████████████████████████████████████████████████| 2791073/2791073 [02:17<00:00, 20306.00it/s]


Wall time: 3min 25s


In [69]:
# if TRAIN:
#     df_explode = pickle.load(open("pickle/df_explode.pickle", 'rb'))

In [73]:
# Preview the first three exploded visits, transposed so each column is one visit.
df_explode.head(3).T

Unnamed: 0,0,1,2
uid,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777
user_json_parsed,{'url': 'http://zebra-zoya.ru/200028-chehol-or...,{'url': 'http://news.yandex.ru/yandsearch?cl4u...,{'url': 'http://www.sotovik.ru/news/240283-htc...
gender,F,F,F
age,18-24,18-24,18-24
user_json,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000...","{""visits"": [{""url"": ""http://zebra-zoya.ru/2000...","{""visits"": [{""url"": ""http://zebra-zoya.ru/2000..."
is_test,False,False,False
url,http://zebra-zoya.ru/200028-chehol-organayzer-...,http://news.yandex.ru/yandsearch?cl4url=chezas...,http://www.sotovik.ru/news/240283-htc-one-m9-z...
ts,2014-12-27 13:49:04.068000,2015-03-18 08:11:38.001000,2015-03-18 08:11:38
date,2014-12-27,2015-03-18,2015-03-18
hour,2,1,1


In [18]:
%%time

if TRAIN:
    # Search-engine / ad domains excluded from topic and embedding training.
    search_domains = [
        'an.yandex.ru', 'yandex.ru', 'yandex.kz', 'yandex.ua', 'yandex.by',
        'google.ru', 'google.de', 'google.by', 'google.com.ua', 'google.com.eg',
        'google.com', 'google.az', 'google.com.tr', 'google.nl', 'google.fr',
        'google.lt', 'google.com.cy', 'google.es', 'google.ae', 'google.com.tj',
        'google.so', 'google.fi', 'google.tm', 'google.pt', 'google.lv', 'google.be',
        'm.google.com', 'google.co.uz', 'google.se', 'google.cz', 'google.hu',
        'google.co.kr', 'google.bg', 'googlead.ru', 'google.co.nz', 'google.kg',
        'google.com.ph', 'google.com.my', 'google.gr', 'google.co.th',
    ]
    mask_domain_search = df_explode['domain'].isin(search_domains)

    # lda: each user is a "document" made of the distinct domains they visited
    lda = make_pipeline(
        CountVectorizer(tokenizer=tokenizer, lowercase=False, min_df=15, binary=True),
        LatentDirichletAllocation(n_components=20, learning_method='batch', random_state=RANDOM_STATE, n_jobs=-1)
    )

    W = lda.fit_transform(
        df_explode \
            .loc[~mask_domain_search] \
            .groupby('uid')['domain'] \
            .apply(set) \
            .apply(list)
    )
    H = lda.steps[1][1].components_

    # Assign each domain to the topic in which it has the largest weight.
    # NOTE(review): `get_feature_names` is the older scikit-learn spelling
    # (newer releases use `get_feature_names_out`); keep the environment pinned.
    dict_domain2cat_lda = dict(zip(
        lda.steps[0][1].get_feature_names(),
        list(map(lambda x: f'lda_{x}', H.T.argmax(axis=1)))
    ))

    # Context managers guarantee the pickle files are flushed and closed.
    with open("pickle/lda.pickle", "wb") as fh:
        pickle.dump(lda, fh)
    with open("pickle/dict_domain2cat_lda.pickle", "wb") as fh:
        pickle.dump(dict_domain2cat_lda, fh)


    # domain2vec: a "sentence" is the list of domains one user visited on one day
    # NOTE(review): `size=` and `index2word` are gensim < 4.0 spellings.
    model_w2v = Word2Vec(
        df_explode.loc[~mask_domain_search].groupby(['uid', 'date'])['domain'].agg(list),
        min_count=15,
        size=300,
        workers=14,
        window=5,
        sample=1e-3,
    ).wv
    dict_domain2wv = {k: list(model_w2v[k]) for k in model_w2v.index2word}

    with open("pickle/model_w2v.pickle", "wb") as fh:
        pickle.dump(model_w2v, fh)
    with open("pickle/dict_domain2wv.pickle", "wb") as fh:
        pickle.dump(dict_domain2wv, fh)


# Always reload from disk so training and inference runs use the same artifacts.
with open("pickle/lda.pickle", 'rb') as fh:
    lda = pickle.load(fh)
with open("pickle/dict_domain2cat_lda.pickle", 'rb') as fh:
    dict_domain2cat_lda = pickle.load(fh)
with open("pickle/dict_domain2wv.pickle", 'rb') as fh:
    dict_domain2wv = pickle.load(fh)

Wall time: 1min 51s


In [16]:
# sanity check: the seven highest-weighted domains of every LDA topic
H = lda.steps[1][1].components_
tmp = pd.DataFrame({
    'domain': lda.steps[0][1].get_feature_names(),
    # for each domain (column of H): the topic with the largest weight, and that weight
    'theme': H.argmax(axis=0),
    'weight': H.max(axis=0),
})
tmp['rank'] = tmp.groupby('theme')['weight'].rank(method='first', ascending=False)
tmp[tmp['rank'] <= 7].set_index(['rank', 'theme'])['domain'].unstack('rank')

rank,1.0,2.0,3.0,4.0,5.0,6.0,7.0
theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,cache.betweendigital.com,meganovosti.net,fastpic.ru,marketgid.com,fishki.net,wow-impulse.ru,content.adspynet.com
1,knowledge.allbest.ru,otherreferats.allbest.ru,dic.academic.ru,myshared.ru,alleng.ru,znanija.com,festival.1september.ru
2,zaycev.net,gismeteo.ru,myspongebob.ru,game01.ru,xmusic.me,games.mail.ru,stoboi.ru
3,xvideos.com,nudevista.tv,i-sux.com,ruxvideos.ru,tube8.com,drtuber.com,nuvid.com
4,otzovik.com,banki.ru,babyblog.ru,kakprosto.ru,fb.ru,m24.ru,rusnovosti.ru
5,i6.webware.ru,irecommend.ru,tonkosti.ru,sprashivai.ru,womanadvice.ru,tury.ru,galya.ru
6,dns-shop.ru,interfax.ru,mobile-review.com,s7.addthis.com,svyaznoy.ru,citilink.ru,ebay.com
7,hhcdn.ru,irr.ru,ru.jobrapido.com,buhonline.ru,rusprofile.ru,audit-it.ru,domkadrov.ru
8,enter.ru,playcast.ru,eva.ru,muzofon.com,labirint.ru,koolinar.ru,litres.ru
9,mirtesen.ru,actuallno.com,zagopod.com,zakon.mirtesen.ru,temydnya.mirtesen.ru,interesnienovosti.mirtesen.ru,superinteres.mirtesen.ru


In [19]:
# sanity check: nearest neighbours of a known domain in the embedding space

# Context manager closes the pickle file once the vectors are loaded.
with open("pickle/model_w2v.pickle", 'rb') as fh:
    model_w2v = pickle.load(fh)
model_w2v.most_similar('mobile-review.com')

[('android.mobile-review.com', 0.8925578594207764),
 ('rap-game.ru', 0.8034778833389282),
 ('smartphone.ua', 0.7887734770774841),
 ('helpix.ru', 0.7706729769706726),
 ('iguides.ru', 0.7702073454856873),
 ('fcenter.ru', 0.7654719948768616),
 ('hdtracker.me', 0.7602051496505737),
 ('sotovik.ru', 0.7177555561065674),
 ('iconosquare.com', 0.7057325839996338),
 ('megaobzor.com', 0.7013732194900513)]

In [74]:
# attach domain categories and cross each of them with every time feature

time_clms = ['hour', 'weekends', 'weekdays', 'worktime']
list_cat_dicts_names = ['cat_lda']
list_cat_dicts = [dict_domain2cat_lda]

for cat_name, cat_dict in tqdm(zip(list_cat_dicts_names, list_cat_dicts), total=len(list_cat_dicts_names)):
    # domains missing from the mapping fall back to the 'unkn' category
    categories = df_explode['domain'].map(cat_dict).fillna('unkn')
    df_explode[cat_name] = categories
    for time_col in time_clms:
        # e.g. 'lda_3' + '_' + '1' -> 'lda_3_1'
        df_explode[cat_name + '_' + time_col] = categories + '_' + df_explode[time_col]

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.70s/it]


In [75]:
# Preview the first three visits again after the category features were added.
df_explode.head(3).T

Unnamed: 0,0,1,2
uid,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777
user_json_parsed,{'url': 'http://zebra-zoya.ru/200028-chehol-or...,{'url': 'http://news.yandex.ru/yandsearch?cl4u...,{'url': 'http://www.sotovik.ru/news/240283-htc...
gender,F,F,F
age,18-24,18-24,18-24
user_json,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000...","{""visits"": [{""url"": ""http://zebra-zoya.ru/2000...","{""visits"": [{""url"": ""http://zebra-zoya.ru/2000..."
is_test,False,False,False
url,http://zebra-zoya.ru/200028-chehol-organayzer-...,http://news.yandex.ru/yandsearch?cl4url=chezas...,http://www.sotovik.ru/news/240283-htc-one-m9-z...
ts,2014-12-27 13:49:04.068000,2015-03-18 08:11:38.001000,2015-03-18 08:11:38
date,2014-12-27,2015-03-18,2015-03-18
hour,2,1,1


In [36]:
# Domain -> text mapping prepared outside this notebook; the context manager
# closes the file once loading finishes.
with open('pickle/dict_domain2text.pickle', 'rb') as fh:
    dict_domain2text = pickle.load(fh)

In [40]:
# Show the stored text for one domain as an example.
dict_domain2text['sotovik.ru']

'сотовик сотовый телефон цена мобильный телефон каталог телефон обзор мобильный телефон смартфон сотовик новостикаталог телефоновцена купитьфорумкаталог планшетовandroidwindows phoneнесотовые новость барахолкаhi tech weekобзоры статьиновинкиобзор тестыкаталог смартфонованалитик реклама youdo ремонт apple https youdo com apple руб биржа фриланс https freelance youdo com топ производитель applesamsungmicrosofthtclgnokiaasusflymotorolaphilipshuaweilenovosonyztemeizuxiaomileeco letv факт рынок мобильный приложение топовый realme pro плюс обзор neffos восьмой делать ставка красный белый семейный ценность международный признание смартфон hisense прислать российский рынок тридцатка новость лента новость факт рынок мобильный приложениймобильный устройство последний несколько год пережить значительный изменение штуковина который просто позвонить незаменимый помощник'

# Создание финального датасета

In [46]:
%%time

# Per-visit columns that are collected into per-user lists below.
feature_clms = [
    'domain', 'domain_simple', 'path',
    'hour', 'weekends', 'weekdays', 'worktime',
    'cat_lda', 'cat_lda_hour',
    'cat_lda_weekends', 'cat_lda_weekdays', 'cat_lda_worktime', 
]

# Average the word2vec vectors of each user's distinct domains.
# Every uid starts with an all-ones placeholder, which is kept for users none of
# whose domains appear in dict_domain2wv.
# NOTE(review): 300 has to match the Word2Vec `size` used when the embeddings were trained.
dict_uid2domain2vec = {uid: np.ones(300) for uid in df['uid'].unique()}
dict_uid2domain2vec.update(
    df_explode[['uid', 'domain']] \
        .drop_duplicates() \
        .set_index('uid')['domain'] \
        .map(dict_domain2wv) \
        .dropna() \
        .groupby(level=0) \
        .agg(lambda x: x.tolist()) \
        .apply(np.array) \
        .apply(lambda x: np.mean(x, axis=0)) \
        .to_dict()
)

# One row per (uid, is_test); every feature column becomes a list over the user's visits.
df_new = df_explode.groupby(['uid', 'is_test'])[feature_clms].agg(list).reset_index()
# Token-like features are reduced to their distinct values (list order is not meaningful).
df_new['domain'] = df_new['domain'].apply(set).apply(list)
df_new['domain_simple'] = df_new['domain_simple'].apply(np.concatenate).apply(set).apply(list)
df_new['path'] = df_new['path'].apply(np.concatenate).apply(set).apply(list)
df_new['word2vec'] = df_new['uid'].map(dict_uid2domain2vec)
# Concatenate the stored texts of the user's domains; domains without text are skipped.
df_new['text'] = df_new['domain'] \
    .progress_apply(lambda x: ' '.join([dict_domain2text[domain] for domain in x if domain in dict_domain2text]))

if TRAIN:
    # Attach encoded labels; uids missing from the label dicts get NaN.
    df_new['gender'] = df_new['uid'].map(dict_uid2gender_id)
    df_new['age'] = df_new['uid'].map(dict_uid2age_id)
    # Joint class id for each (gender code, age code) pair.
    df_new = df_new.merge(gender_age, on=['gender', 'age'], how='left')

100%|█████████████████████████████████████████████████████████████████████████| 41138/41138 [00:00<00:00, 50867.48it/s]

Wall time: 1min 27s





In [77]:
# Preview the first three per-user rows, transposed so each column is one user.
df_new.head(3).T

Unnamed: 0,0,1,2
uid,0000e7ca-32e6-4bef-bdca-e21c025071ff,0000f3cf-6e9a-4eab-92f4-cefdad108c83,000381a6-0400-40f8-98c0-93a6c0852d2d
is_test,True,True,True
domain,"[retail-tech.ru, biancoloto.com, hotelcosmos.r...","[zenit-penza.ru, google.com, onedivision.ru, d...","[google.ru, russianfood.com]"
domain_simple,"[ucheba, retail, business, ru, biancoloto, com...","[onedivision, classes, ru, com, zenit, dizel, ...","[com, russianfood, google, ru]"
path,"[okeanariuem, msk, west, masok, offers, noch, ...","[synonyms, news, item, russian, all, htm, dict...","[php, recipe, bytype, recipes, iframes, html, ..."
hour,"[1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 3, 3]","[2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 2, 2, 2]"
weekends,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
weekdays,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[1, 1, 1, 1, 1, 1, 1, 1, 1]"
worktime,"[0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]","[1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0]","[0, 0, 0, 1, 1, 1, 1, 1, 1]"
cat_lda,"[unkn, unkn, unkn, lda_1, lda_1, unkn, unkn, l...","[lda_11, unkn, unkn, unkn, unkn, unkn, unkn, l...","[lda_12, lda_12, lda_12, lda_12, lda_12, unkn,..."


# Модель

In [54]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns ``X[col]``.

    Used as the first step of the feature pipelines in this notebook so that each
    vectorizer receives a single column of the per-user frame.
    """

    def __init__(self, col):
        # Key used to index X in transform().
        self.col = col
        

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` unchanged."""
        return self
    

    def transform(self, X, y=None):
        """Return ``X[self.col]``; ``y`` is ignored."""
        return X[self.col]
    
    
    def fit_transform(self, X, y=None):
        """Same result as ``transform(X)``; ``fit`` is not called because it is a no-op."""
        return self.transform(X)
    

    
class Bagging(BaseEstimator):
    """Soft-voting ensemble: averages ``predict_proba`` over its member models.

    Parameters
    ----------
    list_models : list
        Estimators exposing ``fit(X, y)`` and ``predict_proba(X)``. All members are
        fitted on the same ``(X, y)``.
    """

    def __init__(self, list_models):
        self.list_models = list_models


    def fit(self, X, y):
        """Fit every member on ``(X, y)`` and return ``self``.

        Returning ``self`` follows the scikit-learn estimator contract and allows
        chaining such as ``Bagging(models).fit(X, y).predict(X)``.
        """
        for model in self.list_models:
            model.fit(X, y)
        # Remember the label order of the probability columns when a member exposes
        # it, so predict() can return labels rather than bare column indices.
        first_model = self.list_models[0] if self.list_models else None
        self.classes_ = getattr(first_model, 'classes_', None)
        return self


    def _get_list_predict_proba(self, X):
        """Return one probability matrix per member model."""
        return [model.predict_proba(X) for model in self.list_models]


    def predict_proba(self, X):
        """Return the element-wise mean of the members' probability matrices."""
        # dstack -> (n_samples, n_classes, n_models); average over the model axis.
        return np.dstack(self._get_list_predict_proba(X)).mean(axis=2)


    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Labels come from ``classes_`` when it is known; otherwise the column index
        is returned, which is the same thing when labels are ``0..n_classes-1``.
        """
        best_column = np.argmax(self.predict_proba(X), axis=1)
        # getattr keeps instances pickled before `classes_` existed working.
        classes = getattr(self, 'classes_', None)
        if classes is None:
            return best_column
        return np.asarray(classes)[best_column]

In [55]:
# features
def _token_features(column, top_k=None):
    """Binary bag-of-tokens pipeline over one list-valued column.

    When ``top_k`` is given, a chi2 ``SelectKBest`` keeping ``top_k`` features
    is appended as the last step.
    """
    steps = [
        FeatureSelector(column),
        CountVectorizer(tokenizer=tokenizer, lowercase=False, binary=True),
    ]
    if top_k is not None:
        steps.append(SelectKBest(chi2, k=top_k))
    return make_pipeline(*steps)


domain = _token_features('domain', top_k=8000)
domain_simple = _token_features('domain_simple', top_k=8000)
path = _token_features('path', top_k=8000)
cat_lda = _token_features('cat_lda')
# Defined for completeness; not part of any model below.
word2vec = make_pipeline(FeatureSelector('word2vec'))
text = make_pipeline(
    FeatureSelector('text'),
    TfidfVectorizer(analyzer='word', min_df=10, max_df=0.8, lowercase=False, binary=False, use_idf=True),
    SelectKBest(chi2, k=20000),
)


# models: one Naive Bayes per feature family, averaged by Bagging
domain_model = make_pipeline(make_union(domain, cat_lda), MultinomialNB(alpha=1.0))
domain_simple_model = make_pipeline(make_union(domain_simple, cat_lda), MultinomialNB(alpha=1.2))
path_model = make_pipeline(make_union(path, cat_lda), MultinomialNB(alpha=1.0))
text_model = make_pipeline(text, MultinomialNB(alpha=0.7))

list_models = [domain_model, domain_simple_model, path_model, text_model]
final_model = Bagging(list_models)

In [57]:
%%time

# sanity check: 5-fold stratified cross-validation on the labelled users

# Labels (and the `gender_age` column) exist only in training mode, so the cell is
# skipped when TRAIN is False instead of failing with a KeyError.
if TRAIN:
    # Use the shared seed rather than a second hard-coded 42.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    X = df_new.query('~is_test').copy()
    y = X['gender_age'].copy()

    score = cross_val_score(final_model, X, y, cv=skf, scoring='accuracy', n_jobs=1)
    print(np.mean(score))
    print(score)

0.32110381541614885
[0.31549841 0.31940794 0.33019651 0.31663438 0.32378184]
Wall time: 3min 17s


In [684]:
%%time

# fit

if TRAIN:

    # Train only on labelled users with more than 4 distinct domains.
    mask = df_new['domain'].apply(len) > 4
    X = df_new.loc[mask].query('~is_test').copy()
    y = X['gender_age'].copy()

    final_model.fit(X, y)
    # Context manager guarantees the model file is flushed and closed.
    with open("pickle/final_model.pickle", "wb") as fh:
        pickle.dump(final_model, fh)

Wall time: 40.2 s


In [None]:
# predict

# NOTE: pickle.load executes arbitrary code; only load a model produced by this notebook.
with open("pickle/final_model.pickle", 'rb') as fh:
    final_model = pickle.load(fh)

predict_proba = final_model.predict_proba(df_new)
# NOTE(review): the column index is used as the joint class id, which assumes the
# model was trained on labels 0..n_classes-1 with every class present.
predict = predict_proba.argmax(axis=1)
# Withhold the less confident half: rows whose top probability is below the median.
top_proba = predict_proba.max(axis=1)
mask_bad_predict = top_proba < np.median(top_proba)

# Decode the joint class id back into the original gender / age labels.
output = df_new[['uid']].copy()
output['gender_age'] = predict
output = output.merge(gender_age, on='gender_age', how='left')
output['gender'] = output['gender'].map(dict_id2gender)
output['age'] = output['age'].map(dict_id2age)

output.loc[mask_bad_predict, 'gender'] = '-'
output.loc[mask_bad_predict, 'age'] = '-'

output = output[['uid', 'gender', 'age']].sort_values(by='uid', axis=0, ascending=True)

if not TRAIN:
    sys.stdout.write(output.to_json(orient='records'))