In [238]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os, sys
import json
import pickle

In [239]:
from tqdm import tqdm
tqdm.pandas()

### Чтение данных

In [240]:
file_path = '/data/share/project01/gender_age_dataset.txt'

In [241]:
# Row limit passed to read_csv: None loads the whole file,
# a small value (e.g. 10) is handy for a quick trial run.
nrows = None  # e.g. 10
df = pd.read_csv(file_path, sep='\t', nrows=nrows)

In [242]:
df.head()

Unnamed: 0,gender,age,uid,user_json
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000..."
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,"{""visits"": [{""url"": ""http://sweetrading.ru/?p=..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,"{""visits"": [{""url"": ""http://ru.oriflame.com/pr..."
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,"{""visits"": [{""url"": ""http://translate-tattoo.r..."
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,"{""visits"": [{""url"": ""https://mail.rambler.ru/#..."


### Чистка данных

In [243]:
# Sanity check: in every row the two targets are either both filled in
# or both missing (a missing target is encoded as '-').
assert len(df.loc[(df['age'] == '-') & (df['gender'] != '-')]) == 0
assert len(df.loc[(df['gender'] == '-') & (df['age'] != '-')]) == 0

In [244]:
# Keep only the labelled rows (both targets known); these are the training data.
df = df.loc[(df['age'] != '-') & (df['gender'] != '-')]

### Генерация признаков

In [277]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

In [278]:
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

In [279]:
class ExtractDomainTransformer(BaseEstimator, TransformerMixin):
    """Turn each user's raw visit log into a list of visited domains.

    ``transform`` expects ``X`` to be a DataFrame with a ``user_json`` column
    holding a JSON string shaped like ``{"visits": [{"url": ...}, ...]}`` and
    returns a Series (same index) whose values are lists of domain strings.
    The transformer is stateless: ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the transformer API)."""
        return self

    def transform(self, X, y=None):
        """Map every ``user_json`` value to the list of domains it contains."""
        visits_json = X['user_json']
        # ``progress_apply`` exists only after ``tqdm.pandas()`` has been called
        # in the current process; fall back to plain ``apply`` so the (pickled)
        # transformer also works where tqdm was not registered.
        apply = getattr(visits_json, 'progress_apply', visits_json.apply)
        return apply(self._domains_from_json)

    def _domains_from_json(self, raw_json):
        """Parse one ``user_json`` string and return its list of domains."""
        domains = (self.url2domain(visit['url'])
                   for visit in json.loads(raw_json)['visits'])
        # URLs that cannot be resolved to a domain yield None; drop them so the
        # downstream vectorizer only ever sees string tokens.
        return [domain for domain in domains if domain is not None]

    @staticmethod
    def url2domain(url):
        """Return the network location of ``url`` without a leading ``www.``.

        Returns ``None`` when the URL has no http(s) scheme or cannot be parsed.
        """
        # Collapse repeated scheme prefixes ("http://https://...") and
        # normalise https to http before parsing.
        url = re.sub(r'(http(s)*://)+', 'http://', url)
        try:
            parsed_url = urlparse(unquote(url.strip()))
        except ValueError:
            # urlparse rejects some malformed network locations (e.g. a broken
            # IPv6 literal); treat such URLs as having no domain.
            return None
        if parsed_url.scheme not in ('http', 'https'):
            return None
        # The pattern always matches (group 1 may be empty), so no None check
        # is needed on the result.
        return re.search(r'(?:www\.)?(.*)', parsed_url.netloc).group(1).strip()

In [280]:
from sklearn.feature_extraction.text import CountVectorizer

In [281]:
class ToFloatTransformer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that casts the feature matrix to ``float64``."""

    def fit(self, X, y=None):
        """Nothing is learned; the instance is returned unchanged."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` converted via ``astype`` to ``np.float64``."""
        target_dtype = np.float64
        converted = X.astype(target_dtype)
        return converted

### Кодирование целевой переменной

In [282]:
from sklearn.preprocessing import OrdinalEncoder

In [283]:
enc = OrdinalEncoder()

In [284]:
# Encode both targets as ordinal codes: column 0 is gender, column 1 is age.
y = enc.fit_transform(df[['gender', 'age']])

In [285]:
enc.categories_

[array(['F', 'M'], dtype=object),
 array(['18-24', '25-34', '35-44', '45-54', '>=55'], dtype=object)]

In [286]:
y

array([[0., 0.],
       [1., 1.],
       [0., 1.],
       ...,
       [1., 1.],
       [1., 0.],
       [1., 1.]])

In [300]:
# The whole labelled frame is used as the feature input; the first pipeline
# step reads the 'user_json' column from it.
X = df

### Обучение

In [287]:
from sklearn.model_selection import train_test_split

In [288]:
# Split the data into a training set and a held-out test set (25% of the rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [289]:
class TwoOutputClassifier(BaseEstimator, ClassifierMixin):
    """Predict two target variables with two independent base classifiers.

    Parameters
    ----------
    first_classifier : estimator
        Fitted on, and predicts, the first target column (``y[:, 0]``).
    second_classifier : estimator
        Fitted on, and predicts, the second target column (``y[:, 1]``).

    Both estimators are fitted in place (they are not cloned).
    """

    def __init__(self, first_classifier, second_classifier):
        self.first_classifier = first_classifier
        self.second_classifier = second_classifier

    def fit(self, X, y):
        """Fit each base classifier on its own column of ``y`` and return ``self``.

        ``y`` is a two-column target matrix; it is converted with ``np.asarray``
        so that nested lists or DataFrames are accepted as well as ndarrays.
        """
        targets = np.asarray(y)
        self.first_classifier.fit(X, targets[:, 0])
        self.second_classifier.fit(X, targets[:, 1])
        return self

    def predict(self, X):
        """Return the two predictions stacked column-wise, one row per sample."""
        first_pred = self.first_classifier.predict(X)
        second_pred = self.second_classifier.predict(X)
        return np.stack([first_pred, second_pred], axis=1)

In [290]:
from lightgbm.sklearn import LGBMClassifier

### Построение Pipeline

In [291]:
from sklearn.pipeline import Pipeline

In [305]:
def identity(x):
    """Return the argument unchanged.

    Used as the ``analyzer`` of ``CountVectorizer`` so that each precomputed
    list of domains is taken as the token sequence as-is. Being a named
    module-level function (unlike a lambda), it can be pickled by reference
    together with the pipeline.
    """
    unchanged = x
    return unchanged

In [306]:
# Full model: raw user rows -> lists of domains -> domain count matrix ->
# float64 matrix -> one LightGBM classifier per target.
# analyzer=identity makes CountVectorizer use each domain list as its tokens.
pipeline = Pipeline([('extract_domain', ExtractDomainTransformer()),
                     ('count_domain', CountVectorizer(analyzer=identity)),
                     ('to_float', ToFloatTransformer()),
                     ('clf', TwoOutputClassifier(LGBMClassifier(verbose=2), LGBMClassifier(verbose=2)))],
                    verbose=True)

In [293]:
pipeline.fit(X_train, y_train)


  0%|          | 0/27103 [00:00<?, ?it/s][A
  0%|          | 21/27103 [00:00<02:13, 203.40it/s][A
  0%|          | 89/27103 [00:00<01:45, 257.04it/s][A
  1%|          | 184/27103 [00:00<01:23, 324.22it/s][A
  1%|          | 226/27103 [00:00<01:20, 335.03it/s][A
  1%|          | 267/27103 [00:00<01:23, 320.52it/s][A
  1%|          | 304/27103 [00:00<01:31, 293.44it/s][A
  1%|▏         | 343/27103 [00:00<01:26, 311.07it/s][A
  1%|▏         | 391/27103 [00:00<01:17, 346.70it/s][A
  2%|▏         | 430/27103 [00:01<01:19, 336.69it/s][A
  2%|▏         | 476/27103 [00:01<01:13, 364.73it/s][A
  2%|▏         | 541/27103 [00:01<01:06, 402.05it/s][A
  2%|▏         | 597/27103 [00:01<01:00, 438.86it/s][A
  2%|▏         | 644/27103 [00:01<01:06, 397.79it/s][A
  3%|▎         | 698/27103 [00:01<01:01, 429.35it/s][A
  3%|▎         | 744/27103 [00:01<01:22, 320.49it/s][A
  3%|▎         | 783/27103 [00:01<01:21, 324.20it/s][A
  3%|▎         | 826/27103 [00:02<01:15, 349.81it/s][A
  3%

 30%|███       | 8160/27103 [00:19<00:42, 447.74it/s][A
 30%|███       | 8206/27103 [00:19<00:42, 448.41it/s][A
 30%|███       | 8252/27103 [00:19<00:42, 444.50it/s][A
 31%|███       | 8297/27103 [00:19<00:42, 441.95it/s][A
 31%|███       | 8362/27103 [00:20<00:39, 474.33it/s][A
 31%|███       | 8412/27103 [00:20<00:39, 476.36it/s][A
 31%|███▏      | 8476/27103 [00:20<00:36, 504.91it/s][A
 31%|███▏      | 8528/27103 [00:20<00:37, 489.43it/s][A
 32%|███▏      | 8578/27103 [00:20<00:44, 416.89it/s][A
 32%|███▏      | 8623/27103 [00:20<00:44, 419.20it/s][A
 32%|███▏      | 8667/27103 [00:20<00:49, 373.18it/s][A
 32%|███▏      | 8711/27103 [00:20<00:47, 385.04it/s][A
 32%|███▏      | 8752/27103 [00:21<00:50, 362.87it/s][A
 32%|███▏      | 8790/27103 [00:21<00:50, 364.05it/s][A
 33%|███▎      | 8845/27103 [00:21<00:45, 397.80it/s][A
 33%|███▎      | 8888/27103 [00:21<00:44, 405.28it/s][A
 33%|███▎      | 8930/27103 [00:21<00:48, 377.07it/s][A
 33%|███▎      | 8980/27103 [00

 58%|█████▊    | 15846/27103 [00:36<00:25, 443.31it/s][A
 59%|█████▊    | 15892/27103 [00:37<00:30, 362.50it/s][A
 59%|█████▉    | 15968/27103 [00:37<00:25, 428.77it/s][A
 59%|█████▉    | 16019/27103 [00:37<00:26, 421.67it/s][A
 59%|█████▉    | 16080/27103 [00:37<00:23, 462.62it/s][A
 60%|█████▉    | 16132/27103 [00:37<00:24, 450.97it/s][A
 60%|█████▉    | 16183/27103 [00:37<00:23, 458.35it/s][A
 60%|█████▉    | 16232/27103 [00:37<00:23, 458.85it/s][A
 60%|██████    | 16280/27103 [00:37<00:25, 417.74it/s][A
 60%|██████    | 16332/27103 [00:38<00:24, 433.57it/s][A
 60%|██████    | 16377/27103 [00:38<00:25, 423.81it/s][A
 61%|██████    | 16421/27103 [00:38<00:26, 401.30it/s][A
 61%|██████    | 16474/27103 [00:38<00:26, 406.62it/s][A
 61%|██████    | 16520/27103 [00:38<00:25, 416.00it/s][A
 61%|██████    | 16590/27103 [00:38<00:22, 472.98it/s][A
 61%|██████▏   | 16643/27103 [00:38<00:21, 488.19it/s][A
 62%|██████▏   | 16715/27103 [00:38<00:19, 538.79it/s][A
 62%|██████▏  

 86%|████████▋ | 23429/27103 [00:54<00:07, 475.77it/s][A
 87%|████████▋ | 23480/27103 [00:54<00:07, 457.79it/s][A
 87%|████████▋ | 23555/27103 [00:54<00:06, 517.95it/s][A
 87%|████████▋ | 23628/27103 [00:54<00:06, 564.56it/s][A
 87%|████████▋ | 23689/27103 [00:54<00:06, 535.71it/s][A
 88%|████████▊ | 23746/27103 [00:54<00:06, 539.23it/s][A
 88%|████████▊ | 23804/27103 [00:54<00:05, 550.74it/s][A
 88%|████████▊ | 23861/27103 [00:54<00:05, 544.97it/s][A
 88%|████████▊ | 23917/27103 [00:54<00:05, 540.87it/s][A
 88%|████████▊ | 23972/27103 [00:55<00:06, 506.08it/s][A
 89%|████████▊ | 24024/27103 [00:55<00:06, 500.40it/s][A
 89%|████████▉ | 24094/27103 [00:55<00:05, 536.62it/s][A
 89%|████████▉ | 24149/27103 [00:55<00:06, 479.77it/s][A
 89%|████████▉ | 24199/27103 [00:55<00:06, 415.11it/s][A
 89%|████████▉ | 24244/27103 [00:55<00:07, 394.82it/s][A
 90%|████████▉ | 24304/27103 [00:55<00:06, 439.79it/s][A
 90%|████████▉ | 24352/27103 [00:55<00:06, 437.06it/s][A
 90%|█████████

Pipeline(memory=None,
         steps=[('extract_domain', ExtractDomainTransformer()),
                ('count_domain',
                 CountVectorizer(analyzer=<function <lambda> at 0x7f26249b0ea0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None,...
                                     second_classifier=LGBMClassifier(boosting_type='gbdt',
                                                                      class_weight=None,
                                                                      colsample_bytree=1.0,
                                                                      learning_rate=0.1,
                       

### Оценка предсказательной способности

In [294]:
# Predict both targets for the held-out test set.
y_pred = pipeline.predict(X_test)


  0%|          | 0/9035 [00:00<?, ?it/s][A
  0%|          | 26/9035 [00:00<00:37, 238.83it/s][A
  1%|          | 97/9035 [00:00<00:30, 295.86it/s][A
  2%|▏         | 146/9035 [00:00<00:27, 326.79it/s][A
  2%|▏         | 184/9035 [00:00<00:27, 324.46it/s][A
  2%|▏         | 218/9035 [00:00<00:27, 320.52it/s][A
  3%|▎         | 273/9035 [00:00<00:24, 361.71it/s][A
  4%|▎         | 325/9035 [00:00<00:21, 398.04it/s][A
  4%|▍         | 372/9035 [00:00<00:21, 394.70it/s][A
  5%|▍         | 433/9035 [00:00<00:19, 431.08it/s][A
  5%|▌         | 490/9035 [00:01<00:19, 442.41it/s][A
  6%|▌         | 543/9035 [00:01<00:18, 462.48it/s][A
  7%|▋         | 591/9035 [00:01<00:19, 427.12it/s][A
  7%|▋         | 646/9035 [00:01<00:18, 451.91it/s][A
  8%|▊         | 693/9035 [00:01<00:18, 443.02it/s][A
  8%|▊         | 753/9035 [00:01<00:17, 470.84it/s][A
  9%|▉         | 802/9035 [00:01<00:20, 405.47it/s][A
  9%|▉         | 845/9035 [00:01<00:22, 360.29it/s][A
 10%|█         | 921/9

 90%|████████▉ | 8114/9035 [00:17<00:01, 519.15it/s][A
 90%|█████████ | 8170/9035 [00:17<00:01, 496.16it/s][A
 91%|█████████ | 8223/9035 [00:17<00:01, 459.63it/s][A
 92%|█████████▏| 8272/9035 [00:17<00:01, 436.24it/s][A
 92%|█████████▏| 8333/9035 [00:17<00:01, 476.37it/s][A
 93%|█████████▎| 8384/9035 [00:17<00:01, 464.27it/s][A
 93%|█████████▎| 8441/9035 [00:17<00:01, 489.59it/s][A
 94%|█████████▍| 8505/9035 [00:18<00:01, 496.53it/s][A
 95%|█████████▍| 8556/9035 [00:18<00:01, 472.44it/s][A
 95%|█████████▌| 8619/9035 [00:18<00:00, 508.06it/s][A
 96%|█████████▋| 8699/9035 [00:18<00:00, 558.46it/s][A
 97%|█████████▋| 8758/9035 [00:18<00:00, 529.13it/s][A
 98%|█████████▊| 8835/9035 [00:18<00:00, 583.23it/s][A
 98%|█████████▊| 8897/9035 [00:18<00:00, 519.65it/s][A
 99%|█████████▉| 8959/9035 [00:18<00:00, 534.70it/s][A
100%|█████████▉| 9031/9035 [00:18<00:00, 569.14it/s][A
100%|██████████| 9035/9035 [00:18<00:00, 476.24it/s][A

In [295]:
y_pred

array([[0., 1.],
       [1., 1.],
       [1., 2.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 1.]])

In [296]:
def full_accuracy(y_true, y_pred):
    """Return the share of rows in which every target is predicted correctly.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_targets)
        True and predicted target matrices; they must have the same shape.

    Returns
    -------
    numpy.float64
        Fraction of rows of ``y_pred`` that match ``y_true`` in all columns.

    Raises
    ------
    ValueError
        If the shapes differ or there are no rows to score.
    """
    # Use the arguments, not notebook globals, so the metric can score any pair.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred must have the same shape, got %s and %s'
            % (y_true.shape, y_pred.shape))
    if len(y_true) == 0:
        raise ValueError('full_accuracy is undefined for empty input')
    rows_match = np.all(y_true == y_pred, axis=1)
    return np.mean(rows_match)

In [297]:
full_accuracy(y_test, y_pred)

0.30282235749861647

### Сохранение модели

In [310]:
# Refit the pipeline on all labelled rows (train + test) before saving it.
pipeline.fit(X, y)


  0%|          | 0/36138 [00:00<?, ?it/s][A
  0%|          | 47/36138 [00:00<01:20, 446.53it/s][A
  0%|          | 105/36138 [00:00<01:17, 462.09it/s][A
  1%|          | 200/36138 [00:00<01:08, 524.68it/s][A
  1%|          | 289/36138 [00:00<01:00, 594.21it/s][A
  1%|          | 401/36138 [00:00<00:51, 690.51it/s][A
  1%|▏         | 500/36138 [00:00<00:49, 721.50it/s][A
  2%|▏         | 593/36138 [00:00<00:45, 773.43it/s][A
  2%|▏         | 673/36138 [00:00<00:49, 720.85it/s][A
  2%|▏         | 750/36138 [00:01<00:48, 729.94it/s][A
  2%|▏         | 825/36138 [00:01<00:54, 651.85it/s][A
  2%|▏         | 893/36138 [00:01<00:55, 631.47it/s][A
  3%|▎         | 959/36138 [00:01<00:59, 591.16it/s][A
  3%|▎         | 1021/36138 [00:01<00:59, 586.13it/s][A
  3%|▎         | 1081/36138 [00:01<01:07, 515.59it/s][A
  3%|▎         | 1147/36138 [00:01<01:04, 545.72it/s][A
  3%|▎         | 1204/36138 [00:01<01:10, 494.08it/s][A
  4%|▎         | 1269/36138 [00:01<01:06, 526.41it/s][

 28%|██▊       | 10160/36138 [00:18<00:36, 703.82it/s][A
 28%|██▊       | 10243/36138 [00:19<00:44, 579.97it/s][A
 29%|██▊       | 10314/36138 [00:19<01:01, 417.43it/s][A
 29%|██▊       | 10371/36138 [00:19<01:14, 347.31it/s][A
 29%|██▉       | 10419/36138 [00:19<01:31, 280.66it/s][A
 29%|██▉       | 10458/36138 [00:20<01:43, 248.53it/s][A
 29%|██▉       | 10492/36138 [00:20<01:46, 241.13it/s][A
 29%|██▉       | 10522/36138 [00:20<01:44, 245.38it/s][A
 29%|██▉       | 10551/36138 [00:20<01:42, 250.54it/s][A
 29%|██▉       | 10579/36138 [00:20<01:47, 237.29it/s][A
 29%|██▉       | 10605/36138 [00:20<01:49, 232.79it/s][A
 29%|██▉       | 10655/36138 [00:20<01:31, 277.04it/s][A
 30%|██▉       | 10717/36138 [00:20<01:17, 327.20it/s][A
 30%|██▉       | 10812/36138 [00:20<01:02, 407.28it/s][A
 30%|███       | 10903/36138 [00:21<00:51, 487.05it/s][A
 30%|███       | 10981/36138 [00:21<00:45, 548.73it/s][A
 31%|███       | 11155/36138 [00:21<00:36, 689.34it/s][A
 31%|███      

 48%|████▊     | 17459/36138 [00:40<00:32, 571.50it/s][A
 49%|████▊     | 17535/36138 [00:40<00:31, 597.90it/s][A
 49%|████▊     | 17607/36138 [00:40<00:43, 425.25it/s][A
 49%|████▉     | 17665/36138 [00:40<00:59, 309.20it/s][A
 49%|████▉     | 17712/36138 [00:40<01:04, 284.62it/s][A
 49%|████▉     | 17752/36138 [00:41<01:13, 251.18it/s][A
 49%|████▉     | 17786/36138 [00:41<01:27, 210.51it/s][A
 49%|████▉     | 17815/36138 [00:41<01:31, 201.02it/s][A
 49%|████▉     | 17846/36138 [00:41<01:21, 223.88it/s][A
 49%|████▉     | 17873/36138 [00:41<01:27, 209.41it/s][A
 50%|████▉     | 17898/36138 [00:41<01:23, 217.52it/s][A
 50%|████▉     | 17924/36138 [00:42<01:22, 220.53it/s][A
 50%|████▉     | 17948/36138 [00:42<01:20, 225.80it/s][A
 50%|████▉     | 17974/36138 [00:42<01:21, 223.65it/s][A
 50%|████▉     | 18006/36138 [00:42<01:14, 242.36it/s][A
 50%|████▉     | 18032/36138 [00:42<01:28, 205.33it/s][A
 50%|████▉     | 18060/36138 [00:42<01:21, 223.04it/s][A
 50%|█████    

 71%|███████   | 25630/36138 [00:58<00:11, 897.29it/s][A
 71%|███████   | 25724/36138 [00:58<00:18, 577.52it/s][A
 71%|███████▏  | 25799/36138 [00:59<00:22, 462.17it/s][A
 72%|███████▏  | 25861/36138 [00:59<00:28, 366.65it/s][A
 72%|███████▏  | 25912/36138 [00:59<00:29, 343.25it/s][A
 72%|███████▏  | 25957/36138 [00:59<00:31, 319.87it/s][A
 72%|███████▏  | 25997/36138 [00:59<00:32, 310.72it/s][A
 72%|███████▏  | 26071/36138 [00:59<00:26, 375.49it/s][A
 72%|███████▏  | 26125/36138 [00:59<00:24, 412.44it/s][A
 73%|███████▎  | 26228/36138 [01:00<00:20, 486.87it/s][A
 73%|███████▎  | 26288/36138 [01:00<00:19, 513.07it/s][A
 73%|███████▎  | 26366/36138 [01:00<00:17, 570.75it/s][A
 73%|███████▎  | 26459/36138 [01:00<00:15, 634.96it/s][A
 73%|███████▎  | 26531/36138 [01:00<00:16, 594.66it/s][A
 74%|███████▎  | 26608/36138 [01:00<00:14, 637.88it/s][A
 74%|███████▍  | 26677/36138 [01:00<00:14, 645.52it/s][A
 74%|███████▍  | 26747/36138 [01:00<00:14, 658.21it/s][A
 74%|███████▍ 

[Pipeline] .... (step 1 of 4) Processing extract_domain, total= 1.3min
[Pipeline] ...... (step 2 of 4) Processing count_domain, total=   2.3s
[Pipeline] .......... (step 3 of 4) Processing to_float, total=   0.0s
[Pipeline] ............... (step 4 of 4) Processing clf, total=  33.1s


Pipeline(memory=None,
         steps=[('extract_domain', ExtractDomainTransformer()),
                ('count_domain',
                 CountVectorizer(analyzer=<function identity at 0x7f1d1bfdd7b8>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None,...
                                     second_classifier=LGBMClassifier(boosting_type='gbdt',
                                                                      class_weight=None,
                                                                      colsample_bytree=1.0,
                                                                      learning_rate=0.1,
                       

In [311]:
import pickle

In [313]:
# Persist the fitted pipeline together with the target encoder.
# The context manager guarantees the file is flushed and closed.
# NOTE: the custom transformers, TwoOutputClassifier and identity are pickled
# by reference, so they must be importable under the same names when loading.
with open('model.pickle', 'wb') as model_file:
    pickle.dump([pipeline, enc], model_file)