In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os, sys
import json
import dill

In [2]:
from tqdm import tqdm_notebook
# Register the `progress_apply` method on pandas objects;
# ExtractDomainTransformer.transform below calls it on the `user_json` column.
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Чтение данных

In [3]:
# Location of the labelled dataset (read below as a tab-separated file).
# NOTE(review): hard-coded absolute path - adjust when running in another environment.
file_path = '/data/share/project01/gender_age_dataset.txt'

In [4]:
# Row limit passed to read_csv; None reads the whole file.
nrows = None # set to e.g. 100 to load only the first rows while experimenting
# The dataset is tab-separated.
df = pd.read_csv(file_path, sep='\t', nrows=nrows)

In [5]:
df.head()

Unnamed: 0,gender,age,uid,user_json
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000..."
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,"{""visits"": [{""url"": ""http://sweetrading.ru/?p=..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,"{""visits"": [{""url"": ""http://ru.oriflame.com/pr..."
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,"{""visits"": [{""url"": ""http://translate-tattoo.r..."
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,"{""visits"": [{""url"": ""https://mail.rambler.ru/#..."


### Чистка данных

In [6]:
# Sanity check: in every row the two targets are either both filled in
# or both marked as missing ('-'); no row may have exactly one of them.
assert not ((df['age'] == '-') & (df['gender'] != '-')).any()
assert not ((df['gender'] == '-') & (df['age'] != '-')).any()

In [7]:
# Keep only the rows usable for training: drop every row in which
# either target is marked as missing ('-').
df = df[~((df['age'] == '-') | (df['gender'] == '-'))]

### Генерация признаков

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin

In [9]:
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

In [10]:
class ExtractDomainTransformer(BaseEstimator, TransformerMixin):
    """Turn each user's visit log into a list of visited domains.

    ``transform`` reads the ``user_json`` column of ``X``, parses every cell
    as JSON and maps each entry of its ``visits`` list to the domain of that
    entry's ``url``. The result is a pandas Series holding one list per row
    (the input frame itself is not modified).
    """
    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a Series of per-row domain lists built from ``X['user_json']``."""
        column = X['user_json']
        # `progress_apply` only exists after tqdm's pandas integration has been
        # registered in the current process. Fall back to the plain `apply`
        # so the transformer also works where that registration never ran
        # (e.g. when a saved pipeline is loaded for inference).
        apply = getattr(column, 'progress_apply', None)
        if apply is None:
            apply = column.apply
        return apply(self._domains_from_json)

    def _domains_from_json(self, raw_json):
        """Parse one ``user_json`` cell and convert every visit url to a domain."""
        return [self.url2domain(visit['url']) for visit in json.loads(raw_json)['visits']]

    @staticmethod
    def url2domain(url):
        """Extract the host part of ``url`` without a leading ``www.``.

        Returns ``None`` when the parsed scheme is neither http nor https.
        A url with an empty host yields an empty string.
        """
        # Collapse every run of repeated "http://" / "https://" into one "http://".
        url = re.sub('(http(s)*://)+', 'http://', url)
        parsed_url = urlparse(unquote(url.strip()))
        if parsed_url.scheme not in ('http', 'https'):
            return None
        # Group 1 is the host after one optional leading "www."; `(.*)` can
        # match the empty string, so the search always succeeds.
        return re.search(r"(?:www\.)?(.*)", parsed_url.netloc).group(1).strip()

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
class ToFloatTransformer(BaseEstimator, TransformerMixin):
    """Stateless step that casts the feature matrix to float64."""

    def fit(self, X, y=None):
        # There is nothing to estimate from the data.
        return self

    def transform(self, X, y=None):
        # Delegate the cast to the matrix's own `astype`.
        as_float = X.astype(np.float64)
        return as_float

### Кодирование целевой переменной

In [13]:
from sklearn.preprocessing import OrdinalEncoder

In [14]:
# Encoder for the target columns; it is saved together with the model at the end of the notebook.
enc = OrdinalEncoder()

In [15]:
# Encode both targets as ordinal codes: column 0 is gender, column 1 is age.
y = enc.fit_transform(df[['gender', 'age']])

In [16]:
enc.categories_

[array(['F', 'M'], dtype=object),
 array(['18-24', '25-34', '35-44', '45-54', '>=55'], dtype=object)]

In [17]:
y

array([[0., 0.],
       [1., 1.],
       [0., 1.],
       ...,
       [1., 1.],
       [1., 0.],
       [1., 1.]])

In [18]:
# The whole filtered frame is the model input; the pipeline's first step reads its `user_json` column.
X = df

### Обучение

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Split the data into a training set and a 25% held-out test set (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [21]:
class TwoOutputClassifier(BaseEstimator, ClassifierMixin):
    """Predicts two target columns with two independent base classifiers.

    The first classifier is trained on column 0 of ``y`` and the second on
    column 1. With ``top50=True`` ``predict`` answers only for the more
    confident half of the rows and returns their positions alongside the
    predictions; otherwise it returns one prediction pair per row.
    """
    def __init__(self, first_classifier, second_classifier, top50=True):
        self.first_classifier = first_classifier
        self.second_classifier = second_classifier
        self.top50 = top50

    def fit(self, X, y):
        """Fit each base classifier on its own column of ``y``."""
        estimators = (self.first_classifier, self.second_classifier)
        for column, estimator in enumerate(estimators):
            estimator.fit(X, y[:, column])
        return self

    def predict(self, X):
        """Predict both targets.

        Returns a ``(row_positions, predictions)`` tuple when ``top50`` is
        set, and a single two-column array otherwise.
        """
        if not self.top50:
            first_pred = self.first_classifier.predict(X)
            second_pred = self.second_classifier.predict(X)
            return np.stack([first_pred, second_pred], axis=1)

        first_probas, second_probas = self.predict_proba(X)
        confident_rows = self.select_top50(first_probas, second_probas)
        triples = self.predict_by_proba(confident_rows)
        row_positions = np.array([triple[0] for triple in triples])
        predictions = np.array([triple[1:] for triple in triples])
        return row_positions, predictions

    def predict_proba(self, X):
        """Return ``[probas_of_first, probas_of_second]`` for ``X``."""
        estimators = (self.first_classifier, self.second_classifier)
        return [estimator.predict_proba(X) for estimator in estimators]

    def select_top50(self, probas1, probas2):
        """Keep the half of the rows (rounded up) with the highest confidence.

        Confidence of a row is the product of the largest probability in
        each of the two probability vectors. Returns a list of
        ``(row_position, probas1_row, probas2_row)`` tuples, most confident first.
        """
        def confidence(entry):
            return max(entry[1]) * max(entry[2])

        indexed = [(position, first_row, second_row)
                   for position, (first_row, second_row) in enumerate(zip(probas1, probas2))]
        ranked = sorted(indexed, key=confidence, reverse=True)
        keep = (len(ranked) + 1) // 2
        return ranked[:keep]

    def predict_by_proba(self, index_probas_list):
        """Map each ``(position, probas1, probas2)`` entry to ``(position, argmax1, argmax2)``."""
        picked = []
        for entry in index_probas_list:
            picked.append((entry[0], entry[1].argmax(), entry[2].argmax()))
        return picked

In [22]:
from lightgbm.sklearn import LGBMClassifier

### Построение Pipeline

In [38]:
from sklearn.pipeline import Pipeline

In [39]:
import pickle

In [40]:
class GetDomainsAndContents:
    """Callable analyzer that appends category tokens to a user's domain list.

    ``domain_labels`` is a mapping from domain to a category label loaded from
    a pickle file. Calling the instance with a list of domains returns that
    list followed by one ``'category_<label>'`` token for every domain that
    has a label in the mapping.
    """
    def __init__(self, labels_path='data/domain_labels.pickle'):
        """Load the domain -> label mapping from ``labels_path``."""
        # NOTE(review): pickle.load can execute arbitrary code stored in the
        # file - only point this at a trusted file.
        # The context manager closes the file handle, which the bare
        # `open(...)` call used previously never did.
        with open(labels_path, 'rb') as labels_file:
            self.domain_labels = pickle.load(labels_file)

    def __call__(self, user_domains):
        """Return ``user_domains`` plus a category token per labelled domain."""
        category_tokens = []
        for domain in user_domains:
            label = self.domain_labels.get(domain)
            # Compare against None explicitly: a truthiness test would
            # silently drop a legitimate label such as 0.
            if label is not None:
                category_tokens.append('category_{}'.format(label))
        return user_domains + category_tokens

In [41]:
pipeline = Pipeline(
    steps=[
        # input frame -> Series of per-user domain lists
        ('extract_domain', ExtractDomainTransformer()),
        # domain lists (plus the analyzer's category tokens) -> count matrix
        ('count_domain', CountVectorizer(analyzer=GetDomainsAndContents())),
        # cast the counts to float64
        ('to_float', ToFloatTransformer()),
        # one LightGBM model per target column; predict keeps the confident half
        ('clf', TwoOutputClassifier(first_classifier=LGBMClassifier(verbose=2),
                                    second_classifier=LGBMClassifier(verbose=2),
                                    top50=True)),
    ],
    verbose=True,
)

In [27]:
# Fit on the training split only; the held-out split is used for the evaluation below.
pipeline.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, max=27103), HTML(value='')))


[Pipeline] .... (step 1 of 4) Processing extract_domain, total=  57.7s
[Pipeline] ...... (step 2 of 4) Processing count_domain, total=   4.1s
[Pipeline] .......... (step 3 of 4) Processing to_float, total=   0.0s
[Pipeline] ............... (step 4 of 4) Processing clf, total=  25.8s


Pipeline(memory=None,
         steps=[('extract_domain', ExtractDomainTransformer()),
                ('count_domain',
                 CountVectorizer(analyzer=<__main__.GetDomainsAndContents object at 0x7f41317b6160>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=N...
                                     second_classifier=LGBMClassifier(boosting_type='gbdt',
                                                                      class_weight=None,
                                                                      colsample_bytree=1.0,
                                                                      learning_rate=0.1,
                                                         

In [28]:
[key for key in pipeline.named_steps['count_domain'].vocabulary_ if key.startswith('category')]

['category_3',
 'category_18',
 'category_10',
 'category_8',
 'category_23',
 'category_7',
 'category_11',
 'category_4',
 'category_6',
 'category_21',
 'category_12',
 'category_2',
 'category_22',
 'category_9',
 'category_16',
 'category_5',
 'category_19',
 'category_14',
 'category_20',
 'category_24',
 'category_15',
 'category_13',
 'category_1',
 'category_17',
 'category.alldatasheet.com']

### Оценка предсказательной способности

In [29]:
# Predict the targets. Because the classifier was built with top50=True,
# predict returns a pair: the positions (within X_test) of the more confident
# half of the rows, and the predicted [gender, age] codes for those rows.
indices_pred, y_pred = pipeline.predict(X_test)

HBox(children=(IntProgress(value=0, max=9035), HTML(value='')))




In [30]:
indices_pred, y_pred

(array([4038, 5030, 4848, ..., 2153, 7972, 4036]), array([[1, 2],
        [1, 1],
        [0, 2],
        ...,
        [0, 2],
        [1, 2],
        [0, 1]]))

In [31]:
def full_accuracy(y_true, y_pred):
    """Return the share of rows in which ``y_pred`` matches ``y_true`` in every column.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_rows, n_targets)
        True and predicted target codes, row-aligned.

    Returns
    -------
    float
        Fraction of rows where all columns are equal.

    Raises
    ------
    ValueError
        If the inputs are not 2-D, have different shapes, or contain no rows.
    """
    # Use the arguments themselves; the previous body read the global
    # `y_test` and ignored `y_true` entirely.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must be 2-D arrays of the same shape, got {} and {}'
                         .format(y_true.shape, y_pred.shape))
    if len(y_true) == 0:
        raise ValueError('full_accuracy is undefined for empty input')
    return np.all(y_true == y_pred, axis=1).mean()

In [32]:
# Reduce and reorder the true labels to the rows that predict actually returned.
# NOTE(review): this overwrites y_test, so the cell is not idempotent - running
# it a second time indexes the already reduced array. Re-run the split first.
y_test = y_test[indices_pred]

In [33]:
y_test

array([[1., 2.],
       [1., 0.],
       [0., 2.],
       ...,
       [0., 2.],
       [1., 2.],
       [0., 1.]])

In [34]:
y_pred

array([[1, 2],
       [1, 1],
       [0, 2],
       ...,
       [0, 2],
       [1, 2],
       [0, 1]])

In [35]:
full_accuracy(y_test, y_pred)

0.3621071270473661

### Сохранение модели

In [42]:
# Refit the pipeline on all labelled rows before it is saved.
pipeline.fit(X, y)

HBox(children=(IntProgress(value=0, max=36138), HTML(value='')))


[Pipeline] .... (step 1 of 4) Processing extract_domain, total= 1.3min
[Pipeline] ...... (step 2 of 4) Processing count_domain, total=   5.5s
[Pipeline] .......... (step 3 of 4) Processing to_float, total=   0.0s
[Pipeline] ............... (step 4 of 4) Processing clf, total=  22.6s


Pipeline(memory=None,
         steps=[('extract_domain', ExtractDomainTransformer()),
                ('count_domain',
                 CountVectorizer(analyzer=<__main__.GetDomainsAndContents object at 0x7f40a1af81d0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=N...
                                     second_classifier=LGBMClassifier(boosting_type='gbdt',
                                                                      class_weight=None,
                                                                      colsample_bytree=1.0,
                                                                      learning_rate=0.1,
                                                         

In [43]:
# Persist the fitted pipeline together with the target encoder.
# The context manager guarantees the file is flushed and closed; the bare
# `open(...)` passed inline before left the handle open.
with open('model.dill', 'wb') as model_file:
    dill.dump([pipeline, enc], model_file)