In [147]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import os, sys
import json
import pickle

### Чтение данных

In [100]:
file_path = '/data/share/project01/gender_age_dataset.txt'

In [108]:
nrows = None #10
df = pd.read_csv(file_path, sep='\t', nrows=nrows)

In [109]:
df.head()

Unnamed: 0,gender,age,uid,user_json
0,F,18-24,d50192e5-c44e-4ae8-ae7a-7cfe67c8b777,"{""visits"": [{""url"": ""http://zebra-zoya.ru/2000..."
1,M,25-34,d502331d-621e-4721-ada2-5d30b2c3801f,"{""visits"": [{""url"": ""http://sweetrading.ru/?p=..."
2,F,25-34,d50237ea-747e-48a2-ba46-d08e71dddfdb,"{""visits"": [{""url"": ""http://ru.oriflame.com/pr..."
3,F,25-34,d502f29f-d57a-46bf-8703-1cb5f8dcdf03,"{""visits"": [{""url"": ""http://translate-tattoo.r..."
4,M,>=55,d503c3b2-a0c2-4f47-bb27-065058c73008,"{""visits"": [{""url"": ""https://mail.rambler.ru/#..."


### Чистка данных

In [110]:
df['user_json'] = df['user_json'].apply(lambda x: json.loads(x)['visits'])

In [135]:
# убеждаемся, что целевые переменные либо одновременно заполнены, либо одновременно пусты
assert len(df.loc[(df['age'] == '-') & (df['gender'] != '-')]) == 0
assert len(df.loc[(df['gender'] == '-') & (df['age'] != '-')]) == 0

In [137]:
# отделяем данные для обучения
df = df.loc[(df['age'] != '-') & (df['gender'] != '-')]

### Генерация признаков

In [139]:
from urllib.parse import urlparse
from urllib.request import urlretrieve, unquote

def url2domain(url):
    url = re.sub('(http(s)*://)+', 'http://', url)
    parsed_url = urlparse(unquote(url.strip()))
    if parsed_url.scheme not in ['http','https']: return None
    netloc = re.search("(?:www\.)?(.*)", parsed_url.netloc).group(1)
    if netloc is not None: return netloc.strip()
    return None

In [148]:
# добавляем столбец со списком доменов
df['domains'] = df['user_json'].progress_apply(lambda x: [url2domain(el['url']) for el in x])

HBox(children=(IntProgress(value=0, max=36138), HTML(value='')))




In [149]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer(analyzer=lambda x: x)

In [155]:
# добавляем количество просмотров доменов пользователями в качестве признаков
X = vectorizer.fit_transform(df['domains'])

<36138x111581 sparse matrix of type '<class 'numpy.int64'>'
	with 592642 stored elements in Compressed Sparse Row format>

In [3]:
vectorizer.get_feature_names()

In [159]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Кодирование целевой переменной

In [161]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
enc = OrdinalEncoder()

In [162]:
y = enc.fit_transform(df[['gender', 'age']])

In [169]:
enc.categories_

[array(['F', 'M'], dtype=object),
 array(['18-24', '25-34', '35-44', '45-54', '>=55'], dtype=object)]

In [167]:
y

array([[0., 0.],
       [1., 1.],
       [0., 1.],
       ...,
       [1., 1.],
       [1., 0.],
       [1., 1.]])

### Обучение

In [192]:
from sklearn.model_selection import train_test_split

In [194]:
# разбиваем данные на обучающую и тестовую выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [195]:
from sklearn.ensemble import RandomForestClassifier

In [196]:
clf = RandomForestClassifier(verbose=2)

In [197]:
# обучаем классификатор
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.1s remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=2, warm_start=False)

### Оценка предсказательной способности

In [198]:
# предсказываем целевую переменную
y_pred = clf.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


In [215]:
# считаем долю полностью совпадающих строк матриц y_true и y_pred
def full_accuracy(y_true, y_pred):
    return sum(np.logical_and(y_test[:,0] == y_pred[:,0], y_test[:,1] == y_pred[:,1])) / len(y_test)

In [216]:
full_accuracy(y_test, y_pred)

0.2446043165467626