In [245]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, precision_recall_curve

from additional import DataProcessor

In [2]:
# load & prepare users info
user_features = pd.read_csv('hh_demographic.csv')
user_features.columns = user_features.columns.str.lower()
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [3]:
data = user_features.copy()

col_names = {
    'age_desc': 'age',
    'marital_status_code': 'marital',
    'income_desc': 'income',
    'homeowner_desc': 'homeowner',
    # 'hh_comp_desc': 'hh_comp',
    # 'household_size_desc': 'hh_size',
    # 'kid_category_desc': 'kids'
}
data.rename(columns=col_names, inplace=True)

In [4]:
# парсим данные о составе семьи - выделим необходимое в отдельные фичи и распарсим
data['hh_single'] = data['hh_comp_desc'].str.match(r'Single') * 1
data['hh_comp'] = data['hh_comp_desc'].str.replace(r'Single', '1', regex=True)   # меняем Single на число, чтоб адекватно распарсить
data['hh_comp'] = data['hh_comp'].str.extract(r'(\d+)').fillna(-1).astype('int')
data['hh_size'] = data['household_size_desc'].str.extract(r'(\d+)').fillna(-1).astype('int')    # hh_size есть у всех, в fillna() нет смысла
data['kids'] = data['kid_category_desc'].str.extract(r'(\d+)').fillna(-1).astype('int')

comp = data['hh_comp'] == -1
kids = data['kids'] == -1

In [5]:
# восстановим kids для тех, у кого он не указан, но указан comp, и обновим фильтр
data.loc[~comp & kids, 'kids'] = data.loc[~comp & kids, 'hh_size'] - data.loc[~comp & kids, 'hh_comp']
kids = data['kids'] == -1

# восстановим hh_comp для тех, у кого он не указан, но указан kids, и обновим фильтр
data.loc[comp & ~kids, 'hh_comp'] = data.loc[comp & ~kids, 'hh_size'] - data.loc[comp & ~kids, 'kids']
comp = data['hh_comp'] == -1

# те, у кого не указан hh_comp и kids, но hh_size = 1: скорее всего это single adult
size = data['hh_size'] == 1
data.loc[comp & kids & size, 'hh_comp'] = 1
data.loc[comp & kids & size, 'kids'] = 0

# обновим восстановленные данные в hh_single
single_mismatch = (data['hh_single'] == 0) & size
data.loc[single_mismatch, 'hh_single'] = 1

# обновим фильтры
broken = (data['hh_comp'] == -1) & (data['kids'] == -1)
data[broken]

Unnamed: 0,age,marital,income,homeowner,hh_comp_desc,household_size_desc,kid_category_desc,user_id,hh_single,hh_comp,hh_size,kids
140,25-34,A,25-34K,Unknown,Unknown,2,None/Unknown,404,0,-1,2,-1
218,55-64,A,50-74K,Homeowner,Unknown,2,None/Unknown,660,0,-1,2,-1
382,45-54,A,35-49K,Homeowner,Unknown,2,None/Unknown,1154,0,-1,2,-1


In [6]:
single = data['hh_single'] == 1
kids = data['kids'] == -1
comp = data['hh_comp'] == -1
data[single]

Unnamed: 0,age,marital,income,homeowner,hh_comp_desc,household_size_desc,kid_category_desc,user_id,hh_single,hh_comp,hh_size,kids
4,45-54,B,50-74K,Homeowner,Single Female,1,None/Unknown,16,1,1,1,0
7,35-44,B,15-24K,Unknown,Single Female,1,None/Unknown,19,1,1,1,0
10,35-44,U,50-74K,Unknown,Unknown,1,None/Unknown,25,1,1,1,0
11,45-54,U,25-34K,Probable Renter,Single Female,1,None/Unknown,27,1,1,1,0
15,65+,U,50-74K,Unknown,Single Male,1,None/Unknown,42,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
789,65+,A,15-24K,Homeowner,Single Female,2,None/Unknown,2465,1,1,2,1
792,45-54,B,75-99K,Homeowner,Single Male,1,None/Unknown,2483,1,1,1,0
793,35-44,B,25-34K,Unknown,Single Male,1,None/Unknown,2486,1,1,1,0
794,45-54,B,35-49K,Homeowner,Single Female,1,None/Unknown,2488,1,1,1,0


In [7]:
# теперь запилим признак пола для одиночных семей
# сначала инициализируем его в -1
data['temp_sex'] = -1

In [8]:
# теперь разметим: male = 0, female = 1
male = data['hh_comp_desc'].str.contains(r'Male')
female = data['hh_comp_desc'].str.contains(r'Female')
data.loc[male, 'temp_sex'] = 0
data.loc[female, 'temp_sex'] = 1
# отфильтруем одиночные семьи, для которых этот признак остался неразмеченным
sex = data['temp_sex'] == -1

# удалим распарсенные признаки
data.drop(columns=['hh_comp_desc', 'household_size_desc', 'kid_category_desc'], inplace=True)
# data[sex & single]

Для восстановления Male/Female соберем данные train/test:
Сюда попадут только single-family. Для train отберем размеченные, для test - неразмеченные.
Так же помним про троих non-single, которым надо восстановить hh_comp и kids.

In [325]:
# для восстановления Male/Female соберем данные в отдельные датасеты
sex_data = data[~sex & single].drop(columns=['temp_sex']).copy()
sex_target = data.loc[~sex & single, 'temp_sex'].copy()
sex_test = data[sex & single].drop(columns=['temp_sex']).copy()

# отделим кусочек на валидацию
sex_train, sex_valid, sex_train_true, sex_valid_true = train_test_split(sex_data, sex_target, test_size=0.2, random_state=13)


In [10]:
# приклеим данные о топ100 покупок
dp = DataProcessor(pd.read_csv('retail_train.csv'), top_config={'k': 100}, uim_config={'aggfunc': 'count'})
dp.fit()
# выкинем колонку с товарами не из топа и приведем к виду покупал/не покупал
purchases = ((dp.train_uim > 0) * 1).drop(columns=[0]).astype('int')
purchases.columns = purchases.columns.astype('str')

In [11]:
class Merger(BaseEstimator, TransformerMixin):
    """ Добавление данных о покупках """
    def __init__(self, purch):
        self.purch = purch

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.merge(self.purch, on='user_id', how='left')

In [12]:
class Dummies(BaseEstimator, TransformerMixin):
    """ Добавление dummies из указанных фичей """
    def __init__(self, dumcol):
        self.dumcol = dumcol

    def fit(self, X, y=None):
        self.dumcol = [col for col in self.dumcol if col in X.columns]
        return self
    
    def transform(self, X):
        dum = pd.get_dummies(X[self.dumcol])
        df = pd.concat([X, dum], axis=1)
        return df

In [13]:
class ColumnsCorrector(BaseEstimator, TransformerMixin):
    """ Выброс фичей """
    def __init__(self, dropcol):
        self.dropcol = dropcol
        self.required_order = None

    def fit(self, X, y=None):
        self.dropcol = [col for col in self.dropcol if col in X.columns]
        self.required_order = [col for col in X.columns if col not in self.dropcol]
        return self
    
    def transform(self, X):
        df = X.copy()
        absence = list(set(X.columns) ^ set(self.required_order))
        df[absence] = 0
        df.drop(columns=self.dropcol, inplace=True)
        return df[self.required_order]

In [337]:
prepare = Pipeline([('merger', Merger(purchases)),
                    ('dummies', Dummies(['age', 'marital', 'income'])),
                    ('colcorrect', ColumnsCorrector(dropcol=['age', 'marital', 'income', 'homeowner', 'user_id'])),
                    ('classifier', SGDClassifier(learning_rate='adaptive',
                                                 eta0=0.1,
                                                 early_stopping=True,
                                                 validation_fraction=0.3,
                                                 n_jobs=-1,
                                                 random_state=131)),
                    ])

# обучаем моделечку
prepare.fit(sex_data, sex_target)
full_pred = prepare.predict(sex_data)

# предикты для валидации
train_pred = prepare.predict(sex_train)     # это для отложенной выборки, которой решил не пользоваться
valid_pred = prepare.predict(sex_valid)     # это для отложенной выборки, которой решил не пользоваться
test_pred = prepare.predict(sex_test)

# считаем метрику
print('summary f1:', f1_score(sex_target, full_pred))
print('train f1:', f1_score(sex_train_true, train_pred))
print('valid f1:', f1_score(sex_valid_true, valid_pred))
# print('train pr:', precision_score(sex_train_true, train_pred))
# print('valid pr:', precision_score(sex_valid_true, valid_pred))

summary f1: 0.8127208480565371
train f1: 0.817391304347826
valid f1: 0.7924528301886793


In [345]:
# заполняем исходные пропуски предиктами и сохраняем результат
data.loc[sex & single, 'temp_sex'] = test_pred

# temp_sex это ВРЕМЕННЫЙ признак!!! надо его осмыслить для non-single семей
# data.to_csv('user_features_corrected.csv')

In [346]:
data

Unnamed: 0,age,marital,income,homeowner,user_id,hh_single,hh_comp,hh_size,kids,temp_sex
0,65+,A,35-49K,Homeowner,1,0,2,2,0,-1
1,45-54,A,50-74K,Homeowner,7,0,2,2,0,-1
2,25-34,U,25-34K,Unknown,8,0,2,3,1,-1
3,25-34,U,75-99K,Homeowner,13,0,2,4,2,-1
4,45-54,B,50-74K,Homeowner,16,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
796,35-44,U,50-74K,Homeowner,2494,0,2,2,0,-1
797,45-54,A,75-99K,Homeowner,2496,0,2,3,1,-1
798,45-54,U,35-49K,Unknown,2497,1,1,1,0,0
799,25-34,U,50-74K,Homeowner,2498,0,2,2,0,-1
