In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import os
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from functools import partial
import pickle

# EDA

In [2]:
# Directory with the raw CSV files, relative to the notebook.
raw_path = '../data/raw/'

# Read both raw splits in one pass.
train, test = (
    pd.read_csv(os.path.join(raw_path, file_name))
    for file_name in ('train.csv', 'test.csv')
)

In [3]:
# Columns split off from the features and used as prediction targets.
TARGET_COLS = [
    'Артериальная гипертензия',
    'ОНМК',
    'Стенокардия, ИБС, инфаркт миокарда',
    'Сердечная недостаточность',
    'Прочие заболевания сердца',
]

# Single-column names referenced by the helper functions.
ID_COL = 'ID'
EDU_COL = 'Образование'
SEX_COL = 'Пол'

# Columns cast to the pandas `category` dtype by `cast_types`.
CAT_COLS = [
    'Пол',
    'Семья',
    'Этнос',
    'Национальность',
    'Религия',
    'Образование',
    'Профессия',
    'Статус Курения',
    'Частота пасс кур',
    'Алкоголь',
    'Время засыпания',
    'Время пробуждения',
]

# Columns whose numeric members are cast to int8 by `cast_types`.
# Note: 'Пол' is listed here as well as in CAT_COLS, and
# 'Туберкулез легких ' carries a trailing space in its name.
OHE_COLS = [
    'Пол',
    'Вы работаете?',
    'Выход на пенсию',
    'Прекращение работы по болезни',
    'Сахарный диабет',
    'Гепатит',
    'Онкология',
    'Хроническое заболевание легких',
    'Бронжиальная астма',
    'Туберкулез легких ',
    'ВИЧ/СПИД',
    'Регулярный прим лекарственных средств',
    'Травмы за год',
    'Переломы',
    'Пассивное курение',
    'Сон после обеда',
    'Спорт, клубы',
    'Религия, клубы',
]

# Columns cast to float32 by `cast_types`.
REAL_COLS = [
    'Возраст курения',
    'Сигарет в день',
    'Возраст алког',
]

In [4]:
def set_idx(df: pd.DataFrame, idx_col: str) -> pd.DataFrame:
    """Return a new frame in which the column ``idx_col`` has become the index.

    The frame passed in is not modified; ``set_index`` produces a new object.
    """
    return df.set_index(idx_col)

In [5]:
# Separate the target columns from the features. `train` is rebound to the
# feature-only frame, so running this cell a second time fails because the
# target columns are no longer present.
target = train[TARGET_COLS]
train = train.drop(columns=TARGET_COLS)

In [6]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Apply the preparation steps to a raw frame and return the result.

    Steps, in order: move ``ID_COL`` into the index, drop the ``ID_y``
    column when present, fill missing ``SEX_COL`` values, cast dtypes.
    """
    return (
        df
        .pipe(set_idx, ID_COL)
        .pipe(drop_unnecesary_id)
        .pipe(fill_sex)
        .pipe(cast_types)
    )
    
    

In [7]:
def drop_unnecesary_id(df: pd.DataFrame) -> pd.DataFrame:
    """Remove the ``ID_y`` column when the frame has one.

    A frame without that column is returned as-is (same object); otherwise a
    new frame without the column is returned.
    """
    if 'ID_y' not in df.columns:
        return df
    return df.drop(columns='ID_y')

In [8]:
def add_ord_edu(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``int8`` column built from the first character of ``EDU_COL``.

    The new column is named ``f'{EDU_COL}_ord'``. It is written into ``df``
    itself, which is then returned.
    """
    # presumably every value starts with a digit code; the int8 cast raises
    # otherwise -- verify against the data
    first_char = df[EDU_COL].str.slice(0, 1)
    ord_col = f'{EDU_COL}_ord'
    df[ord_col] = first_char.astype(np.int8).values
    return df

In [9]:
def fill_sex(df: pd.DataFrame) -> pd.DataFrame:
    """Replace missing ``SEX_COL`` values with the column's most frequent value.

    The column is reassigned on ``df`` itself, which is then returned.
    """
    sex_counts = df[SEX_COL].value_counts()
    # value_counts orders labels by descending frequency, so the first label
    # is the most common one.
    fill_value = sex_counts.index[0]
    df[SEX_COL] = df[SEX_COL].fillna(fill_value)
    return df

In [10]:
def cast_types(df: pd.DataFrame) -> pd.DataFrame:
    """Cast the configured column groups of ``df`` to compact dtypes.

    * ``CAT_COLS`` -> ``category``
    * the numeric columns among ``OHE_COLS`` -> ``np.int8``
    * ``REAL_COLS`` -> ``np.float32``

    The columns are reassigned on ``df`` itself, which is then returned.
    """
    # Work out which OHE columns are numeric from the frame being processed
    # rather than from a notebook-level variable, so the function depends only
    # on its argument. This is done before the categorical cast because a
    # column listed in both CAT_COLS and OHE_COLS stops looking numeric once
    # it is categorical.
    ohe_int_cols = df[OHE_COLS].select_dtypes('number').columns

    df[CAT_COLS] = df[CAT_COLS].astype('category')
    df[ohe_int_cols] = df[ohe_int_cols].astype(np.int8)
    df[REAL_COLS] = df[REAL_COLS].astype(np.float32)
    return df
    

In [11]:
from src import utils

# Persist the preprocessed features and the target frame; the modelling
# section reads both files back.
utils.save_as_pickle(preprocess(train), "../data/processed/train.pkl")
utils.save_as_pickle(target, "../data/processed/target.pkl")

# MODELING

In [12]:
import os
import pandas as pd
import numpy as np

In [13]:
from sklearn.svm import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.multioutput import *

In [14]:
from src import config as cfg

In [15]:
# Random seed; passed as `random_state` to the train/validation split.
RS = 75

In [16]:
# Directory with the pickles written by the preprocessing section.
processed_data_path = '../data/processed/'

# Reload features and targets from disk.
train, target = (
    pd.read_pickle(os.path.join(processed_data_path, file_name))
    for file_name in ('train.pkl', 'target.pkl')
)

In [17]:
# Turn the categorical columns into plain `object` dtype.
# presumably needed so the constant imputer can insert 'NA', which is not one
# of the existing categories -- TODO confirm
train[cfg.CAT_COLS] = train[cfg.CAT_COLS].astype('object')

In [18]:
# fbeta_score with beta=2.0 pre-bound.
# NOTE(review): not referenced by any other visible cell -- confirm it is
# still needed.
scoring = partial(fbeta_score, beta=2.0)

In [19]:
# Split the row index 80/20.
# NOTE(review): seeded with a literal 7 rather than RS, and `train_idx` is not
# used by any other visible cell.
train_idx, val_idx = train_test_split(
        train.index, test_size=0.2, random_state=7)

In [20]:
# Write the validation index to 'temp.pkl' as a one-column DataFrame.
# NOTE(review): the file is not read back in the visible cells.
val_idx.to_frame().to_pickle('temp.pkl')

In [21]:
# Serialise the validation index itself (not a DataFrame) to 'tmp2.pkl'.
with open('tmp2.pkl', 'wb') as f:
    pickle.dump(val_idx, f)

In [22]:
# Read back the file written by the previous cell.
# NOTE(review): `val_idx2` is not used by any other visible cell.
with open('tmp2.pkl', 'rb') as f:
    val_idx2 = pickle.load(f)

In [23]:
# Keep 80% of the rows for training and hold out the rest for validation;
# the shuffle is seeded with RS.
train_data, val_data, train_target, val_target = train_test_split(
    train,
    target,
    train_size=0.8,
    random_state=RS,
)

In [24]:
# Real-valued columns: impute with SimpleImputer's defaults, then standardise.
real_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
    ]
)

In [25]:
# Categorical columns: fill gaps with the literal 'NA', then one-hot encode
# into a dense array; categories unseen during fit are ignored at transform.
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output`
# -- check against the installed version before upgrading.
cat_pipe = Pipeline(
    steps=[
        ('imputer', SimpleImputer(fill_value='NA', strategy='constant')),
        ('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore')),
    ]
)

In [26]:
# Route each column group to its own treatment. Columns not named in any
# group are dropped (ColumnTransformer's default `remainder`).
# NOTE(review): if cfg mirrors the lists defined earlier in this notebook,
# 'Пол' is in both CAT_COLS and OHE_COLS and is therefore emitted twice
# (one-hot encoded and passed through) -- verify this is intended.
preprocess_pipe = ColumnTransformer(
    [
        ('real_cols', real_pipe, cfg.REAL_COLS),
        ('cat_cols', cat_pipe, cfg.CAT_COLS),
        ('ohe_cols', 'passthrough', cfg.OHE_COLS),
    ]
)

In [27]:
# Baseline linear SVM. The solver shuffles the data internally, so it is
# seeded with RS to make the cross-validation scores reproducible.
model = LinearSVC(random_state=RS)

In [28]:
# Preprocessing followed by the classifier, as one estimator.
model_pipe = Pipeline(
    steps=[
        ('preprocess', preprocess_pipe),
        ('model', model),
    ]
)

In [29]:
# Fit one copy of the whole pipeline per target column, using up to 4 parallel jobs.
multiout_model_pipe = MultiOutputClassifier(model_pipe, n_jobs=4)

In [30]:
# 3-fold cross-validation on the training split, scored with sample-averaged
# recall. The outer loop runs serially (n_jobs=1).
scores = cross_val_score(
    multiout_model_pipe,
    train_data,
    train_target,
    scoring='recall_samples',
    cv=3,
    n_jobs=1,
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
scores

array([0.20326797, 0.2379085 , 0.29265092])

In [32]:
d = pd.DataFrame()  # only serves as the default of the unused `d` parameter


def metric(val_target, val_data, name, d = d):
    """Score multilabel predictions and return the scores as a one-row frame.

    Parameters
    ----------
    val_target
        Ground-truth labels; wrapped in a ``pd.DataFrame`` before scoring.
    val_data
        Predicted labels (despite the name, callers pass the output of a
        model's ``predict``).
    name
        Label used as the index of the returned row.
    d
        Unused. Kept, with its original default, so existing calls keep
        working.

    Returns
    -------
    pd.DataFrame
        One row indexed by ``name`` with the columns ``Accuracy``,
        ``Recall``, ``Precision`` and ``F1``.
    """
    y_true = pd.DataFrame(val_target)  # convert once instead of per metric
    y_pred = val_data

    # 'samples' averaging scores each row's label set and averages over rows.
    # zero_division=0 is the value sklearn already substitutes for rows with
    # no true / no predicted labels; stating it explicitly keeps the numbers
    # unchanged while avoiding an UndefinedMetricWarning on every call.
    sample_avg = {'average': 'samples', 'zero_division': 0}

    scores = {
        "Accuracy": [accuracy_score(y_true, y_pred)],
        "Recall": [recall_score(y_true, y_pred, **sample_avg)],
        "Precision": [precision_score(y_true, y_pred, **sample_avg)],
        "F1": [f1_score(y_true, y_pred, **sample_avg)],
    }
    return pd.DataFrame(scores, index=[name])


Выбрана метрика Recall. Она считается как отношение количества Positive-объектов, которые были классифицированы как Positive, к общему количеству Positive-объектов, то есть измеряет способность модели находить объекты, относящиеся к классу Positive. Соответственно, чем больше recall, тем больше Positive-объектов было найдено, к чему мы и стремимся, поскольку нам нужно обнаружить склонность к болезни у как можно большего количества склонных людей.

In [33]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the validation split.
# NOTE(review): `val_data` is overwritten with its transformed version, so the
# raw validation frame is gone after this cell and re-running the cell will
# likely fail -- re-create the split first.
train_data_transform = preprocess_pipe.fit_transform(train_data)
val_data = preprocess_pipe.transform(val_data)

In [34]:
# Multilabel CatBoost model trained on the already-transformed features.
catboost = CatBoostClassifier(
    iterations=100,
    loss_function='MultiLogloss',
    custom_metric=['Recall', "F1", "Precision"],
    silent=True,
).fit(
    pd.DataFrame(train_data_transform),
    pd.DataFrame(train_target),
)

In [35]:
# One class-weighted logistic regression per target column, trained on the
# already-transformed features.
logistic = MultiOutputClassifier(
    LogisticRegression(
        solver="liblinear",
        class_weight='balanced',
        max_iter=100,
    )
).fit(
    pd.DataFrame(train_data_transform),
    pd.DataFrame(train_target),
)

In [36]:
# Score both models on the (transformed) validation split and store each
# one-row metric table on disk.
utils.save_as_pickle(
    metric(val_target, catboost.predict(val_data), "CatBoost"),
    "../data/processed/metric_catboost.pkl",
)
utils.save_as_pickle(
    metric(val_target, logistic.predict(val_data), "LogisticRegression"),
    "../data/processed/metric_logistic.pkl",
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [37]:
 # Display the stored CatBoost metric table.
 pd.read_pickle('../data/processed/metric_catboost.pkl')

Unnamed: 0,Accuracy,Recall,Precision,F1
CatBoost,0.534031,0.254799,0.34555,0.27993


In [38]:
 # Display the stored logistic-regression metric table.
 pd.read_pickle('../data/processed/metric_logistic.pkl')

Unnamed: 0,Accuracy,Recall,Precision,F1
LogisticRegression,0.303665,0.335079,0.209337,0.238062


In [39]:
# End-to-end variant: preprocessing and CatBoost in a single pipeline, fitted
# on the untransformed training split. After this cell `catboost` refers to
# the fitted Pipeline rather than the bare classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocess_pipe),
        (
            "modelling",
            CatBoostClassifier(
                iterations=100,
                loss_function='MultiLogloss',
                custom_metric=['Recall', "F1", "Precision"],
                silent=True,
            ),
        ),
    ]
)
catboost = pipeline.fit(pd.DataFrame(train_data), pd.DataFrame(train_target))