In [None]:
# previous month, average for the observed month, average over all time

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
def positive(a):
    """Clamp a number at zero from below.

    Returns the integer 0 when ``a`` is negative, otherwise returns ``a``
    itself, unchanged.
    """
    return 0 if a < 0 else a

In [None]:
def townClass(x):
    """Map a population figure to a size label.

    The label is the first band whose lower bound is strictly exceeded by
    ``x``; anything at or below 500 falls into "XXS".
    """
    # (exclusive lower bound, label), ordered from the largest band down.
    size_bands = (
        (600000, "XXL"),
        (100000, "XL"),
        (15000, "L"),
        (3000, "M"),
        (1000, "S"),
        (500, "XS"),
    )
    for lower_bound, label in size_bands:
        if x > lower_bound:
            return label
    return "XXS"
# Load the town table indexed by ADRES, derive a size class from POPULATION,
# and keep only that class as a Series named TOWN_SIZE.
towns = (
    pd.read_csv('towns.csv', sep=',', index_col='ADRES', dtype={'POPULATION':int})
    .assign(TOWN_SIZE=lambda table: table['POPULATION'].apply(townClass))
    .loc[:, 'TOWN_SIZE']
)


In [None]:
# Read the raw train / test tables into DataFrames.
# The descriptive columns are read as strings; the target column (present only
# in the train read) is read as an integer.
text_columns = {'PATIENT_SEX':str, 'MKB_CODE':str, 'ADRES':str, 'VISIT_MONTH_YEAR':str, 'AGE_CATEGORY':str}

train = pd.read_csv('train_dataset_train.csv', sep=';', index_col=None, dtype={**text_columns, 'PATIENT_ID_COUNT':int})
test = pd.read_csv('test_dataset_test.csv', sep=';', index_col=None, dtype=text_columns)

In [None]:
# Turn the VISIT_MONTH_YEAR strings into timestamps by prefixing "01." and
# derive string-typed month / year columns from them, for both frames.
# NOTE(review): pd.to_datetime is called without `format` / `dayfirst`, so the
# "01." prefix may be interpreted as the month rather than the day — confirm
# against the actual data before relying on VISIT_MONTH.
for frame in (train, test):
    frame['VISIT_MONTH_YEAR'] = pd.to_datetime(frame['VISIT_MONTH_YEAR'].map(lambda raw: "01." + raw))
    visit_dates = frame['VISIT_MONTH_YEAR']
    frame['VISIT_MONTH'] = visit_dates.dt.month.astype(str)
    frame['VISIT_YEAR'] = visit_dates.dt.year.astype(str)

In [None]:
# Historical-average features: mean PATIENT_ID_COUNT of the training data per
# (sex, diagnosis code, address, age category) group, once per VISIT_MONTH and
# once per VISIT_YEAR, joined onto both train and test.
# NOTE(review): the averages are computed over all of `train`, so every
# training row's feature includes its own target value — confirm this leakage
# is acceptable for the validation score.
group_keys = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'AGE_CATEGORY']
month_keys = group_keys + ['VISIT_MONTH']
year_keys = group_keys + ['VISIT_YEAR']

# Typical count for this month.
# The target column is selected before aggregating so the result holds only
# the group keys plus the mean. Calling .mean() on the whole frame would also
# try to aggregate the string / datetime columns, which either raises or adds
# extra columns that collide in the merges below, depending on pandas version.
monthdatagrouped = (
    train.groupby(month_keys, as_index=False)['PATIENT_ID_COUNT']
    .mean()
    .rename(columns={'PATIENT_ID_COUNT': 'PATIENT_ID_COUNT_MONTH'})
)

# Typical count for this year.
yeardatagrouped = (
    train.groupby(year_keys, as_index=False)['PATIENT_ID_COUNT']
    .mean()
    .rename(columns={'PATIENT_ID_COUNT': 'PATIENT_ID_COUNT_YEAR'})
)

# Left joins keep every original row; groups unseen in train yield NaN features.
train = pd.merge(train, monthdatagrouped, on=month_keys, how="left")
test  = pd.merge(test, monthdatagrouped, on=month_keys, how="left")

train = pd.merge(train, yeardatagrouped, on=year_keys, how="left")
test  = pd.merge(test, yeardatagrouped, on=year_keys, how="left")

# Attach the town size class by address.
train = pd.merge(train, towns, on=['ADRES'], how="left")
test  = pd.merge(test, towns, on=['ADRES'], how="left")

In [None]:
# Remove the VISIT_MONTH column from both frames.
train = train.drop(columns=['VISIT_MONTH'])
test = test.drop(columns=['VISIT_MONTH'])

In [None]:
# Feature columns passed to the model, in the order used for training.
data = [
    'PATIENT_SEX',
    'MKB_CODE',
    'ADRES',
    'VISIT_MONTH_YEAR',
    'AGE_CATEGORY',
    'VISIT_YEAR',
    'TOWN_SIZE',
    'PATIENT_ID_COUNT_MONTH',
    'PATIENT_ID_COUNT_YEAR',
]
# Subset of the feature columns that is declared categorical.
categories = [
    'PATIENT_SEX',
    'MKB_CODE',
    'ADRES',
    'AGE_CATEGORY',
    'VISIT_YEAR',
    'TOWN_SIZE',
]

In [None]:
# Separate the feature columns from the target column.
X, y = train[data], train[['PATIENT_ID_COUNT']]

In [None]:
# Split the dataset by date: rows whose VISIT_MONTH_YEAR equals `separator`
# become the validation set, every other row is used for fitting.
# NOTE(review): the literal is parsed without `dayfirst` / `format`; confirm it
# is interpreted the same way as the VISIT_MONTH_YEAR column.
separator = pd.to_datetime("01.07.2021")
is_validation = train['VISIT_MONTH_YEAR'] == separator
train_train = train[~is_validation]
train_val = train[is_validation]

target_column = ['PATIENT_ID_COUNT']
X_train, y_train = train_train[data], train_train[target_column]
X_test, y_test = train_val[data], train_val[target_column]

In [None]:
# Wrap the splits in CatBoost Pool objects; the advantage is that the
# categorical feature columns can be declared explicitly.
pool_train = Pool(data=X_train, label=y_train, cat_features=categories)
pool_test = Pool(data=X_test, label=y_test, cat_features=categories)

In [None]:
# Configure the CatBoostRegressor and train it.
catboost_params = dict(
    task_type='GPU',
    depth=14,
    learning_rate=0.01,
    iterations=10000,
    random_state=1,
    use_best_model=True,
)
model = CatBoostRegressor(**catboost_params)
model.fit(pool_train, eval_set=pool_test, plot=True, verbose=0)

# Model predictions on the local hold-out split.
y_pred = model.predict(pool_test)

# Local R2 score. The same pool serves as eval_set with use_best_model=True,
# so the model was selected on the data it is scored on here.
validation_r2 = r2_score(y_test, y_pred)
print("Значение метрики R2 на test: ", validation_r2)

In [None]:
# Display the fitted model's tree_count_ attribute (per the CatBoost docs, the
# number of trees in the model).
model.tree_count_


In [None]:
# Display the fitted model's feature importances in prettified form.
model.get_feature_importance(prettified = True)

In [None]:
# Produce predictions for the submission data.
# Pass exactly the training feature columns, in training order. After the
# merges `test` carries TOWN_SIZE as its last column, whereas the model was
# fitted on `train[data]`, where TOWN_SIZE precedes the two averaged counts.
# Handing over the raw frame would give the model a different feature layout.
pool_test_solution = Pool(test[data], cat_features = categories)

# Output path for the submission file.
filename = 'solution_combo_test.csv'
y_pred_solution = model.predict(pool_test_solution)

In [None]:
# Build the submission frame for the platform: re-read the raw test file,
# attach the predictions, round them to whole numbers and replace negative
# values with 0.
submission_dtypes = {'PATIENT_SEX':str, 'MKB_CODE':str, 'ADRES':str, 'VISIT_MONTH_YEAR':str, 'AGE_CATEGORY':str}
test_ = pd.read_csv('test_dataset_test.csv', sep=';', index_col=None, dtype=submission_dtypes)
test_['PATIENT_ID_COUNT'] = y_pred_solution
test_['PATIENT_ID_COUNT'] = test_['PATIENT_ID_COUNT'].map(lambda raw: positive(round(raw)))

In [None]:
# Save the submission to a csv file.
submission_columns = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY','PATIENT_ID_COUNT']
test_[submission_columns].to_csv(filename, sep=';', index=None)

In [None]:
# Sorted unique ADRES values among training rows dated pd.to_datetime("01.10.2020").
np.sort(train.loc[train['VISIT_MONTH_YEAR'] == pd.to_datetime("01.10.2020"), 'ADRES'].unique())

In [None]:
# Number of non-null ADRES values among training rows dated pd.to_datetime("01.10.2020").
train.loc[train['VISIT_MONTH_YEAR'] == pd.to_datetime("01.10.2020"), 'ADRES'].count()

In [None]:
# Sorted unique ADRES values among training rows dated pd.to_datetime("01.11.2021").
np.sort(train.loc[train['VISIT_MONTH_YEAR'] == pd.to_datetime("01.11.2021"), 'ADRES'].unique())

In [None]:
# Number of non-null ADRES values among training rows dated pd.to_datetime("01.07.2021").
train.loc[train['VISIT_MONTH_YEAR'] == pd.to_datetime("01.07.2021"), 'ADRES'].count()

In [None]:
# Sorted unique ADRES values present in the test frame.
np.sort(pd.unique(test['ADRES']))

In [None]:
# Non-null value count for every column of the test frame.
test.count()

In [None]:
# Sorted unique VISIT_MONTH_YEAR values of training rows for one specific address.
np.sort(train.loc[train['ADRES'] == "Светлогорск", 'VISIT_MONTH_YEAR'].unique())

In [None]:
# Non-null counts of every column for each VISIT_MONTH_YEAR value in train.
train.groupby(by='VISIT_MONTH_YEAR').count()