In [4]:
! pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.0-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 5.8 MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.5.0


In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error

pd.set_option('display.max_columns', 500)
from category_encoders.target_encoder import TargetEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

  import pandas.util.testing as tm


In [2]:
PATH_TO_DATA = 'covid_data_train_new.csv'
PATH_TO_TEST = 'covid_data_test.csv'

In [3]:
covid_df = pd.read_csv(PATH_TO_DATA)
covid_df = covid_df.drop(columns=['Unnamed: 0'])

# исключим дубликаты городов и записи без таргета 
covid_df = covid_df[covid_df['inf_rate'].notna()]
covid_df = covid_df.drop_duplicates(subset='lat')

### Сгенерируем признаки

In [4]:
# строки нельзя подать в Random Forest, поэтому будем использовать для них TargetEncoder
covid_df.select_dtypes('object')

Unnamed: 0,name,district,subject,region_x
0,Абаза,Сибирский,Хакасия,Республика Хакасия
1,Абакан,Сибирский,Хакасия,Республика Хакасия
2,Абдулино,Приволжский,Оренбургская область,Оренбургская область
3,Абинск,Южный,Краснодарский край,Краснодарский край
4,Агрыз,Приволжский,Татарстан,Республика Татарстан
...,...,...,...,...
446,Макарьев,Центральный,Костромская область,Костромская область
447,Макушино,Уральский,Курганская область,Курганская область
448,Малая Вишера,Северо-Западный,Новгородская область,Новгородская область
449,Малоархангельск,Центральный,Орловская область,Орловская область


In [5]:
te = TargetEncoder().fit(covid_df.select_dtypes('object'), covid_df['inf_rate'])
cat_f = te.transform(covid_df.select_dtypes('object'))
covid_df_rf = covid_df.drop(columns=covid_df.select_dtypes('object').columns).join(cat_f).drop(columns='name')
# добавим обратно все признаки, кроме name, потому что это айдишник, будет лик



In [6]:
# доля больных туберкулезом в городе
covid_df_rf['patients_percent_2017'] = covid_df_rf.apply(
    lambda x: x['num_patients_tubercul_2017']/x['population'], axis=1)

In [7]:
# объем предлагаемых населению услуг, суммарно
covid_df_rf['volume_serv_total'] = covid_df_rf.apply(lambda x: x['volume_serv_household_2017'] +\
                            x['volume_serv_chargeable_2017']+\
                            x['volume_serv_transport_2017'] +\
                            x['volume_serv_post_2017']+\
                            x['volume_serv_accommodation_2017'] +\
                            x['volume_serv_telecom_2017']+\
                            x['volume_serv_others_2017'] +\
                            x['volume_serv_veterinary_2017']+\
                            x['volume_serv_housing_2017'] +\
                            x['volume_serv_education_2017']+\
                            x['volume_serv_medicine_2017'] +\
                            x['volume_serv_disabled_2017']+\
                            x['volume_serv_culture_2017'] +\
                            x['volume_serv_sport_2017']+\
                            x['volume_serv_hotels_2017'] +\
                            x['volume_serv_tourism_2017']+\
                            x['volume_serv_sanatorium_2017'],
                            axis=1
                        )

In [8]:
[x for x in covid_df_rf.columns if 'urban' in x]

['urban_50-54_years',
 'urban_55-59_years',
 'urban_60-64_years',
 'urban_65-69_years',
 'urban_70-74_years',
 'urban_75-79_years',
 'urban_80-84_years',
 'urban_85-89_years',
 'urban_90-94_years',
 'num_phones_urban_2019',
 'urban']

In [9]:
# те же признаки, но в процентах от населения города
covid_df_rf['urban_50-54_years_percent'] = covid_df.apply(
    lambda x: x['urban_50-54_years']/x['whole_population'], axis=1)
covid_df_rf['urban_55-59_years_percent'] = covid_df.apply(
    lambda x: x['urban_55-59_years']/x['whole_population'], axis=1)
covid_df_rf['urban_60-64_years_percent'] = covid_df.apply(
    lambda x: x['urban_60-64_years']/x['whole_population'], axis=1)
covid_df_rf['urban_65-69_years_percent'] = covid_df.apply(
    lambda x: x['urban_65-69_years']/x['whole_population'], axis=1)
covid_df_rf['urban_70-74_years_percent'] = covid_df.apply(
    lambda x: x['urban_70-74_years']/x['whole_population'], axis=1)
covid_df_rf['urban_75-79_years_percent'] = covid_df.apply(
    lambda x: x['urban_75-79_years']/x['whole_population'], axis=1)
covid_df_rf['urban_80-84_years_percent'] = covid_df.apply(
    lambda x: x['urban_80-84_years']/x['whole_population'], axis=1)
covid_df_rf['urban_85-89_years_percent'] = covid_df.apply(
    lambda x: x['urban_85-89_years']/x['whole_population'], axis=1)
covid_df_rf['urban_90-94_years_percent'] = covid_df.apply(
    lambda x: x['urban_90-94_years']/x['whole_population'], axis=1)
covid_df_rf['urban_percent'] = covid_df.apply(
    lambda x: x['urban']/x['whole_population'], axis=1)

### Удалим ненужные признаки

In [10]:
covid_df_rf = covid_df_rf.drop(columns=[
    'num_patients_tubercul_1992',
    'num_patients_tubercul_1993',
    'num_patients_tubercul_1994',
    'num_patients_tubercul_1995',
    'num_patients_tubercul_1996',
    'num_patients_tubercul_1997',
    'num_patients_tubercul_1998',
    'num_patients_tubercul_1999',
    'num_patients_tubercul_2000',
    'num_patients_tubercul_2001',
    'num_patients_tubercul_2002',
    'num_patients_tubercul_2003',
    'num_patients_tubercul_2004',
    'num_patients_tubercul_2005',
    'num_patients_tubercul_2006',
    'num_patients_tubercul_2007',
    'num_patients_tubercul_2008',
    'num_patients_tubercul_2009',
    'num_patients_tubercul_2010',
    'num_patients_tubercul_2011',
    'num_patients_tubercul_2012',
    'num_patients_tubercul_2013',
    'num_patients_tubercul_2014',
    'num_patients_tubercul_2015',
    'num_patients_tubercul_2016',
    'num_patients_tubercul_2017',
             ])

In [11]:
for col in covid_df_rf.columns:
  if covid_df_rf[covid_df_rf[col].isna()].shape[0] != 0:
    print(f"COLUMN: {col}")
    print(covid_df_rf[covid_df_rf[col].isna()].shape)

COLUMN: ivl_per_100k
(68, 105)
COLUMN: ivl_number
(68, 105)
COLUMN: ekmo_per_100k
(68, 105)
COLUMN: ekmo_number
(217, 105)
COLUMN: life_quality_place_rating
(295, 105)
COLUMN: ecology
(295, 105)
COLUMN: cleanness
(295, 105)
COLUMN: public_services
(295, 105)
COLUMN: neighbourhood
(295, 105)
COLUMN: children_places
(295, 105)
COLUMN: sport_and_outdoor
(295, 105)
COLUMN: shops_and_malls
(295, 105)
COLUMN: public_transport
(295, 105)
COLUMN: security
(295, 105)
COLUMN: life_costs
(295, 105)
COLUMN: epirank_avia
(352, 105)
COLUMN: epirank_bus
(88, 105)
COLUMN: epirank_train
(144, 105)
COLUMN: epirank_avia_cat
(352, 105)
COLUMN: epirank_bus_cat
(88, 105)
COLUMN: epirank_train_cat
(144, 105)


In [12]:
# много пропущенных значений
covid_df_rf = covid_df_rf.drop(columns=[
    'life_costs',
    'epirank_avia',
    'epirank_bus',
    'epirank_train',
    'epirank_avia_cat',
    'epirank_bus_cat',
    'epirank_train_cat',
    'life_quality_place_rating',
    'ecology',
    'public_services',
    'neighbourhood',
    'children_places',
    'shops_and_malls',
    'public_transport',
    'security'
])

### Обучим модель

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    covid_df_rf.drop(columns=['inf_rate']),
    covid_df_rf['inf_rate'],
    test_size=0.33,
     random_state=42,
     )

In [14]:
covid_df_rf.shape

(395, 90)

In [15]:
clf = RandomForestRegressor(max_depth=5, n_estimators=20, random_state=42)
clf.fit(X_train.fillna(-1), y_train)

RandomForestRegressor(max_depth=5, n_estimators=20, random_state=42)

In [16]:
mean_absolute_error(y_test, clf.predict(X_test.fillna(-1)))

0.011249832803061405

In [17]:
mean_absolute_error(y_train, clf.predict(X_train.fillna(-1)))

0.011923574729247833

GridSearch

In [18]:
clf = RandomForestRegressor(random_state=0)
params = [{'max_depth': [2, 3, 4, 5],
         'n_estimators': [8, 10, 12, 15, 18, 20]}]
clf_gs = GridSearchCV(clf,
                      param_grid=params,
                      scoring='neg_mean_absolute_error',
                      cv=5)
clf_gs.fit(X_train.fillna(-1), y_train)
clf_gs.best_params_

{'max_depth': 5, 'n_estimators': 20}

In [19]:
clf = RandomForestRegressor(max_depth=5, n_estimators=20, random_state=0)
clf.fit(X_train.fillna(-1), y_train)

RandomForestRegressor(max_depth=5, n_estimators=20, random_state=0)

In [20]:
mean_absolute_error(y_test, clf.predict(X_test.fillna(-1)))

0.010789677070830377

In [21]:
mean_absolute_error(y_train, clf.predict(X_train.fillna(-1)))

0.00883888245320205

Сохраним модель и данные

In [22]:
import pickle

In [23]:
with open('model.pickle', 'wb') as handle:
    pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [24]:
X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)

In [25]:
X_train.to_pickle('X_train.pickle')
X_test.to_pickle('X_test.pickle')
y_train.to_pickle('y_train.pickle')
y_test.to_pickle('y_test.pickle')

Признаки с наибольшим feature importance в этой модели:

In [26]:
pd.DataFrame(list(zip(X_train.columns, clf.feature_importances_))).rename(
    columns={0:'feature', 1:'importance'}).sort_values('importance', ascending=False).head(20) #.to_excel('fi_random_forest.xlsx', index=False)

Unnamed: 0,feature,importance
76,region_x,0.438401
75,subject,0.369092
47,work_ratio_15-64_years,0.078389
44,work_ratio_15-72_years,0.07039
45,work_ratio_55-64_years,0.01832
3,density,0.009425
62,volume_serv_sport_2017,0.005673
65,volume_serv_sanatorium_2017,0.00136
61,volume_serv_culture_2017,0.000862
30,urban_70-74_years,0.000827


### Рассчитаем значения для теста

In [27]:
test_df = pd.read_csv(PATH_TO_TEST)

In [28]:
cat_test = te.transform(test_df.select_dtypes('object'))
test_df = test_df.drop(columns=test_df.select_dtypes('object').columns).join(cat_test).drop(columns='name')

In [29]:
test_df['patients_percent_2017'] = test_df.apply(
    lambda x: x['num_patients_tubercul_2017']/x['population'], axis=1)

In [30]:
test_df['volume_serv_total'] = test_df.apply(lambda x: x['volume_serv_household_2017'] +\
                            x['volume_serv_chargeable_2017']+\
                            x['volume_serv_transport_2017'] +\
                            x['volume_serv_post_2017']+\
                            x['volume_serv_accommodation_2017'] +\
                            x['volume_serv_telecom_2017']+\
                            x['volume_serv_others_2017'] +\
                            x['volume_serv_veterinary_2017']+\
                            x['volume_serv_housing_2017'] +\
                            x['volume_serv_education_2017']+\
                            x['volume_serv_medicine_2017'] +\
                            x['volume_serv_disabled_2017']+\
                            x['volume_serv_culture_2017'] +\
                            x['volume_serv_sport_2017']+\
                            x['volume_serv_hotels_2017'] +\
                            x['volume_serv_tourism_2017']+\
                            x['volume_serv_sanatorium_2017'],
                            axis=1
                        )

In [31]:
# те же признаки, но в процентах от населения города
test_df['urban_50-54_years_percent'] = test_df.apply(
    lambda x: x['urban_50-54_years']/x['whole_population'], axis=1)
test_df['urban_55-59_years_percent'] = test_df.apply(
    lambda x: x['urban_55-59_years']/x['whole_population'], axis=1)
test_df['urban_60-64_years_percent'] = test_df.apply(
    lambda x: x['urban_60-64_years']/x['whole_population'], axis=1)
test_df['urban_65-69_years_percent'] = test_df.apply(
    lambda x: x['urban_65-69_years']/x['whole_population'], axis=1)
test_df['urban_70-74_years_percent'] = test_df.apply(
    lambda x: x['urban_70-74_years']/x['whole_population'], axis=1)
test_df['urban_75-79_years_percent'] = test_df.apply(
    lambda x: x['urban_75-79_years']/x['whole_population'], axis=1)
test_df['urban_80-84_years_percent'] = test_df.apply(
    lambda x: x['urban_80-84_years']/x['whole_population'], axis=1)
test_df['urban_85-89_years_percent'] = test_df.apply(
    lambda x: x['urban_85-89_years']/x['whole_population'], axis=1)
test_df['urban_90-94_years_percent'] = test_df.apply(
    lambda x: x['urban_90-94_years']/x['whole_population'], axis=1)
test_df['urban_percent'] = test_df.apply(
    lambda x: x['urban']/x['whole_population'], axis=1)

In [32]:
test_df = test_df.drop(columns=[
    'num_patients_tubercul_1992',
    'num_patients_tubercul_1993',
    'num_patients_tubercul_1994',
    'num_patients_tubercul_1995',
    'num_patients_tubercul_1996',
    'num_patients_tubercul_1997',
    'num_patients_tubercul_1998',
    'num_patients_tubercul_1999',
    'num_patients_tubercul_2000',
    'num_patients_tubercul_2001',
    'num_patients_tubercul_2002',
    'num_patients_tubercul_2003',
    'num_patients_tubercul_2004',
    'num_patients_tubercul_2005',
    'num_patients_tubercul_2006',
    'num_patients_tubercul_2007',
    'num_patients_tubercul_2008',
    'num_patients_tubercul_2009',
    'num_patients_tubercul_2010',
    'num_patients_tubercul_2011',
    'num_patients_tubercul_2012',
    'num_patients_tubercul_2013',
    'num_patients_tubercul_2014',
    'num_patients_tubercul_2015',
    'num_patients_tubercul_2016',
    'num_patients_tubercul_2017',
             ])

In [33]:
test_df = test_df.drop(columns=[
    'life_costs',
    'epirank_avia',
    'epirank_bus',
    'epirank_train',
    'epirank_avia_cat',
    'epirank_bus_cat',
    'epirank_train_cat',
    'life_quality_place_rating',
    'ecology',
    'public_services',
    'neighbourhood',
    'children_places',
    'shops_and_malls',
    'public_transport',
    'security'
])

In [35]:
predicted = clf.predict(test_df.fillna(-1).drop(columns=['Unnamed: 0', 'inf_rate']))

In [36]:
sub_df = pd.read_csv('submission.csv')

In [38]:
sub_df['inf_rate'] = predicted

In [39]:
sub_df.to_csv('submission.csv')