In [1]:
import os
import pandas as pd
import numpy as np
from pandas.api.types import is_object_dtype
from sklearn.impute import KNNImputer

In [170]:
# Source tables: county-level data, municipality-level data and the 2019 election results.
df_counties = pd.read_csv('data_own/data_counties.csv')
df_municipalities = pd.read_csv('data_own/data_municipalities.csv')
df_politics = pd.read_csv('data_own/wybory_2019_clear.csv')

# Spatial table: drop the unnamed leading column and rename the key so it matches
# the `municipality_code` column used for merging later on.
df_geopgraphy = (
    pd.read_csv('data_own/spatial_municipality_corrected2.csv')
    .drop(columns=['Unnamed: 0'])
    .rename(columns={'mncplty_c': 'municipality_code'})
)

# Collapse the three `within_*` flags into a single `partitions` code.
# The order matters: if a row carries more than one flag, the later code overwrites the earlier one.
for partition_flag, partition_code in (
    ('within_russian', 1),
    ('within_prussian', 2),
    ('within_austrian', 3),
):
    flagged_rows = df_geopgraphy[partition_flag].astype(int) == 1
    df_geopgraphy.loc[flagged_rows, 'partitions'] = partition_code

df_geopgraphy = df_geopgraphy[['municipality_code', 'Longitude', 'Latitude', 'partitions']]

# Only the coordinates (used as the join key later) and the two distance features are kept.
df_places = pd.read_csv('data_own/processed_df_modellingV3.csv')[
    ['Longitude', 'Latitude', 'min_dist', 'no_loc_in10km']
]

print(df_counties.shape, df_municipalities.shape, df_politics.shape, df_geopgraphy.shape, df_places.shape)

(380, 12) (2477, 100) (2496, 16) (2477, 4) (2476, 4)


In [171]:
# Election table: expose the TERYT code as `municipality_code` and replace spaces in
# every column name with underscores. The underscore map is built from the column
# names as they are before the TERYT rename.
underscored_names = {name: name.replace(' ', '_') for name in df_politics.columns}
df_politics = (
    df_politics
    .rename(columns={"Kod TERYT": "municipality_code"})
    .rename(columns=underscored_names)
)

# Attach county-level attributes to every municipality.
df = df_municipalities.merge(df_counties, on="county_code")
print(df.shape)

# The election table is joined on the municipality code with its last character removed.
code_without_last_char = df['municipality_code'].astype(str).str.slice(stop=-1)
df['municipality_code_politics'] = code_without_last_char.astype(int)
df = df.merge(
    df_politics,
    left_on="municipality_code_politics",
    right_on="municipality_code",
    how="left",
)
print(df.shape)

# Keep the full municipality code and discard the helper key and the election-side key.
df = df.rename(columns={'municipality_code_x': 'municipality_code'})
df = df.drop(columns=['municipality_code_politics', 'municipality_code_y'])

columns_checking = [
    'appartments_per_1000_persons',
    'unemployed_total',
    'unemployed_f',
    'unemployed_m',
    'unemployed_up_to_25',
    'unemployed_over_50',
    'unemployed_long_term',
]
print(df.shape)

# These columns are handled as strings: spaces are stripped out before casting to float.
for text_number_column in columns_checking:
    without_spaces = df[text_number_column].str.replace(' ', '')
    df[text_number_column] = without_spaces.astype(float)
print(df.shape)

# Spatial attributes are joined by municipality code; the place-based features are
# then joined on the exact coordinate pair.
df = df.merge(df_geopgraphy, on='municipality_code', how="left")
df = df.merge(df_places, on=['Longitude', 'Latitude'], how="left")
print(df.shape)

(2477, 111)
(2477, 128)
(2477, 126)
(2477, 126)
(2477, 131)


In [172]:
# Back-fill the missing values of the row(s) with municipality code 1465011.
# Presumably this is Warsaw, whose election results are reported per district in
# `df_politics` — TODO confirm against the election file.
warsaw_rows = df['municipality_code'] == 1465011

warsaw_vote_columns = [
    'Liczba_wyborców_uprawnionych_do_głosowania',
    'Liczba_otrzymanych_kopert_zwrotnych', 'Liczba_kart_nieważnych',
    'Liczba_kart_ważnych', 'Liczba_głosów_nieważnych',
    'Liczba_głosów_ważnych_oddanych_łącznie_na_wszystkie_listy_kandydatów',
    'PO', 'Konfederacja', 'PSL', 'PIS', 'SLD', 'Inne'
]

# Column-wise totals over all election rows whose `Powiat` is "Warszawa".
warsaw_vote_totals = df_politics.loc[
    df_politics['Powiat'] == "Warszawa", warsaw_vote_columns
].sum(axis=0)

# Fill order: vote totals first, then fixed values for the two place-based features,
# then the placeholder string "nic" for anything still missing.
# NOTE(review): the final fill is applied to every column, so a numeric column that is
# still NaN at that point would receive a string — confirm that only text columns remain.
warsaw_filled = df.loc[warsaw_rows].fillna(warsaw_vote_totals)
warsaw_filled = warsaw_filled.fillna({'min_dist': 0, 'no_loc_in10km': 58})
warsaw_filled = warsaw_filled.fillna("nic")

df.loc[warsaw_rows] = warsaw_filled

### Feature engineering

In [173]:
columns_partie = ['PO', 'Konfederacja', 'PSL', 'PIS', 'SLD', 'Inne']
# Snapshot of the object-dtype columns as they are at this point.
object_columns = df.select_dtypes(include=['object'])

# The last character of the municipality code is stored as an integer type indicator.
municipality_code_text = df['municipality_code'].astype(str)
df['type_of_municipality'] = municipality_code_text.str.get(-1).astype(int)

# Vote share of each party list relative to all valid votes cast.
valid_votes_total = df['Liczba_głosów_ważnych_oddanych_łącznie_na_wszystkie_listy_kandydatów']
for party in columns_partie:
    df[f'{party}_percent'] = df[party].div(valid_votes_total)

# Vaccination percentage divided by 100.
df['percent_vaccinated_01'] = df['percent_vaccinated'].div(100)

def logit_inverse(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``.

    Works element-wise on anything ``np.exp`` accepts (scalars, arrays, Series).
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def logit(x):
    """Return the log-odds log(x / (1 - x)) of ``x``.

    Works element-wise on anything ``np.log`` accepts. No clipping is applied, so
    inputs of exactly 0 or 1 give non-finite results.
    """
    odds = x / (1 - x)
    return np.log(odds)

# Logit of the vaccination share; `y` is a second column holding the same values.
vaccinated_logit = logit(df['percent_vaccinated_01'])
df['percent_vaccinated_01_r'] = vaccinated_logit
df['y'] = df['percent_vaccinated_01_r']

# Share of women in the population and valid ballots relative to eligible voters.
df['population_total_f_ratio_total'] = df['population_total_f'].div(df['population_total'])
df['frekwencja_wyborcza'] = df['Liczba_kart_ważnych'].div(
    df['Liczba_wyborców_uprawnionych_do_głosowania']
)

# Log and square-root transforms of population density.
population_density = df['population_density']
df['population_density_log'] = np.log(population_density)
df['population_density_sqrt'] = np.sqrt(population_density)

df['revenues_per_capita_PIT_log'] = np.log(df['revenues_per_capita_PIT'])
df['SLD_percent_sqrt'] = np.sqrt(df['SLD_percent'])

# Healthcare advices per inhabitant.
df['healthcare_advices_ratio_total'] = df['healthcare_advices'].div(df['population_total'])

# Forest area relative to the `area_km2` column.
df['forests_area_ratio_area_km2'] = df['forests_area'].div(df['area_km2'])

# Upper bounds of the five-year age bands: 0, 4, 9, ..., 84.
columns_ages = np.arange(start=0, stop=86, step=5)
columns_ages[1:] -= 1

# Pair each bound with the next one to get "<lower>_<upper>" labels, e.g. "5_9".
columns_ages_str = [
    f"{previous_bound + 1}_{upper_bound}"
    for previous_bound, upper_bound in zip(columns_ages[:-1], columns_ages[1:])
]

# The first pair would read "1_4"; the youngest band is labelled "0_4".
columns_ages_str[0] = "0_4"

population_total = df["population_total"]

# Per age band: its share of the total population and the share of women within the band.
for group_label in columns_ages_str:
    band_total = df[f'population_{group_label}_total']
    df[f"population_{group_label}_total_percent"] = band_total.div(population_total)
    df[f"population_{group_label}_total_f_ratio_age_group"] = (
        df[f'population_{group_label}_total_f'].div(band_total)
    )

# Raw per-band columns: all female columns, then all male columns, then all totals.
columns_ages_remove = [
    f'population_{group_label}_total{suffix}'
    for suffix in ('_f', '_m', '')
    for group_label in columns_ages_str
]

columns_ages_remove_f_percent = [
    f'population_{group_label}_total_f_ratio_age_group' for group_label in columns_ages_str
]

# Unemployed over 50 relative to the population in bands [10:12] of the label list
# (50_54 and 55_59 for the 0_4 ... 80_84 sequence).
columns_age_50_59 = [f'population_{group_label}_total' for group_label in columns_ages_str[10:12]]
df['unemployed_over_50_ratio_population_50_60'] = df['unemployed_over_50'].div(
    df[columns_age_50_59].sum(axis=1)
)

# Hospital beds relative to the population summed over all rows sharing the county code.
county_population = df.groupby('county_code')['population_total'].transform('sum')
df['beds_in_hospitals_ratio_population'] = df['beds_in_hospitals'].div(county_population)

# Population share of the bands from index 12 onwards (60_64 and above in the label list).
columns_age_60_plus = [f'population_{group_label}_total' for group_label in columns_ages_str[12:]]
df['percent_over_60'] = df[columns_age_60_plus].sum(axis=1).div(population_total)

# NOTE(review): the first four bands run through 15_19, so this share also includes
# ages 18-19 despite the column name.
columns_age_youngest = [f'population_{group_label}_total' for group_label in columns_ages_str[:4]]
df['percent_under_18'] = df[columns_age_youngest].sum(axis=1).div(population_total)

df['percent_over_60_squared'] = df['percent_over_60'].pow(2)

# Rescale "per 1000" / "per 10k" rates and the business count to per-person values.
df['cars_per_1_persons'] = df['cars_per_1000_persons'].div(1000)
df['persons_running_business_per_1_person'] = df['persons_running_business'].div(population_total)
df['entities_registered_per_1_persons'] = df['entities_registered_per_10k_persons'].div(10000)

# Grouping key derived from the county code: counties that appear on exactly one row
# are pooled under the shared code 1, all other rows keep their own county code.
df['county_code_random'] = df['county_code']

county_row_counts = df['county_code'].value_counts()
indexes_shape_1 = county_row_counts[county_row_counts == 1].index

# Restrict the assignment to `county_code_random`. Without a column selector,
# `df.loc[mask] = 1` overwrites EVERY column of the matching rows with 1
# (target, coordinates and all features included).
single_row_county = df['county_code'].isin(indexes_shape_1)
df.loc[single_row_county, 'county_code_random'] = 1

In [174]:
# Input columns that are dropped by name before modelling (raw counts, identifiers
# and intermediate versions of the vaccination variable).
columns_drop_inputs = [
    'population_total_m', 'population_total_f',
    'municipality_code', 'municipality_name',
    'area_km2',
    'unemployment_rate_m', 'unemployment_rate_f',
    'unemployed_total',
    'unemployed_up_to_25', 'unemployed_over_50', 'unemployed_long_term',
    'persons_running_business', 'entities_registered_per_10k_persons', 'healthcare_advices',
    'percent_vaccinated', "Inne_percent",
    'beds_in_hospitals',
    'percent_vaccinated_01_r', 'percent_vaccinated_01',
]

# Text label columns.
columns_drop_labels = [
    'municipality_name',
    'county_name',
    'Gmina',
    'Powiat',
    'Województwo',
]

# Also dropped: the per-band population columns, the per-band female ratios and
# every column of the election table.
columns_drop_all = (
    columns_drop_inputs
    + columns_ages_remove
    + columns_ages_remove_f_percent
    + df_politics.columns.tolist()
    + columns_drop_labels
)

df_modelling = df.drop(columns=columns_drop_all).copy()

### Imputation

In [175]:
# Missing-value count per column; show the seven largest counts (ascending, largest last).
missing_per_column = df_modelling.isna().sum()
missing_per_column.sort_values().tail(7)

revenues_per_capita                0
net_scholarization                 0
county_code_random                 0
children_3_5_in_kindergartens      1
forests_area                      16
forests_area_ratio_area_km2       16
healthcare_advices_ratio_total    59
dtype: int64

In [176]:
data = df_modelling.copy()

# Each listed column is imputed on its own, together with the two coordinate columns,
# using a 5-nearest-neighbours imputer.
coordinate_columns = ['Longitude', 'Latitude']
columns_to_impute = [
    'healthcare_advices_ratio_total',
    'forests_area_ratio_area_km2',
    'forests_area',
    'children_3_5_in_kindergartens',
    'tourits_per_1000_persons',
]

imputer = KNNImputer(n_neighbors=5)

for imputed_column in columns_to_impute:
    imputation_columns = coordinate_columns + [imputed_column]
    # Read from the untouched `df_modelling` so every fit sees the original values,
    # exactly as if all input arrays had been extracted before any write-back.
    imputed_values = imputer.fit_transform(df_modelling[imputation_columns].values)
    # The coordinate columns are written back together with the imputed column.
    data[imputation_columns] = imputed_values

df_modelling = data.copy()

In [177]:
# Re-check after imputation: the seven largest per-column missing-value counts.
missing_after_imputation = df_modelling.isna().sum()
missing_after_imputation.sort_values().tail(7)

expenditures_per_capita          0
revenues_per_capita_CIT          0
revenues_per_capita_PIT          0
revenues_per_capita              0
bicycle_paths_per_10k_persons    0
Longitude                        0
county_code_random               0
dtype: int64

### Saving data for modelling

In [178]:
# Write the modelling table to the working directory. No `index` argument is passed,
# so `to_csv` uses its default index handling.
output_csv_path = "processed_df_modelling.csv"
df_modelling.to_csv(output_csv_path)