In [37]:
import pandas as pd
import numpy as np

In [38]:
# READ THE DISTRICTS CSV AND EXPOSE ITS 'id' COLUMN AS 'district_id'
df = pd.read_csv("../data/districts.csv").rename(columns={'id': 'district_id'})

In [39]:
# DISPLAY THE LOADED FRAME TO INSPECT ITS COLUMNS AND VALUES
df

Unnamed: 0,district_id,name,region,population,num_cities,urban_ratio,avg_salary,entrepreneur_1000,municipality_1,municipality_2,municipality_3,municipality_4,unemployment_rate,commited_crimes
0,1,Hl.m. Praha,Prague,1204953,1,100.0,12541,167,0,0,0,1,"[0.2,0.43]","[85677,99107]"
1,2,Benesov,central Bohemia,88884,5,46.7,8507,132,80,26,6,2,"[1.6,1.85]","[2159,2674]"
2,3,Beroun,central Bohemia,75232,5,41.7,8980,111,55,26,4,1,"[1.9,2.21]","[2824,2813]"
3,4,Kladno,central Bohemia,149893,6,67.4,9753,109,63,29,6,2,"[4.6,5.05]","[5244,5892]"
4,5,Kolin,central Bohemia,95616,6,51.4,9307,118,65,30,4,1,"[3.8,4.43]","[2616,3040]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,73,Opava,north Moravia,182027,7,56.4,8746,90,17,49,12,2,"[3.3,3.74]","[4355,4433]"
73,74,Ostrava - mesto,north Moravia,323870,1,100.0,10673,100,0,0,0,1,"[4.7,5.44]","[18782,18347]"
74,75,Prerov,north Moravia,138032,5,64.6,8819,99,67,30,4,2,"[5.3,5.66]","[4063,4505]"
75,76,Sumperk,north Moravia,127369,7,51.2,8369,107,31,32,13,2,"[4.7,5.88]","[3736,2807]"


In [40]:
# WORK ON A COPY OF THE LOADED FRAME AND RENAME THE FOUR 'municipality_N' COLUMNS IN ONE PASS
new_df = df.copy().rename(
    columns={
        'municipality_1': 'pop_500',
        'municipality_2': 'pop_500_1999',
        'municipality_3': 'pop_2000_9999',
        'municipality_4': 'pop_10000',
    }
)

In [41]:
# TRIM LEADING/TRAILING WHITESPACE FROM EVERY COLUMN LABEL
new_df.columns = [column_label.strip() for column_label in new_df.columns]

In [42]:
# TURN EVERY CELL THAT IS EMPTY OR WHITESPACE-ONLY INTO NaN (THE PATTERN IS A REGULAR EXPRESSION)
new_df.replace(to_replace=r'^\s*$', value=np.nan, regex=True, inplace=True)

# TALLY THE MISSING VALUES PER COLUMN; THE PRINTED COUNTS ARE ALL ZERO FOR THIS DATA
missing_values_num = new_df.isnull().sum()
print(missing_values_num)

district_id          0
name                 0
region               0
population           0
num_cities           0
urban_ratio          0
avg_salary           0
entrepreneur_1000    0
pop_500              0
pop_500_1999         0
pop_2000_9999        0
pop_10000            0
unemployment_rate    0
commited_crimes      0
dtype: int64


In [43]:
# SPLIT THE "[a,b]" STRINGS IN 'UNEMPLOYMENT_RATE' ON THE COMMA: PIECE 0 IS "[a", PIECE 1 IS "b]"
split_data = new_df['unemployment_rate'].str.split(',', expand=True)

# REMOVE THE LEFTOVER BRACKET FROM EACH PIECE AND STORE IT IN ITS OWN COLUMN.
# regex=False MAKES THE MATCH LITERAL ON EVERY PANDAS VERSION: A LONE '[' IS NOT A VALID
# REGULAR EXPRESSION, AND OLDER PANDAS DEFAULTS TO regex=True (WITH A FutureWarning HERE).
# NOTE: THE VALUES ARE STILL STRINGS AFTER THIS STEP, NOT NUMBERS.
new_df['unemployment_rate_95'] = split_data[0].str.replace('[', '', regex=False)
new_df['unemployment_rate_96'] = split_data[1].str.replace(']', '', regex=False)

# DROP THE OLD COMBINED COLUMN (RE-RUNNING THIS CELL WITHOUT RE-CREATING new_df WILL FAIL,
# BECAUSE 'unemployment_rate' NO LONGER EXISTS AFTERWARDS)
new_df = new_df.drop(columns=['unemployment_rate'])

In [44]:
# SPLIT THE "[a,b]" STRINGS IN 'COMMITED_CRIMES' ON THE COMMA: PIECE 0 IS "[a", PIECE 1 IS "b]"
split_data = new_df['commited_crimes'].str.split(',', expand=True)

# REMOVE THE LEFTOVER BRACKET FROM EACH PIECE AND STORE IT IN ITS OWN COLUMN.
# regex=False MAKES THE MATCH LITERAL ON EVERY PANDAS VERSION: A LONE '[' IS NOT A VALID
# REGULAR EXPRESSION, AND OLDER PANDAS DEFAULTS TO regex=True (WITH A FutureWarning HERE).
# NOTE: THE VALUES ARE STILL STRINGS AFTER THIS STEP, NOT NUMBERS.
new_df['commited_crimes_95'] = split_data[0].str.replace('[', '', regex=False)
new_df['commited_crimes_96'] = split_data[1].str.replace(']', '', regex=False)

# DROP THE OLD COMBINED COLUMN (RE-RUNNING THIS CELL WITHOUT RE-CREATING new_df WILL FAIL,
# BECAUSE 'commited_crimes' NO LONGER EXISTS AFTERWARDS)
new_df = new_df.drop(columns=['commited_crimes'])

In [45]:
# DISPLAY THE TRANSFORMED FRAME TO CHECK THE RENAMED AND SPLIT COLUMNS
new_df

Unnamed: 0,district_id,name,region,population,num_cities,urban_ratio,avg_salary,entrepreneur_1000,pop_500,pop_500_1999,pop_2000_9999,pop_10000,unemployment_rate_95,unemployment_rate_96,commited_crimes_95,commited_crimes_96
0,1,Hl.m. Praha,Prague,1204953,1,100.0,12541,167,0,0,0,1,0.2,0.43,85677,99107
1,2,Benesov,central Bohemia,88884,5,46.7,8507,132,80,26,6,2,1.6,1.85,2159,2674
2,3,Beroun,central Bohemia,75232,5,41.7,8980,111,55,26,4,1,1.9,2.21,2824,2813
3,4,Kladno,central Bohemia,149893,6,67.4,9753,109,63,29,6,2,4.6,5.05,5244,5892
4,5,Kolin,central Bohemia,95616,6,51.4,9307,118,65,30,4,1,3.8,4.43,2616,3040
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,73,Opava,north Moravia,182027,7,56.4,8746,90,17,49,12,2,3.3,3.74,4355,4433
73,74,Ostrava - mesto,north Moravia,323870,1,100.0,10673,100,0,0,0,1,4.7,5.44,18782,18347
74,75,Prerov,north Moravia,138032,5,64.6,8819,99,67,30,4,2,5.3,5.66,4063,4505
75,76,Sumperk,north Moravia,127369,7,51.2,8369,107,31,32,13,2,4.7,5.88,3736,2807


In [None]:
# WRITE THE CLEANED FRAME TO 'districts_py.csv' AS UTF-8, LEAVING OUT THE ROW INDEX
new_df.to_csv(
    'districts_py.csv',
    encoding='utf-8',
    index=False,
)