In [14]:
library('tidyverse')
library(dplyr)

In [15]:
# READ THE DATA
df <- read.csv("../data/districts.csv")

# RENAME THE ID COLUMN
df <- df %>%
        rename(district_id = id)

In [16]:
# COPY THE DATAFRAME AND CHANGE THE COLUMN NAME
new_df <- df %>%
  rename(
    pop_500 = municipality_1,
    pop_500_1999 = municipality_2,
    pop_2000_9999 = municipality_3,
    pop_10000 = municipality_4
  )

new_df

district_id,name,region,population,num_cities,urban_ratio,avg_salary,entrepreneur_1000,pop_500,pop_500_1999,pop_2000_9999,pop_10000,unemployment_rate,commited_crimes
<int>,<chr>,<chr>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>
1,Hl.m. Praha,Prague,1204953,1,100.0,12541,167,0,0,0,1,"[0.2,0.43]","[85677,99107]"
2,Benesov,central Bohemia,88884,5,46.7,8507,132,80,26,6,2,"[1.6,1.85]","[2159,2674]"
3,Beroun,central Bohemia,75232,5,41.7,8980,111,55,26,4,1,"[1.9,2.21]","[2824,2813]"
4,Kladno,central Bohemia,149893,6,67.4,9753,109,63,29,6,2,"[4.6,5.05]","[5244,5892]"
5,Kolin,central Bohemia,95616,6,51.4,9307,118,65,30,4,1,"[3.8,4.43]","[2616,3040]"
6,Kutna Hora,central Bohemia,77963,4,51.5,8546,126,60,23,4,2,"[2.9,4.02]","[2640,3120]"
7,Melnik,central Bohemia,94725,6,63.4,9920,130,38,28,1,3,"[2.2,2.87]","[4289,4846]"
8,Mlada Boleslav,central Bohemia,112065,8,69.4,11277,127,95,19,7,1,"[1.2,1.44]","[5179,4987]"
9,Nymburk,central Bohemia,81344,6,55.3,8899,149,61,23,4,2,"[3.3,3.97]","[2987,2487]"
10,Praha - vychod,central Bohemia,92084,5,46.7,10124,141,55,29,4,3,"[0.5,0.54]","[3810,4316]"


In [17]:
# REMOVE SPACES FROM COLUMN NAMES
colnames(new_df) <- gsub(" ", "", colnames(new_df))

In [18]:
# REPLACE ALL CELLS THAT ARE ENTIRELY SPACE (OR EMPTY) WITH NAN 
new_df <- apply(new_df, 2, function(x) ifelse(grepl("^\\s*$", x), NA, x))

In [19]:
# COUNT THE NUMBER OF MISSING VALUES IN EACH COLUMN; none
missing_values_num <- colSums(is.na(new_df))
print(missing_values_num)

      district_id              name            region        population 
                0                 0                 0                 0 
       num_cities       urban_ratio        avg_salary entrepreneur_1000 
                0                 0                 0                 0 
          pop_500      pop_500_1999     pop_2000_9999         pop_10000 
                0                 0                 0                 0 
unemployment_rate   commited_crimes 
                0                 0 


In [20]:
# TYPE CAST THE MATRIX TO DATA FRAME
new_df <- as.data.frame(new_df)

In [21]:
# SPLIT THE STRING IN 'UNEMPLOYMENT_RATE' COLUMN AND PUT IT IN A NEW COLUMN
split_data <- strsplit(new_df$unemployment_rate, ",")
new_df$unemployment_rate_95 <- sapply(split_data, "[", 1)
new_df$unemployment_rate_96 <- sapply(split_data, "[", 2)

# DROP THE OLD COLUMN
new_df <- subset(new_df, select = -c(unemployment_rate))

# CLEAN THE DATA IN THE NEW COLUMN
new_df$unemployment_rate_95 <- gsub("\\[", "", new_df$unemployment_rate_95)
new_df$unemployment_rate_96 <- gsub("\\]", "", new_df$unemployment_rate_96)

new_df

Unnamed: 0_level_0,district_id,name,region,population,num_cities,urban_ratio,avg_salary,entrepreneur_1000,pop_500,pop_500_1999,pop_2000_9999,pop_10000,commited_crimes,unemployment_rate_95,unemployment_rate_96
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,Hl.m. Praha,Prague,1204953,1,100.0,12541,167,0,0,0,1,"[85677,99107]",0.2,0.43
2,2,Benesov,central Bohemia,88884,5,46.7,8507,132,80,26,6,2,"[2159,2674]",1.6,1.85
3,3,Beroun,central Bohemia,75232,5,41.7,8980,111,55,26,4,1,"[2824,2813]",1.9,2.21
4,4,Kladno,central Bohemia,149893,6,67.4,9753,109,63,29,6,2,"[5244,5892]",4.6,5.05
5,5,Kolin,central Bohemia,95616,6,51.4,9307,118,65,30,4,1,"[2616,3040]",3.8,4.43
6,6,Kutna Hora,central Bohemia,77963,4,51.5,8546,126,60,23,4,2,"[2640,3120]",2.9,4.02
7,7,Melnik,central Bohemia,94725,6,63.4,9920,130,38,28,1,3,"[4289,4846]",2.2,2.87
8,8,Mlada Boleslav,central Bohemia,112065,8,69.4,11277,127,95,19,7,1,"[5179,4987]",1.2,1.44
9,9,Nymburk,central Bohemia,81344,6,55.3,8899,149,61,23,4,2,"[2987,2487]",3.3,3.97
10,10,Praha - vychod,central Bohemia,92084,5,46.7,10124,141,55,29,4,3,"[3810,4316]",0.5,0.54


In [22]:
# SPLIT THE STRING IN 'UNEMPLOYMENT_RATE' COLUMN AND PUT IT IN A NEW COLUMN
split_data <- strsplit(new_df$commited_crimes, ",")
new_df$commited_crimes_95 <- sapply(split_data, "[", 1)
new_df$commited_crimes_96 <- sapply(split_data, "[", 2)

# DROP THE OLD COLUMN
new_df <- subset(new_df, select = -c(commited_crimes))

# CLEAN THE DATA IN THE NEW COLUMN
new_df$commited_crimes_95 <- gsub("\\[", "", new_df$commited_crimes_95)
new_df$commited_crimes_96 <- gsub("\\]", "", new_df$commited_crimes_96)

new_df

Unnamed: 0_level_0,district_id,name,region,population,num_cities,urban_ratio,avg_salary,entrepreneur_1000,pop_500,pop_500_1999,pop_2000_9999,pop_10000,unemployment_rate_95,unemployment_rate_96,commited_crimes_95,commited_crimes_96
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,Hl.m. Praha,Prague,1204953,1,100.0,12541,167,0,0,0,1,0.2,0.43,85677,99107
2,2,Benesov,central Bohemia,88884,5,46.7,8507,132,80,26,6,2,1.6,1.85,2159,2674
3,3,Beroun,central Bohemia,75232,5,41.7,8980,111,55,26,4,1,1.9,2.21,2824,2813
4,4,Kladno,central Bohemia,149893,6,67.4,9753,109,63,29,6,2,4.6,5.05,5244,5892
5,5,Kolin,central Bohemia,95616,6,51.4,9307,118,65,30,4,1,3.8,4.43,2616,3040
6,6,Kutna Hora,central Bohemia,77963,4,51.5,8546,126,60,23,4,2,2.9,4.02,2640,3120
7,7,Melnik,central Bohemia,94725,6,63.4,9920,130,38,28,1,3,2.2,2.87,4289,4846
8,8,Mlada Boleslav,central Bohemia,112065,8,69.4,11277,127,95,19,7,1,1.2,1.44,5179,4987
9,9,Nymburk,central Bohemia,81344,6,55.3,8899,149,61,23,4,2,3.3,3.97,2987,2487
10,10,Praha - vychod,central Bohemia,92084,5,46.7,10124,141,55,29,4,3,0.5,0.54,3810,4316


In [23]:
# SAVE THE DATAFRAME TO CSV FILE 
write.csv(new_df, file = 'districts_r.csv', row.names = FALSE, fileEncoding = 'UTF-8')