# INFO-F-422 -  Statistical Foundations of Machine Learning 

### Couchard Darious - __[Darius.Couchard@ulb.be](mailto:Darius.Couchard@ulb.be) - Student ID 425366__
### Donne Stefano - __[Stefano.Donne@ulb.be](mailto:Stefano.Donne@ulb.be) - Student ID 408801__
### Parent Paul - __[Paul.Parent@ulb.be](mailto:Paul.Parent@ulb.be) - Student ID 495257__


## Pump it Up: Data Mining the Water Table
####  April 29, 2021


# 2) Data Pre-Processing




In [1]:
require(tidyr)
require(plyr)
require(dplyr)
library(mltools)
library(data.table)

# Training features: one row per waterpoint
training_set <- read.csv(
    "../Data/TrainingSet/4910797b-ee55-40a7-8668-10efd5c1b960.csv",
    header = TRUE
)
dim(training_set)   # number of rows and columns
names(training_set) # variable names

# Labels associated with the training rows
training_labels <- read.csv(
    "../Data/TrainingLabel/0bf8bc6e-30d0-4c50-956a-603fc693d966.csv",
    header = TRUE
)

### TEST SET ###

# Test features, pre-processed alongside the training set in the cells below
test_set <- read.csv(
    "../Data/TestSet/702ddfc5-68cd-4d1d-a0de-f5f566f76d91.csv",
    header = TRUE
)

Loading required package: tidyr

Loading required package: plyr

Loading required package: dplyr


Attaching package: 'dplyr'


The following objects are masked from 'package:plyr':

    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union



Attaching package: 'mltools'


The following object is masked from 'package:tidyr':

    replace_na



Attaching package: 'data.table'


The following objects are masked from 'package:dplyr':

    between, first, last




## How to enhance the data set :

After analyzing the data set and assessing the relevance of each variable, the data must be standardized.

First, empty values in the table must be handled (imputation): each NaN or empty cell has to be removed or replaced. Different solutions exist:
* Mapping NaN<-0 for nominal categorical variables
* Replacing missing values by the column mean for numerical variables

Then, modifications have to be made depending on the nature of the data:
* If a column (variable) consists of continuous numerical values: standardization is applied so as to obtain a new column with a mean of 0 and a standard deviation of 1 (**longitude**)
* If a column is an ordinal categorical variable (hierarchy between categories): map each string to a numerical value (**water_quality**)
* For a nominal categorical variable: apply one-hot encoding -> create a new column (with binary values) for each category (**source_type**)
<br/>


## Methods are implemented to apply these changes :

In [2]:
# METHOD TO REASSIGN EMPTY VALUES (CATEGORICAL VARIABLES)
# Replaces every missing entry (NA or empty string "") of one column by 0.
#   column_name : name of the column to clean
#   set         : data frame containing that column
# Returns the data frame with the column cleaned.
NaN_handler_categorical <- function(column_name, set) {
    values <- set[[column_name]]
    # Build a single NA-free mask: comparing NA with "" yields NA, and a data
    # frame refuses NA in the row index of an assignment.
    missing_rows <- is.na(values) | values == ""
    set[missing_rows, column_name] <- 0
    return(set)
}


# METHOD TO REASSIGN EMPTY VALUES (NUMERICAL VARIABLES)
# Replaces the NA entries of one column by the mean of that column in a
# reference data frame.
#   column_name : name of the column to impute
#   set         : data frame whose NA values are replaced
#   reference   : data frame the mean is computed on; defaults to the global
#                 training_set so that the training and test sets are imputed
#                 with the same value
# Returns the data frame with the column imputed.
NaN_handler_num <- function(column_name, set, reference = training_set) {
    mean_col <- mean(reference[[column_name]], na.rm = TRUE)
    set[is.na(set[[column_name]]), column_name] <- mean_col
    return(set)
}


In [3]:
# STANDARDIZATION METHOD FOR VARIABLES WITH CONTINUOUS NUMERICAL VALUES
# Centers and scales one column using the mean and standard deviation of that
# column in a reference data frame.
#   column_name : name of the column to standardize
#   set         : data frame to transform
#   reference   : data frame the mean and sd are computed on; defaults to the
#                 global training_set so that the training and test sets share
#                 the same scaling
# Returns the data frame with the column standardized. The result has mean 0
# and sd 1 only when `set` holds the same values as `reference`.
Standardization <- function(column_name, set, reference = training_set) {
    reference_values <- reference[[column_name]]
    mean_col <- mean(reference_values, na.rm = TRUE) # mean of the variable
    sd_col <- sd(reference_values, na.rm = TRUE)     # standard deviation of the variable
    set[, column_name] <- (set[[column_name]] - mean_col) / sd_col
    return(set)
}

In [4]:
# HANDLING OF NOMINAL CATEGORICAL VARIABLES (ONE HOT ENCODING)
# Before using: raise the notebook IOPub data rate limit, e.g. start Jupyter with
#   jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10
#
#   column_name : name of the categorical column to encode
#   set         : data frame containing that column
# Returns the data frame where the column has been replaced by one column per
# category, holding 1 on the rows of that category and 0 elsewhere.
Nom_cat_handle <- function(column_name, set) {
    # helper column of ones that spread() distributes over the new columns
    flagged <- mutate(set, value = 1)
    spread(flagged, column_name, value, fill = 0)
}


## Drop unused variables

In [5]:
# Variables that are not used by the models
column_to_drop <- c(
    "wpt_name", "amount_tsh", "date_recorded", "gps_height", "num_private", "public_meeting", "recorded_by",
    "scheme_name", "quantity_group", "source_class", "subvillage", "waterpoint_type", "region", "district_code",
    "extraction_type", "extraction_type_group", "water_quality", "source", "payment_type", "management",
    "latitude", "longitude", "installer", "lga", "ward"
)

# TODO : add other columns to drop
# Keep only the training columns whose name is not in the drop list
kept_in_training <- !(names(training_set) %in% column_to_drop)
training_set <- training_set[, kept_in_training]

### TEST SET ###
# Same selection on the test set
kept_in_test <- !(names(test_set) %in% column_to_drop)
test_set <- test_set[, kept_in_test]

## Basin variable

In [6]:
# BASIN: simple nominal categorical variable, one-hot encoded
# TODO : apply one-hot encoding to other needed variables
training_set <- Nom_cat_handle("basin", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("basin", test_set)

## Population variable

In [7]:
# MANAGE POPULATION VARIABLE
# A population of 0 is treated as missing and replaced by the mean non-zero
# population of the waterpoint's region, computed on the training set.
# column 1 = region code, column 2 = population mean in this region
region_code_frame <- data.frame("region" = unique(training_set$region_code), "mean_pop" = NA_real_)
for (row in seq_len(nrow(region_code_frame))) {
    # population values of the training rows located in this region
    sel <- training_set[which(training_set[, "region_code"] == region_code_frame[row, "region"]), "population"]
    region_code_frame[row, "mean_pop"] <- mean(sel[sel != 0], na.rm = TRUE)
}

# TODO : some region population means are NaN, find a solution (same problem with region and district_code variables)
# Temporary workaround: regions without any non-zero population get the mean
# of the other regional means.
fallback_pop <- mean(region_code_frame[, "mean_pop"], na.rm = TRUE)
region_code_frame[is.na(region_code_frame[, "mean_pop"]), "mean_pop"] <- fallback_pop

# Replace the zero populations of the training set by their regional mean.
# The region code is looked up with match(): the rows of region_code_frame
# follow the order of unique(), so the code itself is not a valid row number.
index <- which(training_set[, "population"] == 0)
training_rows <- match(training_set[index, "region_code"], region_code_frame[, "region"])
training_set[index, "population"] <- region_code_frame[training_rows, "mean_pop"]

### TEST SET ###
test_index <- which(test_set[, "population"] == 0)
test_rows <- match(test_set[test_index, "region_code"], region_code_frame[, "region"])
test_means <- region_code_frame[test_rows, "mean_pop"]
test_means[is.na(test_means)] <- fallback_pop # region code never seen in the training set
test_set[test_index, "population"] <- test_means

# Standardization() reads its mean and sd from the global training_set, so the
# test set is scaled while training_set is imputed but not yet standardized.
test_set <- Standardization("population", test_set)
### TEST SET ###

# Now standardize the population variable of the training set
training_set <- Standardization("population", training_set)








## Permit variable

In [8]:
# MANAGE ORDINAL CATEGORICAL VARIABLE

# variable PERMIT
# remap True : 1 , False : 0, Missing "" : NA
training_set$permit <- mapvalues(training_set$permit,
          from=c("True","False",""),
          to=c(1,0,NA))
training_set$permit <- as.numeric(training_set$permit) # column data type: char to numeric

### TEST SET ###
test_set$permit <- mapvalues(test_set$permit,
          from=c("True","False",""),
          to=c(1,0,NA))
test_set$permit <- as.numeric(test_set$permit) # column data type: char to numeric
### TEST SET ###

# Replace missing values by the mean of the training column, in both sets.
# The training set is imputed first so that the scaling applied to the test
# set below uses the same mean and sd as the scaling of the training set.
training_set <- NaN_handler_num("permit", training_set)
test_set <- NaN_handler_num("permit", test_set)

# Standardize. The test set goes first because Standardization() reads its
# statistics from the global training_set, which must not be scaled yet.
test_set <- Standardization("permit", test_set)
training_set <- Standardization("permit", training_set)




## Scheme management variable

In [9]:
# SCHEME MANAGEMENT: nominal variable, one binary column per category
training_set <- Nom_cat_handle("scheme_management", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("scheme_management", test_set)
### TEST SET ###

"The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
Using compatibility `.name_repair`.


## Construction year variable

In [10]:
# Replace the construction year by the age of the waterpoint.
# Age is measured relative to the most recent construction year of the
# training set; a year of 0 means "unknown" and is imputed by the mean age.
max_year <- max(training_set$construction_year)

training_year <- training_set$construction_year
training_year[training_year %in% 0] <- NA      # 0 encodes a missing year
training_age <- max_year - training_year
mean_age <- mean(training_age, na.rm = TRUE)   # mean age of the known years
training_age[is.na(training_age)] <- mean_age
training_set$construction_year <- training_age

### TEST SET ###
# Same transformation, reusing max_year and mean_age from the training set
test_year <- test_set$construction_year
test_year[test_year %in% 0] <- NA
test_age <- max_year - test_year
test_age[is.na(test_age)] <- mean_age
test_set$construction_year <- test_age
# Standardize with the training statistics; must run before training_set is
# standardized because Standardization() reads the global training_set
test_set <- Standardization("construction_year", test_set)
### TEST SET ###

# Standardize the training ages
training_set <- Standardization("construction_year", training_set)

# Rename the column to age in both sets. The names must be replaced on the
# data frame itself: names(df["construction_year"]) <- "age" only renames a
# temporary copy of the column and leaves the data frame unchanged.
names(test_set)[names(test_set) == "construction_year"] <- "age"
names(training_set)[names(training_set) == "construction_year"] <- "age"

## Extraction type class variable

In [11]:
# EXTRACTION TYPE CLASS: nominal variable, one binary column per category
training_set <- Nom_cat_handle("extraction_type_class", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("extraction_type_class", test_set)
### TEST SET ###

## Management group variable

In [12]:
# MANAGEMENT GROUP: nominal variable, one binary column per category
training_set <- Nom_cat_handle("management_group", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("management_group", test_set)
### TEST SET ###

## Payment variable

In [13]:
# PAYMENT: nominal variable, one binary column per category
training_set <- Nom_cat_handle("payment", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("payment", test_set)
### TEST SET ###

## Water quality group variable

In [14]:
# Integer encoding of the water quality group.
# Each label is mapped to the code at the same position; "unknown" becomes NA
# and is later imputed by the mean code of the training set.
quality_labels <- c("milky", "good", "salty", "colored", "unknown", "fluoride")
quality_codes <- c(2, 3, 0, 1, NA, 4)

training_set$quality_group <- as.integer(
    mapvalues(training_set$quality_group, from = quality_labels, to = quality_codes)
)

# mean code over the known training values
quality_mean <- mean(training_set$quality_group, na.rm = TRUE)

training_set$quality_group <- mapvalues(training_set$quality_group, from = NA, to = quality_mean)

### TEST SET ###
# same encoding, imputed with the training mean
test_set$quality_group <- as.integer(
    mapvalues(test_set$quality_group, from = quality_labels, to = quality_codes)
)
test_set$quality_group <- mapvalues(test_set$quality_group, from = NA, to = quality_mean)
# scaled before the training set, while training_set still holds unscaled codes
test_set <- Standardization("quality_group", test_set)
### TEST SET ###

training_set <- Standardization("quality_group", training_set)


## Water quantity variable

In [15]:
# Integer encoding of the water quantity:
# dry = 0, insufficient = 1, seasonal = 2, enough = 3, unknown = NA
training_set$quantity <- mapvalues(training_set$quantity,
                                    from=c("enough", "insufficient", "dry", "seasonal", "unknown"),
                                    to=c(3, 1, 0, 2, NA))
training_set$quantity <- as.integer(training_set$quantity)

# mean code over the known training values
quantity_mean <- mean(training_set$quantity, na.rm = TRUE)

# replace the unknown (NA) training values by the mean
training_set$quantity[is.na(training_set$quantity)] <- quantity_mean


### TEST SET ###
test_set$quantity <- mapvalues(test_set$quantity,
                                    from=c("enough", "insufficient", "dry", "seasonal", "unknown"),
                                    to=c(3, 1, 0, 2, NA))
test_set$quantity <- as.integer(test_set$quantity)
# replace the unknown (NA) test values by the training mean; this imputation
# must target test_set, otherwise its NA values survive into the output file
test_set$quantity[is.na(test_set$quantity)] <- quantity_mean
# scaled before the training set, while training_set still holds unscaled codes
test_set <- Standardization("quantity", test_set)
### TEST SET ###


# Standardize data
training_set <- Standardization("quantity", training_set)

The following `from` values were not present in `x`: NA



## Source type variable

In [16]:
# SOURCE TYPE: nominal variable, one binary column per category
training_set <- Nom_cat_handle("source_type", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("source_type", test_set)
### TEST SET ###

## Water point type variable

In [17]:
# WATERPOINT TYPE GROUP: nominal variable, one binary column per category
training_set <- Nom_cat_handle("waterpoint_type_group", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("waterpoint_type_group", test_set)
### TEST SET ###

## Funder variable

In [18]:
# Replace missing funders ("" and the placeholder value "0") by "other_funder".
# The patterns are anchored so that only a funder that IS "0" is replaced, not
# the first "0" found inside a longer funder name.
training_set$funder <- sub("^$", "other_funder", training_set$funder)
training_set$funder <- sub("^0$", "other_funder", training_set$funder)

funder_occurency <- as.data.frame(table(training_set[, "funder"])) # number of occurrences of each funder
funder_occurency <- arrange(funder_occurency, Freq) # sorted in ascending frequency order
write.csv(funder_occurency, "../Data/PreProcess/funder_occ.csv") # stores occurrences for later test_set pre-processing
# thresh1 is the occurrence count found 9/10 of the way through the ascending
# list of funders
thresh1 <- funder_occurency[as.integer(nrow(funder_occurency) * 9 / 10), 2]

# Every named funder is now assigned to a category:
# - "big"   : strictly more than thresh1 occurrences
# - "small" : at most thresh1 occurrences
# Rows equal to "other_funder" keep that value.
training_funder <- training_set$funder
training_count <- funder_occurency[match(training_funder, funder_occurency[, 1]), 2]
training_named <- training_funder != "other_funder"
training_set$funder[training_named] <- ifelse(training_count[training_named] > thresh1, "big", "small")

# One hot encode
training_set <- Nom_cat_handle("funder", training_set)



### TEST SET ###
# Replace missing funders ("" and "0") by "other_funder"
test_set$funder <- sub("^$", "other_funder", test_set$funder)
test_set$funder <- sub("^0$", "other_funder", test_set$funder)

# A test funder is categorized on its occurrences in the training set plus its
# occurrences in the test set (test occurrences only when it is unknown to the
# training set). All counts are taken once, before any row is relabelled, so
# the category of a funder does not depend on the order of the rows.
test_funder <- test_set$funder
funder_key <- match(test_funder, unique(test_funder))
test_count <- tabulate(funder_key)[funder_key]
known_count <- funder_occurency[match(test_funder, funder_occurency[, 1]), 2]
known_count[is.na(known_count)] <- 0 # funder that wasn't in the training set
total_count <- known_count + test_count
test_named <- test_funder != "other_funder"
test_set$funder[test_named] <- ifelse(total_count[test_named] > thresh1, "big", "small")

# One hot encode
test_set <- Nom_cat_handle("funder", test_set)
print("nn")

[1] "nn"


## Installer variable

In [19]:
# Replace missing values and 0 by "others"
#training_set$installer <- sub("^$", "others", training_set$installer)
#training_set$installer <- sub("0", "others", training_set$installer)

# One hot encode
#training_set <- Nom_cat_handle("installer")

## Ward variable

In [20]:
# One hot encode
#training_set <- Nom_cat_handle("ward")

## Region code variable

In [21]:
# REGION CODE: treated as a nominal variable, one binary column per code
training_set <- Nom_cat_handle("region_code", training_set)

### TEST SET ###
# same encoding on the test set
test_set <- Nom_cat_handle("region_code", test_set)
### TEST SET ###

In [22]:
dim(training_set)
dim(test_set)
# Categories absent from the test set were never one-hot encoded there, so the
# matching columns are missing: add each of them to the test set, filled with 0.
absent_columns <- setdiff(names(training_set), names(test_set))
for (absent in absent_columns) {
    print(absent)
    test_set[, absent] <- 0
}
dim(training_set)
dim(test_set)


[1] "None"
[1] "40"


## Write preprocessed data

In [23]:
# Write the pre-processed data into new CSV files.
# The id column of the training set is overwritten with the label of each row
# (done for practicality purposes with the random forest library, comment for
# other methods if needed). Labels are looked up by id with match(), so the
# result does not depend on both tables being in the same row order.
label_rows <- match(training_set[, "id"], training_labels$id)
if (anyNA(label_rows)) {
    stop("some training rows have no matching id in training_labels", call. = FALSE)
}
training_set[, "id"] <- training_labels[label_rows, 2]
write.csv(training_set, "../Data/PreProcess/processed_training_data.csv")
write.csv(test_set, "../Data/PreProcess/test_data.csv")