# Load packages

In [101]:
installed.packages("janitor")
install.packages("party")

library(tidyverse) # metapackage of all tidyverse packages
library(magrittr)
library(janitor)
library(readr)
library(stringdist)
library(party) # RandomForest
library(tidymodels)  # train_split
library(calibrate)  # ones
library(randomForest)
library(mltools)
library(nnet)
library(neuralnet)

Package,LibPath,Version,Priority,Depends,Imports,LinkingTo,Suggests,Enhances,License,License_is_FOSS,License_restricts_use,OS_type,Archs,MD5sum,NeedsCompilation,Built


Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)



# Load the data

In [102]:
path <- "/kaggle/input/chocolate-bar-ratings/flavors_of_cacao.csv"
df <- read_csv(file=path, locale=locale(encoding="latin1"))

[1mRows: [22m[34m1795[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (6): CompanyÂ 
(Maker-if known), Specific Bean Origin
or Bar Name, Cocoa...
[32mdbl[39m (3): REF, Review
Date, Rating

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


# Subfunctions

In [103]:
mycol_renom <- function(df, collist){
    n <- length(colnames(df))
    
    # If you do not know the names just give it an empty list so it will be named by number
    # collist <- list()
    if (length(collist) == 0){
        collist = c(1:n)
    }
    for(i in 1:n){
        colnames(df)[i] = collist[i]
    }
    return(df)
}

In [104]:
remove_undes_dfrows <- function(df, df_col, undes_char_list){
    df_out <- data.frame()

    for (j in 1:length((rownames(df))))
    {
        c <- list()
        for (i in 1:length(undes_char_list))
        {
            wd <- paste(undes_char_list[i])
            if(df_col[j] == wd){ # exact
            #if(any(str_detect(df_num$bean_type[j], wd)) == TRUE){ # it could not remove all matches
                c <- append(c, TRUE)
            }
            else{
                c <- append(c, FALSE)
            }
        }
        if(any(c) == FALSE)
        {
            df_out <- rbind(df_out, df[j,])
        }
    }
    return(df_out)
}

In [105]:
dfcol_2_list <- function(df_col){
    c <- list()
    for (i in 1:length(df_col)){
        c <- append(c, df_col[i])
    }
    return(c)
}

In [106]:
lfind <- function(word, char){
    n <- nchar(char)  # length of string
    nw <- nchar(word)
    num_of_checks <- n*floor(nw/n) #(length of char)*(number of times you slide char)

    ind <- list()
    for (i in 1:num_of_checks){
        st <- i
        end <- i+n-1
        if (substr(word, st, end) == char){
            ind <- append(ind, i)
        }
    }
    return(ind)
}

In [107]:
scaled_data_standardization <- function(feat){
        return((feat - mean(feat))/sd(feat))
}

In [108]:
handmade_accuracy <- function(y_org, y_pred_org){
    
    n <- length(y_org)
    n_class <- length(unique(y_org))

    if (n_class == 2){
        # Binary class classification
        mydict <- get_metrics(y_org, y_pred_org)
    } else {
        # Multi-class classification
        # Re code y_org, y_pred_org as ALL OR ONE
        y <- ones(n, p=1)  # always 1 because it is the truth
        y_pred <- list()
        for (i in 1:n) {
            if (y_org[i] == y_pred_org[i]) {
                print(y_pred)
                y_pred <- append(y_pred, 1)
            } else{
                y_pred <- append(y_pred, 0)
            }
        }
        mydict <- get_metrics(y, y_pred)
    }
    return(mydict)
}

In [109]:
get_metrics <- function(y_train, y_train_pred){
    TP_cnt <- 0
    FN_cnt <- 0
    FP_cnt <- 0
    TN_cnt <- 0
    
    n <- length(y_train)
    
    for (i in 1:n){
        if (y_train[i] == 1 & y_train_pred[i] == 1) {
            TP_cnt <- TP_cnt + 1
        } else if (y_train[i] == 0 & y_train_pred[i] == 1) {
            FN_cnt <- FN_cnt + 1
        } else if (y_train[i] == 1 & y_train_pred[i] == 0) {
            FP_cnt <- FP_cnt + 1
        } else {
            TN_cnt <- TN_cnt + 1
        }
    }
    
    mydict = {}
    mydict['precision'] <- TP_cnt/(TP_cnt + FP_cnt)
    mydict['recall'] <- TP_cnt/(TP_cnt + FN_cnt)
    mydict['acc'] <- (TP_cnt + TN_cnt)/(TP_cnt + FN_cnt + FP_cnt + TN_cnt)
    
    return(mydict)
}

In [110]:
model_pred_accuracy_NN <- function(model, data, fst, fend, labnum){
    
    X <- data[,fst:fend]
    y_actual < data[,labnum]
    
    # ----------------------------------
    # Extract results: prediction probabilities
    # ----------------------------------
    predictions_model <- compute(model, data)
    predictions_model1 <- predictions_model$net.result
    # ----------------------------------
    
    # ----------------------------------
    # Verify predictions
    y_pred <- max.col(predictions_model1)
    # ----------------------------------
    
    # ----------------------------------
    # Put predictions in a DataFrame
    ydata <- data.frame(y_actual, y_pred, round(y_pred))
    collist <- c("y_actual", "y_pred", "y_pred_int")
    ydata <- mycol_renom(ydata, collist)

    ydata <- ydata %>%
        transform(y_actual = as.integer(y_actual), y_pred_int = as.integer(y_pred_int))
    
    # Prepare a list : the functions accuracy and handmade_accuracy need a list
    y_actual <- ydata$y_actual
    y_pred_int <- ydata$y_pred_int
    # ----------------------------------
    
    # ----------------------------------
    # Accuracy calculation
    print('Accuracy: (TP+TN)/(TP+TN+FP+FN)')
    print(mean(y_pred == y_actual))
    
    # Handmande function
    print('Accuracy: (TP+TN)/(TP+TN+FP+FN)')
    mydict <- handmade_accuracy(y_actual, y_pred_int)
    # ----------------------------------
    return(mydict)
}

In [111]:
model_pred_accuracy_RF <- function(model, data, fst, fend, labnum){
    
    X <- data[,fst:fend]
    y_actual < data[,labnum]
    
    # ----------------------------------
    # Extract results: prediction probabilities
    # ----------------------------------
    # Works for RandomForest classification, NOT regression
    # predictions_model1 <- predict(model, data, "prob")
    # ----------------------------------
    
    # ----------------------------------
    # Verify predictions
    y_pred <- predict(model, X)
    # ----------------------------------
    
    # ----------------------------------
    # Put predictions in a DataFrame
    ydata <- data.frame(y_actual, y_pred, round(y_pred))
    collist <- c("y_actual", "y_pred", "y_pred_int")
    ydata <- mycol_renom(ydata, collist)

    ydata <- ydata %>%
        transform(y_actual = as.integer(y_actual), y_pred_int = as.integer(y_pred_int))
    
    # Prepare a list : the functions accuracy and handmade_accuracy need a list
    y_actual <- ydata$y_actual
    y_pred_int <- ydata$y_pred_int
    # ----------------------------------
    
    # ----------------------------------
    # Accuracy calculation
    print('Accuracy: (TP+TN)/(TP+TN+FP+FN)')
    print(mean(y_pred == y_actual))
    # Handmande function
    print('Accuracy: (TP+TN)/(TP+TN+FP+FN)')
    mydict <- handmade_accuracy(y_actual, y_pred_int)
    # ----------------------------------
    return(mydict)
}

# Pre-processing

## Search for undesired characters in bean_type

In [112]:
# Look at existing column names
print('Existing column names:')
print(colnames(df))

print('------------------------------------------------------')
collist <- c("company_name", "origin_bar", "ref", "review_date", "cocoa_per", "company_loc", "rating", "bean_type","bean_origin")
df <- mycol_renom(df, collist)

print('------------------------------------------------------')
print('Changed column names:')
print(colnames(df))

[1] "Existing column names:"
[1] "CompanyÂ \n(Maker-if known)"       "Specific Bean Origin\nor Bar Name"
[3] "REF"                               "Review\nDate"                     
[5] "Cocoa\nPercent"                    "Company\nLocation"                
[7] "Rating"                            "Bean\nType"                       
[9] "Broad Bean\nOrigin"               
[1] "------------------------------------------------------"
[1] "------------------------------------------------------"
[1] "Changed column names:"
[1] "company_name" "origin_bar"   "ref"          "review_date"  "cocoa_per"   
[6] "company_loc"  "rating"       "bean_type"    "bean_origin" 


In [113]:
df_num <- drop_na(df)
cocoa_per_int <- data.frame(as.double(gsub('%','',df_num$cocoa_per)))
#cocoa_per_int <- as.numeric(cocoa_per_int)

# Select numeric columns first, then categorical columns
# ref(3), cocoa_per_int(5), rating(7), bean_type(8), company_name(1), origin_bar(2), company_loc(6), bean_origin(9)
df_num <- data.frame(df_num[,3], cocoa_per_int, df_num[,7], df_num[,8], df_num[,1], df_num[,2], df_num[,6], df_num[,9])

# Rename columns
collist <- c("ref", "cocoa_per_int", "rating", "bean_type", "company_name", "origin_bar", "company_loc", "bean_origin")
df_num <- mycol_renom(df_num, collist)

In [114]:
head(df_num)

Unnamed: 0_level_0,ref,cocoa_per_int,rating,bean_type,company_name,origin_bar,company_loc,bean_origin
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1876,63,3.75,Â,A. Morin,Agua Grande,France,Sao Tome
2,1676,70,2.75,Â,A. Morin,Kpime,France,Togo
3,1676,70,3.0,Â,A. Morin,Atsane,France,Togo
4,1680,70,3.5,Â,A. Morin,Akata,France,Togo
5,1704,70,3.5,Â,A. Morin,Quilla,France,Peru
6,1315,70,2.75,Criollo,A. Morin,Carenero,France,Venezuela


In [115]:
a0 <- count(df_num, bean_type, sort = TRUE)
print('a0:')
print(a0)

[1] "a0:"
                  bean_type   n
1                        Â  887
2                Trinitario 418
3                   Criollo 153
4                 Forastero  87
5      Forastero (Nacional)  52
6                     Blend  41
7       Criollo, Trinitario  39
8        Forastero (Arriba)  37
9       Criollo (Porcelana)  10
10      Trinitario, Criollo   9
11    Forastero (Parazinho)   8
12   Forastero (Arriba) ASS   6
13                  Beniano   3
14                      EET   3
15                   Matina   3
16        Nacional (Arriba)   3
17               Amazon mix   2
18              Amazon, ICS   2
19         Criollo (Amarru)   2
20     Criollo (Ocumare 61)   2
21       Criollo, Forastero   2
22      Forastero (Catongo)   2
23                 Nacional   2
24 Trinitario (85% Criollo)   2
25    Trinitario, Forastero   2
26                   Amazon   1
27  Blend-Forastero,Criollo   1
28                    CCN51   1
29     Criollo (Ocumare 67)   1
30     Criollo (Ocumare 77)   

In [116]:
# Columns that we want to remove: too low and high counts
undes_char <- filter(a0, n < 10 | n > 450)
undes_char_list <- undes_char$bean_type
print('undes_char_list: ')
print(undes_char_list)

[1] "undes_char_list: "
 [1] "Â "                       "Trinitario, Criollo"     
 [3] "Forastero (Parazinho)"    "Forastero (Arriba) ASS"  
 [5] "Beniano"                  "EET"                     
 [7] "Matina"                   "Nacional (Arriba)"       
 [9] "Amazon mix"               "Amazon, ICS"             
[11] "Criollo (Amarru)"         "Criollo (Ocumare 61)"    
[13] "Criollo, Forastero"       "Forastero (Catongo)"     
[15] "Nacional"                 "Trinitario (85% Criollo)"
[17] "Trinitario, Forastero"    "Amazon"                  
[19] "Blend-Forastero,Criollo"  "CCN51"                   
[21] "Criollo (Ocumare 67)"     "Criollo (Ocumare 77)"    
[23] "Criollo (Ocumare)"        "Criollo (Wild)"          
[25] "Criollo, +"               "Forastero (Amelonado)"   
[27] "Forastero (Arriba) ASSS"  "Forastero, Trinitario"   
[29] "Forastero(Arriba, CCN)"   "Trinitario (Amelonado)"  
[31] "Trinitario (Scavina)"     "Trinitario, Nacional"    
[33] "Trinitario, TCGA"        


## Remove rows with undesired bean_type names 

In [117]:
df_num2 <- remove_undes_dfrows(df_num, df_num$bean_type, undes_char_list)

“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' to logical”
“coercing argument of type 'list' 

## Reduce bean_types that have similar names

In [118]:
head(df_num2)

Unnamed: 0_level_0,ref,cocoa_per_int,rating,bean_type,company_name,origin_bar,company_loc,bean_origin
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>
6,1315,70,2.75,Criollo,A. Morin,Carenero,France,Venezuela
8,1315,70,3.5,Criollo,A. Morin,Sur del Lago,France,Venezuela
9,1319,70,3.75,Criollo,A. Morin,Puerto Cabello,France,Venezuela
12,1011,70,3.0,Criollo,A. Morin,Madagascar,France,Madagascar
18,1015,70,4.0,Trinitario,A. Morin,Chuao,France,Venezuela
25,1470,70,3.75,Criollo,Acalli,"Tumbes, Norandino",U.S.A.,Peru


In [119]:
a1 <- count(df_num2, bean_type, sort = TRUE)
unique_words <- dfcol_2_list(a1$bean_type)

#collist <- list()
#df_new2 <- data.frame()
for (i in 1:length(unique_words)){
    #row_vec <- list()

    word <- toString(unique_words[i])

    for (j in 1:length(unique_words)){
        #row_vec <- append(row_vec, j)
        char0 <- toString(unique_words[j])
        # OU
        # look for half of the begining words; to catch similar words
        char <- substr(char0, 1, floor(nchar(char0)/2))

        # word char match
        match_ind <- lfind(word, char)

        # Was there a match found?
        if (length(match_ind) == 0){
            bool_match <- 0
        }
        else{
            bool_match <- 1
            if (word != char0)
            {
                # determine which word should be replaced by the other 
                k <- which.min(c(nchar(word), nchar(char0)))
                if (k == 1){
                    # replace short_nom_list[j] with short_nom_list[i]
                    word2replace <- word
                    replacewith <- char0
                }
                else{
                    word2replace <- char0
                    replacewith <- word
                }
                df_num2 <- df_num2 %>%
                    mutate(bean_type=replace(bean_type, bean_type==replacewith, word2replace))
            }
        }
        #row_vec <- append(row_vec, bool_match)
    }
    #n <- length(row_vec)
    #mat <- matrix(row_vec, nrow = 1, ncol = n, byrow = 1)
    #df_temp <- as.data.frame(mat)
    #mycol_renom(df_temp, collist) # define row names
    #df_new2 <- rbind(df_new2, df_temp)
}
#mycol_renom(df_new2, collist)

In [120]:
unique(df_num2$bean_type)

## Transform categorical features to numerical values

In [121]:
df_num3 <- df_num2 %>%
    mutate(bean_type_num=unclass(as.factor(bean_type)), company_name_num=unclass(as.factor(company_name)), origin_bar_num=unclass(as.factor(origin_bar)), company_loc_num=unclass(as.factor(company_loc)), bean_origin_num=unclass(as.factor(bean_origin)))


In [122]:
print(length(rownames(df_num3)))
head(df_num3)

[1] 837


Unnamed: 0_level_0,ref,cocoa_per_int,rating,bean_type,company_name,origin_bar,company_loc,bean_origin,bean_type_num,company_name_num,origin_bar_num,company_loc_num,bean_origin_num
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>
6,1315,70,2.75,Criollo,A. Morin,Carenero,France,Venezuela,2,1,78,15,59
8,1315,70,3.5,Criollo,A. Morin,Sur del Lago,France,Venezuela,2,1,440,15,59
9,1319,70,3.75,Criollo,A. Morin,Puerto Cabello,France,Venezuela,2,1,376,15,59
12,1011,70,3.0,Criollo,A. Morin,Madagascar,France,Madagascar,2,1,253,15,32
18,1015,70,4.0,Trinitario,A. Morin,Chuao,France,Venezuela,4,1,93,15,59
25,1470,70,3.75,Criollo,Acalli,"Tumbes, Norandino",U.S.A.,Peru,2,2,479,46,38


# CLASSIFICATION in R!! 

## Can we predict bean type using cocoa_per, REF, Rating?

In [123]:
# X matrix ONLY because we need to scale X below
df_numerical <- data.frame(df_num3[,1:3])
# ref(1),  cocoa_per_int(2), rating(3)

# Rename columns
collist <- c("ref", "cocoa_per_int", "rating")
df_numerical <- mycol_renom(df_numerical, collist)

fst <- 1
fend <- 3
labnum <- 4

In [124]:
df_numerical <- data.frame(df_num3[,1:3], df_num3[,10:13])
# ref(1),  cocoa_per_int(2), rating(3), 
# company_name_num(10), origin_bar_num(11), company_loc_num(12), bean_origin_num(13)

# Rename columns
collist <- c("ref", "cocoa_per_int", "rating", "company_name_num", "origin_bar_num", "company_loc_num", "bean_origin_num")
df_numerical <- mycol_renom(df_numerical, collist)

fst <- 1
fend <- 7
labnum <- 8

In [125]:
#df_numerical <- as.double(df_numerical)
df_numerical <- as.data.frame(lapply(df_numerical, as.double))

print(length(rownames(df_numerical)))
head(df_numerical)

[1] 837


Unnamed: 0_level_0,ref,cocoa_per_int,rating,company_name_num,origin_bar_num,company_loc_num,bean_origin_num
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,1315,70,2.75,1,78,15,59
2,1315,70,3.5,1,440,15,59
3,1319,70,3.75,1,376,15,59
4,1011,70,3.0,1,253,15,32
5,1015,70,4.0,1,93,15,59
6,1470,70,3.75,2,479,46,38


## Normalize/scale data

In [126]:
# df_numerical2 <- as.data.frame(lapply(df_numerical, scaled_data_standardization))
# OR
df_numerical2 <- map_df(df_numerical,.f=scaled_data_standardization)

print(length(rownames(df_numerical2)))
head(df_numerical2)

[1] 837


ref,cocoa_per_int,rating,company_name_num,origin_bar_num,company_loc_num,bean_origin_num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.6948652,-0.3178854,-1.0100499,-1.724183,-1.240516349,-0.8937676,1.32664362
0.6948652,-0.3178854,0.5921202,-1.724183,1.313063038,-0.8937676,1.32664362
0.701969,-0.3178854,1.1261769,-1.724183,0.861601489,-0.8937676,1.32664362
0.1549716,-0.3178854,-0.4759932,-1.724183,-0.006051176,-0.8937676,-0.08601336
0.1620754,-0.3178854,1.6602336,-1.724183,-1.134705048,-0.8937676,1.32664362
0.9701399,-0.3178854,1.1261769,-1.711326,1.588172419,0.9574807,0.22791041


### Unfornately column cocoa_per_int always becomes NA after scaling the data, I tried many transformation to try to keep it as a number. Despite the fact that I think it is an important column for prediction of bean_type, I dropped it to continue with the analysis.

In [127]:
df_numerical2 <- df_numerical2[,-2]

print(length(rownames(df_numerical2)))
head(df_numerical2)

[1] 837


ref,rating,company_name_num,origin_bar_num,company_loc_num,bean_origin_num
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.6948652,-1.0100499,-1.724183,-1.240516349,-0.8937676,1.32664362
0.6948652,0.5921202,-1.724183,1.313063038,-0.8937676,1.32664362
0.701969,1.1261769,-1.724183,0.861601489,-0.8937676,1.32664362
0.1549716,-0.4759932,-1.724183,-0.006051176,-0.8937676,-0.08601336
0.1620754,1.6602336,-1.724183,-1.134705048,-0.8937676,1.32664362
0.9701399,1.1261769,-1.711326,1.588172419,0.9574807,0.22791041


## Add y to the scaled X matrix

In [128]:
df_numerical3 <- data.frame(df_numerical2, df_num3[,9])
df_numerical3 <- drop_na(df_numerical3)

# Rename columns
collist <- c("ref", "rating", "company_name_num", "origin_bar_num", "company_loc_num", "bean_origin_num", "bean_type_num")
df_numerical3 <- mycol_renom(df_numerical3, collist)

print(length(rownames(df_numerical3)))
head(df_numerical3)

[1] 837


Unnamed: 0_level_0,ref,rating,company_name_num,origin_bar_num,company_loc_num,bean_origin_num,bean_type_num
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
1,0.6948652,-1.0100499,-1.724183,-1.240516349,-0.8937676,1.32664362,2
2,0.6948652,0.5921202,-1.724183,1.313063038,-0.8937676,1.32664362,2
3,0.701969,1.1261769,-1.724183,0.861601489,-0.8937676,1.32664362,2
4,0.1549716,-0.4759932,-1.724183,-0.006051176,-0.8937676,-0.08601336,2
5,0.1620754,1.6602336,-1.724183,-1.134705048,-0.8937676,1.32664362,4
6,0.9701399,1.1261769,-1.711326,1.588172419,0.9574807,0.22791041,2


## Split into train and test data set

### Since the pipe (%>%) character stopped functioning, we can not use the select() function. Thus, we need to select each column in the DataFrame by number; I use the variables fst, fend, and labnum.

In [129]:
matrix_split <- initial_split(df_numerical3, prop = 3/4,)

train_data <- training(matrix_split)
test_data <- testing(matrix_split)

fst <- 1
fend <- 6
labnum <- 7

which_way = 0
if (which_way == 0){
    X_train <- train_data[,fst:fend]
    y_train <- data.frame(train_data[,labnum])
    collist <- c("bean_type_num")
    y_train <- mycol_renom(y_train, collist)
    
    X_test <- test_data[,fst:fend]
    y_test <- data.frame(test_data[,labnum])
    y_test <- mycol_renom(y_test, collist)
} else {
    X_train <- train_data %>% select(ref, cocoa_per_int, rating)
    y_train <- train_data %>% select(bean_type_num)
    X_test <- test_data %>% select(ref, cocoa_per_int, rating)
    y_test <- test_data %>% select(bean_type_num)
}

In [130]:
print(c(length(rownames(X_train)), length(colnames(X_train))))
print(c(length(rownames(y_train)), length(colnames(y_train))))
print(c(length(rownames(X_test)), length(colnames(X_test))))
print(c(length(rownames(y_test)), length(colnames(y_test))))

print(any(is.na(X_train)))
print(any(is.na(y_train)))
print(any(is.na(X_test)))
print(any(is.na(y_test)))

print(colnames(X_train))
print(colnames(y_train))
print(colnames(X_test))
print(colnames(y_test))

[1] 627   6
[1] 627   1
[1] 210   6
[1] 210   1
[1] FALSE
[1] FALSE
[1] FALSE
[1] FALSE
[1] "ref"              "rating"           "company_name_num" "origin_bar_num"  
[5] "company_loc_num"  "bean_origin_num" 
[1] "bean_type_num"
[1] "ref"              "rating"           "company_name_num" "origin_bar_num"  
[5] "company_loc_num"  "bean_origin_num" 
[1] "bean_type_num"


## One-hot nototion (optional)

In [131]:
y_train_onehot <- mutate(y_train, btn1 = ifelse(bean_type_num==1, 1, 0))
y_train_onehot <- mutate(y_train_onehot, btn2 = ifelse(bean_type_num==2, 1, 0))
y_train_onehot <- mutate(y_train_onehot, btn3 = ifelse(bean_type_num==3, 1, 0))
y_train_onehot <- mutate(y_train_onehot, btn4 = ifelse(bean_type_num==4, 1, 0))
y_train_onehot <- y_train_onehot[2:5]  # one-hot

In [132]:
head(y_train_onehot)

Unnamed: 0_level_0,btn1,btn2,btn3,btn4
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,0,0,0,1
2,0,0,1,0
3,0,0,0,1
4,0,0,0,1
5,0,0,0,1
6,0,0,0,1


In [133]:
y_test_onehot <- mutate(y_test, btn1 = ifelse(bean_type_num==1, 1, 0))
y_test_onehot <- mutate(y_test_onehot, btn2 = ifelse(bean_type_num==2, 1, 0))
y_test_onehot <- mutate(y_test_onehot, btn3 = ifelse(bean_type_num==3, 1, 0))
y_test_onehot <- mutate(y_test_onehot, btn4 = ifelse(bean_type_num==4, 1, 0))
y_test_onehot <- y_test_onehot[2:5]  # one-hot

In [134]:
head(y_test_onehot)

Unnamed: 0_level_0,btn1,btn2,btn3,btn4
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,0,1,0,0
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
5,0,0,0,1
6,0,0,0,1


In [135]:
print(c(length(rownames(X_train)), length(colnames(X_train))))
print(c(length(rownames(y_train_onehot)), length(colnames(y_train_onehot))))
print(c(length(rownames(X_test)), length(colnames(X_test))))
print(c(length(rownames(y_test_onehot)), length(colnames(y_test_onehot))))

print(any(is.na(X_train)))
print(any(is.na(y_train_onehot)))
print(any(is.na(X_test)))
print(any(is.na(y_test_onehot)))

print(colnames(X_train))
print(colnames(y_train_onehot))
print(colnames(X_test))
print(colnames(y_test_onehot))

[1] 627   6
[1] 627   4
[1] 210   6
[1] 210   4
[1] FALSE
[1] FALSE
[1] FALSE
[1] FALSE
[1] "ref"              "rating"           "company_name_num" "origin_bar_num"  
[5] "company_loc_num"  "bean_origin_num" 
[1] "btn1" "btn2" "btn3" "btn4"
[1] "ref"              "rating"           "company_name_num" "origin_bar_num"  
[5] "company_loc_num"  "bean_origin_num" 
[1] "btn1" "btn2" "btn3" "btn4"


# Neural Network

In [136]:
# Covariates/features with response/y, but it ignores response/y column/s 
# because you specify which column/s are y
data_train <- data.frame(X_train, y_train) 
data_train_onehot <- data.frame(X_train, y_train_onehot)
data_test <- data.frame(X_test, y_test) 
data_test_onehot <- data.frame(X_test, y_test_onehot)

laquelle = 4

if (laquelle == 0){
    # Regression: nnet model: DataFrame y vector notation
    y <- y_train # response
    model <- nnet(formula=y ~ ., data=X_train, size = 30)
    # OR
    # model <- nnet(formula=y_train[,1] ~ ., data=X_train, size = 30)
    # ..........
    data <- X_train
    y_actual <- y_train
    
} else if (laquelle == 1) {
    # Classification: nnet model: String one-hot notation
    X <- c("ref","cocoa_per_int","rating")  # covariates/features
    y <- 'btn1 + btn2 + btn3 + btn4 ~ ' # response
    f <- as.formula(paste(y , paste(X, collapse='+')))
    print(f)
    model <- nnet(formula=f, data=data_train_onehot, size = 30)
    # ..........
    data <- data_train_onehot
    y_actual <- y_train_onehot
    
} else if (laquelle == 2) {
    # Classification: neuralnet model: String one-hot notation
    X <- c("ref","cocoa_per_int","rating")  # covariates/features
    y <- 'btn1 + btn2 + btn3 + btn4 ~ ' # response
    f <- as.formula(paste(y , paste(X, collapse='+')))
    print(f)
    model <- neuralnet(f, data=data_train_onehot, hidden = c(9, 3, 4),
                       act.fct = "logistic", linear.output = FALSE, lifesign = "minimal")
    # response = btn1 btn2 btn3 btn4
    # covariate = ref cocoa_per_int rating
    # data = ref cocoa_per_int rating btn1 btn2 btn3 btn4
    # linear.output = FALSE such that you can specify a cost function (ie: logistic, relu, tanh)
    # ..........
    data <- data_train_onehot
    y_actual <- y_train_onehot
    
} else if (laquelle == 3) {
    # Classification: String y vector notation
    # X <- c("ref","cocoa_per_int","rating")  # covariates/features
    X <- c("ref", "rating", "company_name_num", "origin_bar_num", "company_loc_num", "bean_origin_num")
    y <- 'bean_type_num ~ ' # response
    f <- as.formula(paste(y , paste(X, collapse='+')))
    print(f)
    model <- neuralnet(f, data=data_train, hidden = 3, learningrate=0.001,
                    act.fct = "logistic", linear.output = FALSE, lifesign = "minimal")
    # response = bean_type_num
    # covariate = ref cocoa_per_int rating
    # ..........
    data <- data_train
    y_actual <- y_train
    
} else if (laquelle == 4) {
    # Classification: DataFrame y vector notation
    y <- data_train$bean_type_num # response
    model <- neuralnet(y ~ ., data_train, hidden = 3, learningrate=0.01,
                    act.fct = "logistic", linear.output = FALSE, lifesign = "minimal")
    # response = y
    # covariate =  features (ref rating company_name_num origin_bar_num company_loc_num)
    # ..........
    data <- data_train
    y_actual <- y_train
    
} else {
    
}

# Confirm that the covariates and response are CORRECTLY assigned with the model print out
print(model)

hidden: 3    thresh: 0.01    rep: 1/1    steps: 
     39
	error: 1877.00631
	time: 0.01 secs



$call
neuralnet(formula = y ~ ., data = data_train, hidden = 3, learningrate = 0.01, 
    lifesign = "minimal", act.fct = "logistic", linear.output = FALSE)

$response
    y
523 4
567 3
250 4
493 4
708 4
718 4
336 4
334 3
303 4
761 3
755 2
475 3
518 2
700 4
378 1
582 4
99  4
533 3
369 4
318 2
137 4
245 3
359 4
294 2
685 2
497 2
690 4
136 2
455 2
505 4
73  4
450 4
175 2
280 4
573 4
503 2
366 4
551 3
721 3
343 4
536 2
226 4
252 2
751 4
135 2
491 1
745 4
829 4
444 1
775 2
363 4
24  3
460 4
442 3
308 2
49  3
168 4
710 2
732 3
376 3
794 3
681 4
42  1
2   2
181 4
758 4
669 3
205 4
274 4
201 2
385 4
820 4
103 4
805 3
562 1
153 4
783 4
656 2
699 3
489 3
141 4
352 4
756 4
355 4
50  4
815 2
647 4
190 2
189 3
214 3
818 4
206 4
333 3
635 3
12  3
235 2
348 4
283 3
667 3
658 4
295 2
648 4
498 4
282 2
811 4
577 1
10  4
666 3
709 4
781 3
434 3
636 3
41  1
109 4
277 1
389 2
249 4
16  3
126 4
224 3
195 2
237 3
257 3
58  2
395 4
451 4
91  4
329 4
430 4
512 3
397 4
580 4
196 2
155 3
305 2
341 4
69  1
531 

In [137]:
print('TRAINING accuracy:')
mydict <- model_pred_accuracy_NN(model, data, fst, fend, labnum)

print('mydict')
print(mydict)

[1] "TRAINING accuracy:"
[1] "Accuracy: (TP+TN)/(TP+TN+FP+FN)"
[1] 0.04625199
[1] "Accuracy: (TP+TN)/(TP+TN+FP+FN)"
[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

[[7]]
[1] 0

[[8]]
[1] 0

[[9]]
[1] 0

[[10]]
[1] 0

[[11]]
[1] 0

[[12]]
[1] 0

[[13]]
[1] 0

[[14]]
[1] 0

[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

[[7]]
[1] 0

[[8]]
[1] 0

[[9]]
[1] 0

[[10]]
[1] 0

[[11]]
[1] 0

[[12]]
[1] 0

[[13]]
[1] 0

[[14]]
[1] 0

[[15]]
[1] 1

[[16]]
[1] 0

[[17]]
[1] 0

[[18]]
[1] 0

[[19]]
[1] 0

[[20]]
[1] 0

[[21]]
[1] 0

[[22]]
[1] 0

[[23]]
[1] 0

[[24]]
[1] 0

[[25]]
[1] 0

[[26]]
[1] 0

[[27]]
[1] 0

[[28]]
[1] 0

[[29]]
[1] 0

[[30]]
[1] 0

[[31]]
[1] 0

[[32]]
[1] 0

[[33]]
[1] 0

[[34]]
[1] 0

[[35]]
[1] 0

[[36]]
[1] 0

[[37]]
[1] 0

[[38]]
[1] 0

[[39]]
[1] 0

[[40]]
[1] 0

[[41]]
[1] 0

[[42]]
[1] 0

[[43]]
[1] 0

[[44]]
[1] 0

[[45]]
[1] 0

[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

In [138]:
# print('TEST accuracy:')
# mydict <- model_pred_accuracy_NN(model, data_test, fst, fend, labnum)

# print('mydict')
# print(mydict)

# RandomForest

In [139]:
# Tilde symbol l is used within formulas of statistical models, 
# as mainly this symbol is used to define the relationship between 
# the dependent variable and the independent variables in the statistical 
# model formula in the R programming language. The left side of the tilde 
# symbol specifies the target variable (dependent variable or outcome) and 
# the right side of the tilde specifies the predictor variable(independent variables).

laquelle = 0

if (laquelle == 0){
    # Regression
    # X <- c("ref","cocoa_per_int","rating")  # covariates/features
    X <- c("ref","rating", "company_name_num", "origin_bar_num", "company_loc_num", "bean_origin_num")
    y <- 'bean_type_num ~ ' # response
    f <- as.formula(paste(y , paste(X, collapse='+')))
    print(f)
    model <- randomForest(f, data=data_train, importance=TRUE, proximity=TRUE)

} else if (laquelle == 1){
    # Regression
    model <- randomForest(y_train$bean_type_num ~., data=X_train, importance=TRUE, proximity=TRUE)
    
} else if (laquelle == 2){
    # Regression
    model <- randomForest(x=X_train, y=y_train$bean_type_num, 
                  xtest=X_test, ytest=y_test$bean_type_num, importance= TRUE)
} else {
    # Regression
    model <- randomForest(X_train, y_train$bean_type_num) 
}

print(model)

bean_type_num ~ ref + rating + company_name_num + origin_bar_num + 
    company_loc_num + bean_origin_num


“The response has five or fewer unique values.  Are you sure you want to do regression?”



Call:
 randomForest(formula = f, data = data_train, importance = TRUE,      proximity = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 2

          Mean of squared residuals: 0.4868278
                    % Var explained: 42.3


In [None]:
print('TRAINING accuracy:')
mydict <- model_pred_accuracy_RF(model, data, fst, fend, labnum)
print('mydict')
print(mydict)

[1] "TRAINING accuracy:"


In [None]:
# print('------------------------------------------------------')
# print('TEST accuracy:')
# mydict <- model_pred_accuracy_RF(model, data_test, fst, fend, labnum)
# print('mydict')
# print(mydict)

In [None]:
print('------------------------------------------------------')
print('Evaluate Feature Importance:')
round(importance(model), 2)

# Knit to output html

PATH="/kaggle/input/rclassification/code_example.rmd"
out <- str_split(PATH, "/")  # takes a character vector and returns a list.
df <- data.frame(out)
r <- length(rownames(df))
c <- length(colnames(df))
filename <- toString(df[r,c])
cat("filename: ", filename)

file.copy(from=PATH, to="./",
         overwrite=TRUE, recursive=FALSE,
         copy.mode=TRUE)

# https://search.r-project.org/CRAN/refmans/rmarkdown/html/render.html
# output_format="html_document", "cv_document", "pdf_document"
#rmarkdown::render(input=filename, output_format="html_document", 
#                  output_file="out.html")

rmarkdown::render(input=filename, output_format="html_document", 
                  output_file="r-classification.html")