In [14]:
library(tidyverse)
library(caret)
library(randomForest)
library(xgboost)
library(gbm)

In [15]:
# Split the data into training and validation sets
trainData <- read.csv("/kaggle/input/dsr302m/train.csv")
testData <- read.csv("/kaggle/input/dsr302m/test.csv")
head(trainData,5)


Unnamed: 0_level_0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,type
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>
1,6.6,0.3,0.36,1.2,0.035,43,126,0.9909,3.01,0.63,11.4,6,white
2,7.7,0.5,0.26,1.9,0.062,9,31,0.9966,3.39,0.64,9.6,5,red
3,8.4,0.5,0.35,2.9,0.076,21,127,0.9976,3.23,0.63,9.2,5,red
4,7.5,0.4,0.33,5.0,0.045,30,131,0.9942,3.32,0.44,10.9,6,white
5,6.4,0.2,0.25,20.2,0.083,35,157,0.9998,3.17,0.5,9.1,5,white


# Preprocessing

## Label encode

In [16]:
# Encoding the character column within the dataframe
# trainData$type <- as.numeric(factor(trainData$type))

## Frequency encode

In [17]:
freq_count  <-  table(trainData$type)
trainData$type   <-  match(trainData$type, names(freq_count))

# Feature Engineer
original [1] "RMSE: 0.269911066116102"

free/total [1] "RMSE: 0.267660409371441"

total/res [1] "RMSE: 0.266309867791473"

res/total [1] "RMSE: 0.264389686515288"

-fixed [1] "RMSE: 0.263081606218825"





In [18]:
round(cor(trainData),
  digits = 3 # rounded to 2 decimals
)

Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality,type
fixed.acidity,1.0,0.225,0.319,-0.127,0.298,-0.296,-0.344,0.475,-0.236,0.335,-0.106,-0.06,-0.503
volatile.acidity,0.225,1.0,-0.377,-0.21,0.368,-0.365,-0.42,0.287,0.266,0.233,-0.076,-0.26,-0.657
citric.acid,0.319,-0.377,1.0,0.141,0.047,0.142,0.201,0.083,-0.333,0.071,0.013,0.091,0.196
residual.sugar,-0.127,-0.21,0.141,1.0,-0.147,0.414,0.502,0.522,-0.264,-0.196,-0.333,-0.028,0.365
chlorides,0.298,0.368,0.047,-0.147,1.0,-0.21,-0.286,0.359,0.04,0.442,-0.262,-0.194,-0.515
free.sulfur.dioxide,-0.296,-0.365,0.142,0.414,-0.21,1.0,0.73,0.002,-0.152,-0.201,-0.149,0.068,0.491
total.sulfur.dioxide,-0.344,-0.42,0.201,0.502,-0.286,0.73,1.0,0.007,-0.24,-0.276,-0.23,-0.029,0.699
density,0.475,0.287,0.083,0.522,0.359,0.002,0.007,1.0,0.03,0.278,-0.69,-0.298,-0.42
pH,-0.236,0.266,-0.333,-0.264,0.04,-0.152,-0.24,0.03,1.0,0.164,0.097,0.004,-0.333
sulphates,0.335,0.233,0.071,-0.196,0.442,-0.201,-0.276,0.278,0.164,1.0,-0.03,0.023,-0.491


In [19]:
trainData$free_total <- trainData$free.sulfur.dioxide / trainData$total.sulfur.dioxide
trainData$total_res <- trainData$total.sulfur.dioxide / trainData$residual.sugar
trainData$res_total <- trainData$residual.sugar / trainData$total.sulfur.dioxide
summary(trainData)

 fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
 Min.   : 3.800   Min.   :0.1000   Min.   :0.0000   Min.   : 0.600  
 1st Qu.: 6.400   1st Qu.:0.2000   1st Qu.:0.2400   1st Qu.: 1.800  
 Median : 7.000   Median :0.3000   Median :0.3100   Median : 3.000  
 Mean   : 7.252   Mean   :0.3484   Mean   :0.3175   Mean   : 5.412  
 3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.000  
 Max.   :15.900   Max.   :1.3000   Max.   :1.6600   Max.   :65.800  
   chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
 Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
 1st Qu.:0.03800   1st Qu.: 16.00      1st Qu.: 75.0        1st Qu.:0.9924  
 Median :0.04800   Median : 28.00      Median :117.0        Median :0.9951  
 Mean   :0.05751   Mean   : 30.13      Mean   :114.6        Mean   :0.9948  
 3rd Qu.:0.06800   3rd Qu.: 41.00      3rd Qu.:155.0        3rd Qu.:0.9971  
 Max.   :0.61100   Max.   :289.00      Max.   :440.0   

# Traning

In [20]:
set.seed(42)  # Set a seed for reproducibility
trainIndex <- createDataPartition(trainData$quality, p = .8, 
                                  list = FALSE,
                                  times = 1)
trainData <- trainData[ trainIndex,]
validationData <- trainData[-trainIndex,]
# Train a Random Forest Model
model <- randomForest(quality~.-fixed.acidity, data=trainData,
                   ntree = 350,       # number of trees
                   mtry = 13,          # number of variables tried at each split
                   importance = TRUE)


print(model)
predictions <- predict(model, newdata = validationData)
# Calculate RMSE
rmse <- sqrt(mean((predictions - validationData$quality)^2))
print(paste("RMSE:", rmse))



Call:
 randomForest(formula = quality ~ . - fixed.acidity, data = trainData,      ntree = 350, mtry = 13, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 350
No. of variables tried at each split: 13

          Mean of squared residuals: 0.3519416
                    % Var explained: 55.29
[1] "RMSE: 0.260298104352344"


# Ensemble model

In [21]:
# set.seed(42)  # Set a seed for reproducibility
# trainIndex <- createDataPartition(trainData$quality, p = .8, 
#                                   list = FALSE,
#                                   times = 1)
# trainData <- trainData[ trainIndex,]
# validationData <- trainData[-trainIndex,]

# td <- subset(trainData, select = -c(quality,fixed.acidity))
# vd <- subset(validationData, select = -c(quality,fixed.acidity))

# xgb_train <- xgb.DMatrix(data = as.matrix(td), label = trainData$quality)
# xgb_valid <- xgb.DMatrix(data = as.matrix(vd), label = validationData$quality)

# # Train XGBoost model
# xgb_model <- xgboost(data = xgb_train,
#                       nround=200,
#                       max_depth = 17,
#                       eta = 0.09,
#                       gamma=1,
#                       objective = "reg:squarederror",
#                       verbose=0)
                       
# # Make predictions and evaluate
# xgb_preds <- predict(xgb_model, xgb_valid)
# rf_model <- randomForest(quality~.-fixed.acidity, data=trainData,
#                    ntree = 350,       # number of trees
#                    mtry = 13,          # number of variables tried at each split
#                    importance = TRUE)
# rf_preds <- predict(rf_model, newdata = validationData)
# rf_rmse <- sqrt(mean((rf_preds - validationData$quality)^2))

# print(paste("Ensemble RMSE:", rf_rmse))
# Calculate RMSE
# ensemble_preds <- 0.6*rf_preds + 0.4*xgb_preds

# # Evaluate ensemble
# ensemble_rmse <- sqrt(mean((ensemble_preds - validationData$quality)^2))

# print(paste("Ensemble RMSE:", ensemble_rmse))


# Fine -Tuning


In [22]:
# set.seed(42)  # Set a seed for reproducibility
# trainIndex <- createDataPartition(trainData$quality, p = .8, 
#                                   list = FALSE,
#                                   times = 1)
# trainData <- trainData[ trainIndex,]
# validationData <- trainData[-trainIndex,]
# # Set candidate values
# ntree_vals <- c(100,200,300, 350,500)
# mtry_vals <- c(8,9,10,11,12,13) 

# # Function to evaluate a model
# tune_model <- function(ntree, mtry){

#   model <- randomForest(quality ~., 
#                         data = trainData, 
#                         ntree = ntree,
#                         mtry = mtry,
#                         importance = TRUE)
                        
#   predictions <- predict(model, newdata = validationData)
#   rmse <- sqrt(mean((predictions - validationData$quality)^2))
  
#   return(rmse)
# }

# # Test all combinations 
# results <- expand.grid(ntree = ntree_vals,
#                        mtry = mtry_vals)
                       
# results$rmse <- apply(results, 1, function(x) tune_model(x['ntree'], x['mtry']))

# # Examine results
# print(results)

# # Refit with best params
# best_ntree <- results$ntree[which.min(results$rmse)] 
# best_mtry <- results$mtry[which.min(results$rmse)]

# final_model <- randomForest(quality ~., 
#                             data = trainData,
#                             ntree = best_ntree, 
#                             mtry = best_mtry,
#                             importance = TRUE)


# GBM

In [23]:
# trainIndex <- createDataPartition(trainData$quality, p = .8, 
#                                   list = FALSE,
#                                   times = 1)
# trainData <- trainData[ trainIndex,]
# validationData <- trainData[-trainIndex,]

# n.trees <- c(100,150,200,300,400)
# interaction.depth <- c(3,4,5) 

# # Function to evaluate a model
# tune_model <- function(tree, depth){

#   model <- gbm(quality ~.-fixed.acidity, 
#                         data = trainData, 
#                         n.trees = n.trees,
#                         interaction.depth = interaction.depth)
                        
#   predictions <- predict(model, newdata = validationData)
#   rmse <- sqrt(mean((predictions - validationData$quality)^2))
  
#   return(rmse)
# }

# # Test all combinations 
# results <- expand.grid(n.trees= n.trees,
#                        interaction.depth = interaction.depth)
                       
# results$rmse <- apply(results, 1, function(x) tune_model(x['n.trees'], x['interaction.depth']))

# # Examine results
# print(results)

# # Refit with best params
# best_tree <- results$n.trees[which.min(results$rmse)] 
# best_depth <- results$n.trees[which.min(results$rmse)]

# gbm_model <- gbm(quality ~. -fixed.acidity,
#                 data = trainData,
#                 n.trees = best_tree,
#                 interaction.depth = best_depth)
                
# # Predict                 
# # gbm_preds <- predict(gbm_model, validationData, n.trees=1000)

# Prediction test

In [24]:
testData <- read.csv("/kaggle/input/dsr302m/test.csv")
t <- subset(testData, select = -id)
# Encoding the character column within the dataframe
# trainData$type <- as.numeric(factor(trainData$type))
freq_count_test  <-  table(t$type)
t$type   <-  match(t$type, names(freq_count_test))
summary(t)

 fixed.acidity   volatile.acidity  citric.acid     residual.sugar  
 Min.   : 4.80   Min.   :0.0800   Min.   :0.0000   Min.   : 0.700  
 1st Qu.: 6.50   1st Qu.:0.2300   1st Qu.:0.2400   1st Qu.: 1.800  
 Median : 7.00   Median :0.3000   Median :0.3250   Median : 2.500  
 Mean   : 7.26   Mean   :0.3525   Mean   :0.3212   Mean   : 4.992  
 3rd Qu.: 7.70   3rd Qu.:0.4200   3rd Qu.:0.4000   3rd Qu.: 7.400  
 Max.   :13.00   Max.   :1.5800   Max.   :0.7800   Max.   :23.500  
   chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
 Min.   :0.01500   Min.   :  3.00      Min.   :  7.00       Min.   :0.9871  
 1st Qu.:0.04000   1st Qu.: 16.00      1st Qu.: 64.75       1st Qu.:0.9925  
 Median :0.04800   Median : 29.00      Median :116.00       Median :0.9949  
 Mean   :0.05741   Mean   : 30.46      Mean   :111.77       Mean   :0.9947  
 3rd Qu.:0.06600   3rd Qu.: 41.00      3rd Qu.:156.00       3rd Qu.:0.9969  
 Max.   :0.41300   Max.   :146.50      Max.   :366.50       Ma

In [25]:
t$free_total <- t$free.sulfur.dioxide / t$total.sulfur.dioxide
t$total_res <- t$total.sulfur.dioxide / t$residual.sugar
t$res_total <- t$residual.sugar / t$total.sulfur.dioxide
head(t,5)

Unnamed: 0_level_0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,type,free_total,total_res,res_total
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>
1,7.2,0.25,0.37,2.5,0.063,11,41,0.99439,3.52,0.8,12.4,1,0.2682927,16.4,0.06097561
2,8.2,0.27,0.39,7.8,0.039,49,208,0.9976,3.31,0.51,9.5,2,0.2355769,26.66667,0.0375
3,8.9,0.32,0.49,1.6,0.05,17,131,0.9956,3.13,0.34,9.4,2,0.129771,81.875,0.01221374
4,7.4,0.16,0.3,13.7,0.056,33,168,0.99825,2.9,0.44,8.7,2,0.1964286,12.26277,0.08154762
5,6.4,0.28,0.56,1.7,0.156,49,106,0.99354,3.1,0.37,9.2,2,0.4622642,62.35294,0.01603774


In [26]:
# Make predictions on the test data
test_predictions <- predict(model, newdata = t)
# Create a data frame with 'id' and 'quality' columns
result <- data.frame(id = testData$id, quality = test_predictions)

# Save the results to a CSV file
write.csv(result, "result.csv", row.names = FALSE)
result_data <-read.csv("/kaggle/working/result.csv")
result_data


id,quality
<int>,<dbl>
1257,6.666286
6409,5.470857
136,4.969143
1631,6.677762
6084,5.733714
5434,5.244857
1094,5.424333
5146,6.362000
5921,6.749762
1076,6.022571
