In [None]:
# Install the initially required packages
install.packages("devtools")
install.packages("tidyverse")
install.packages("ggstatsplot")
install.packages("ggplot2")
install.packages("readxl")
install.packages("factoextra")
install.packages("NbClust")
install.packages("rlang")
install.packages('swamp')
install.packages("dplyr")

In [None]:
if (!require("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

BiocManager::install("impute")

In [None]:
install.packages("caret", dependencies=c("Depends", "Suggests"))
library(caret)
install.packages("ggplot2")
install.packages("ggstatsplot")

In [None]:
# Read the SkillCraft CSV file into a data frame.
path <- "D:/Nav/my_projects/ML/SkillCraft_Dataset.csv"
gameDataSet <- read.csv(file = path)

In [None]:
# Show the column names.
names(gameDataSet)

In [None]:
# Drop the GameID column so that it is not used as a model input.
gameDataSet <- gameDataSet[, names(gameDataSet) != "GameID"]

In [None]:
# Preview the first rows.
head(gameDataSet, n = 6L)

In [None]:
# Show the class of every column.
sapply(gameDataSet, FUN = class)

In [None]:
# Convert the non-numeric columns to numeric.

# Positions of the columns to convert.
cf <- c(2, 3, 4)

# Convert each column on its own with lapply(). apply() would first coerce
# the selected columns to one matrix, and as.matrix() on a data frame that
# mixes character and numeric columns passes the numeric ones through
# format(), which can round them. Entries that are not valid numbers become
# NA (with a coercion warning).
gameDataSet[cf] <- lapply(gameDataSet[cf],
                          function(x) as.numeric(as.character(x)))

In [None]:
# Re-check the class of every column after the conversion.
sapply(gameDataSet, class)

In [None]:
install.packages("magrittr") # package installations are only needed the first time you use it
install.packages("dplyr")    # alternative installation of the %>%
library(magrittr) # needs to be run every time you start R and want to use %>%
library(dplyr) 

In [None]:
# Standardise every column with scale() and store each result as a plain
# vector rather than the one-column matrix that scale() returns.
gameDf_scaled <- gameDataSet
gameDf_scaled[] <- lapply(gameDataSet,
                          function(column) as.vector(scale(column)))

head(gameDf_scaled)

In [None]:
# Box plot of all scaled columns; `out` holds the values drawn as outliers.
boxplot(gameDf_scaled)[["out"]]

In [None]:
# Flag the outliers of a numeric vector with the 1.5 * IQR rule.
#
# Args:
#   x: numeric vector; may contain NA.
#
# Returns:
#   Logical vector as long as `x`: TRUE where a value lies more than 1.5
#   interquartile ranges above the third quartile or below the first
#   quartile. Missing values are never flagged (they yield FALSE).
find_outlierx <- function(x) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: keeps the "25%"/"75%" labels out of the result.
  quartiles <- quantile(x, probs = c(.25, .75), na.rm = TRUE, names = FALSE)
  lower_q <- quartiles[1]
  upper_q <- quartiles[2]
  # Named iqr_width so that stats::IQR() is not shadowed.
  iqr_width <- upper_q - lower_q

  is_outlier <- x > upper_q + (iqr_width * 1.5) |
    x < lower_q - (iqr_width * 1.5)

  # Comparisons involving NA give NA; report those as "not an outlier" so the
  # result can be used directly as a row filter.
  is_outlier[is.na(is_outlier)] <- FALSE
  is_outlier
}

In [None]:
# Drop every row that holds an outlier in any of the given columns.
#
# Columns are processed in order and rows are removed after each column, so
# the outlier test for a later column only sees the rows that are still left.
#
# Args:
#   dataframe: data frame to filter.
#   columns: names of the columns to check; defaults to all columns.
#
# Returns:
#   The filtered data frame, as the (invisible) value of print(). The message
#   "Remove outliers" and the filtered data frame are printed as well.
remove_outliery <- function(dataframe,
                            columns = names(dataframe)) {
  for (col in columns) {
    keep_row <- !find_outlierx(dataframe[[col]])
    # drop = FALSE: without it a one-column data frame would collapse to a
    # plain vector here.
    dataframe <- dataframe[keep_row, , drop = FALSE]
  }

  print("Remove outliers")
  print(dataframe)
}

In [None]:
# Replace every missing value in the scaled data with 0.
gameDf_scaled <- replace(gameDf_scaled, is.na(gameDf_scaled), 0)

In [None]:
# Remove the rows that hold an outlier in any column.
nill_outliers_gameDf_scaled <- remove_outliery(
  gameDf_scaled,
  columns = names(gameDf_scaled)
)

In [None]:
# Box plot of the data after the outlier rows have been removed.
boxplot(nill_outliers_gameDf_scaled)[["out"]]

In [None]:
# Separate the output variable (LeagueIndex) from the input variables.
output_column <- nill_outliers_gameDf_scaled[["LeagueIndex"]]

new_input_gameDf <- nill_outliers_gameDf_scaled[
  , names(nill_outliers_gameDf_scaled) != "LeagueIndex"
]

In [None]:
# Draw one box plot per input variable on a 2 x 6 grid of panels.
oldpar1 <- par(mfrow = c(2, 6))
# At most 11 panels are drawn, and never more than there are columns, so a
# narrower data frame does not fail with "subscript out of bounds".
for (i in seq_len(min(11L, ncol(new_input_gameDf)))) {
  boxplot(new_input_gameDf[[i]])
  # Column name underneath the panel.
  mtext(names(new_input_gameDf)[i], cex = 1, side = 1, line = 2)
}
# Restore the previous graphics parameters.
par(oldpar1)

In [None]:
install.packages("randomForest")
install.packages("party")
install.packages("Amelia")
install.packages("yardstick")

In [None]:
library(randomForest)
# library() stops with an error when caTools is not installed; require()
# only returns FALSE, which would postpone the failure to the first
# sample.split() call.
library(caTools)
library("party")
library('tidyverse')
library("Amelia")
library('randomForest')
library(caret) # for applying classification and getting confusion matrix
library('yardstick')

In [None]:
head(nill_outliers_gameDf_scaled, n = 6L)

In [None]:
# Quick summary of every column.
summary(nill_outliers_gameDf_scaled)

In [None]:
# Class of every column.
sapply(nill_outliers_gameDf_scaled, FUN = class)

In [None]:
# Convert the LeagueIndex column to character.
nill_outliers_gameDf_scaled[["LeagueIndex"]] <- as.character(
  as.numeric(nill_outliers_gameDf_scaled[["LeagueIndex"]])
)

In [None]:
# Relabel the LeagueIndex values as letters in ascending order: the smallest
# value present becomes "A", the next one "B", and so on.
#
# The labels are derived from the values found in the column instead of being
# matched against hard-coded decimal strings, which only work for one exact
# dataset and one exact number-to-text conversion. Entries that are not
# numeric text (for example letters left by an earlier run of this cell) are
# kept unchanged, so the cell can safely be run again.
nill_outliers_gameDf_scaled$LeagueIndex <- local({
  league_chr <- as.character(nill_outliers_gameDf_scaled$LeagueIndex)
  # Non-numeric text becomes NA here on purpose; the warning is expected.
  league_num <- suppressWarnings(as.numeric(league_chr))
  is_numeric_text <- !is.na(league_num)

  league_levels <- sort(unique(league_num[is_numeric_text]))
  # Only 26 letters are available as labels.
  stopifnot(length(league_levels) <= length(LETTERS))

  league_chr[is_numeric_text] <-
    LETTERS[match(league_num[is_numeric_text], league_levels)]
  league_chr
})

In [None]:
# Turn the character labels into a factor for the later modelling steps.
nill_outliers_gameDf_scaled[["LeagueIndex"]] <- factor(
  as.character(nill_outliers_gameDf_scaled[["LeagueIndex"]])
)

In [None]:
# List the distinct LeagueIndex values after relabelling.
nill_outliers_gameDf_scaled %>%
  distinct(LeagueIndex)

In [None]:
# Count the missing (NA) entries of one column.
SumNa <- function(col) {
  missing_flags <- is.na(col)
  sum(missing_flags)
}

In [None]:
# One row per column: NA count (SumNa) and NA share (PctNa), sorted with the
# largest count first.
data.sum <- local({
  na_counts <- summarise_all(nill_outliers_gameDf_scaled, SumNa)
  na_long <- tidyr::gather(na_counts, key = 'feature', value = 'SumNa')
  na_sorted <- arrange(na_long, -SumNa)
  mutate(na_sorted, PctNa = SumNa / nrow(nill_outliers_gameDf_scaled))
})

In [None]:
# Keep the feature rows only: drop LeagueIndex and every column whose NA
# share is 85% or more.
data.sum2 <- data.sum %>%
  filter(!(feature %in% c('LeagueIndex')), PctNa < .85)

In [None]:
data.sum2[["feature"]]

In [None]:
# Final data frame for training: LeagueIndex plus the retained features,
# without rows whose LeagueIndex is missing.
data_F <- filter(
  dplyr::select(nill_outliers_gameDf_scaled, LeagueIndex, data.sum2$feature),
  !is.na(LeagueIndex)
)

In [None]:
# Visualise where values are missing.
Amelia::missmap(as.data.frame(data_F))

In [None]:
set.seed(8576309)

# Pick the rows for a 70% training / 30% test split.
trainIndex <- createDataPartition(
  y = data_F[["LeagueIndex"]],
  times = 1,
  p = .7,
  list = FALSE
)

In [None]:
# Rows listed in trainIndex go to training, all other rows to testing.
train_df <- data_F[trainIndex, ]
test_df <- data_F[-trainIndex, ]

In [None]:
# Resampling set-up for train(): 3-fold cross-validation with a random
# search over the tuning parameters.
train_ctrl <- trainControl(
  method = "cv",
  number = 3,
  search = "random"
)

In [None]:
# Class of the first column of the training data.
class(train_df[, 1])

In [None]:
# Time stamp taken before training starts.
toc <- Sys.time()

# Fit a random forest through caret, optimising accuracy. ntree, keep.forest
# and importance are options passed on to randomForest.
model_rf <- train(
  LeagueIndex ~ .,
  data = train_df,
  method = "rf",
  metric = "Accuracy",
  trControl = train_ctrl,
  ntree = 741,
  keep.forest = TRUE,
  importance = TRUE
)
# Time stamp taken after training has finished.
tic <- Sys.time()

In [None]:
# Elapsed training time.
difftime(tic, toc)

In [None]:
# Summary of the fitted model.
model_rf

In [None]:
# Variable-importance plot of the final forest.
randomForest::varImpPlot(model_rf$finalModel)

In [None]:
# Score the test set: class probabilities and predicted classes.
probs <- predict(model_rf, newdata = test_df, type = 'prob')
class <- predict(model_rf, newdata = test_df, type = 'raw')


# Test rows with the probability columns and the predicted class appended.
test_df.scored <- cbind(test_df, probs, class)

test_df.scored

In [None]:
# Confusion matrix: true LeagueIndex against the predicted class.
cm <- conf_mat(test_df.scored, truth = LeagueIndex, estimate = class)

In [None]:
# Cell counts of the confusion matrix.
cm

In [None]:
# Metrics derived from the confusion matrix.
summary(cm)

In [None]:
# Horizontal bar chart with one bar per metric.
ggplot(summary(cm), aes(x = .metric, y = .estimate)) +
  geom_col() +
  coord_flip()

In [None]:
set.seed(1)

In [None]:
# 70/30 split with caTools: `sample` is TRUE for the training rows.
sample <- sample.split(nill_outliers_gameDf_scaled$LeagueIndex,
                       SplitRatio = .7)
train <- subset(nill_outliers_gameDf_scaled, subset = sample)
test <- subset(nill_outliers_gameDf_scaled, subset = !sample)

In [None]:
dim(train)
dim(test)

In [None]:
# Random forest with default settings on the training rows.
rf <- randomForest(LeagueIndex ~ ., data = train)

In [None]:
rf

In [None]:
# Predict the test rows from every column except the first one.
pred <- predict(rf, newdata = test[-1])

In [None]:
# Cross-tabulate the first test column against the predictions.
cm <- table(test[, 1], pred)

In [None]:
# Conditional inference tree.
# NOTE(review): the tree is fitted on `test`, not `train`; confirm that this
# is intended.
x <- ctree(LeagueIndex ~ ., data = test)
plot(x, type = "simple")

In [None]:
install.packages("caret")
install.packages("imbalance")

In [None]:
library(caret) # for applying classification and getting confusion matrix
library(imbalance) # for haberman data set
library(dplyr) # for transforming to tibble object
library(cvms) # for plot a confusion matrix in R

In [None]:
# Predictions of the forest for the training rows.
train_pred <- predict(rf, newdata = train[-1])

In [None]:
# Training confusion table, stored under two names.
cm_train <- table(train[, 1], train_pred)
cm_table_train <- cm_train

In [None]:
# Predictions of the forest for the test rows.
test_pred <- predict(rf, newdata = test[-1])

In [None]:
# Test confusion table.
cm_test <- table(test[, 1], test_pred)

In [None]:
# Long form of the training confusion table: one row per cell with a count
# column. as_tibble() handles a two-dimensional table directly; enframe() is
# meant for one-dimensional vectors and cannot produce this layout.
cm_train_ef <- as_tibble(cm_train)

In [None]:
# Same conversion for the test confusion table.
cm_test_ef <- as_tibble(cm_test)

In [None]:
# Tibble for train data: first table dimension (actual values) -> Target,
# second dimension (predictions) -> Prediction, cell count -> N.
tab_fin_train <- cm_train_ef

colnames(tab_fin_train) <- c("Target", "Prediction", "N")

In [None]:
# Tibble for test data, with the same column layout.
tab_fin_test <- cm_test_ef

colnames(tab_fin_test) <- c("Target", "Prediction", "N")

In [None]:
# Confusion matrix plot for the training data.
plot_confusion_matrix(tab_fin_train)

In [None]:
# Confusion matrix plot for the test data (the closing parenthesis was
# missing, which made this cell a syntax error).
plot_confusion_matrix(tab_fin_test)

In [None]:
set.seed(1)

In [None]:
# Mark "?" entries as missing, then count the NAs per column.
gameDataSet[gameDataSet == "?"] <- NA
colSums(is.na(gameDataSet))

In [None]:
summary(gameDataSet)

In [None]:
# Keep only the rows where Age, HoursPerWeek and TotalHours are all present.
gameDataSet <- gameDataSet[
  !(gameDataSet$Age %in% NA) &
    !(gameDataSet$HoursPerWeek %in% NA) &
    !(gameDataSet$TotalHours %in% NA),
]

colSums(is.na(gameDataSet))

In [None]:
# 70/30 split of the unscaled data: `sample1` is TRUE for the training rows.
sample1 <- sample.split(gameDataSet$LeagueIndex, SplitRatio = .7)
# Subset with `sample1`, the mask built from gameDataSet just above. The
# earlier `sample` mask was built for a different data frame.
train1  <- subset(gameDataSet, sample1 == TRUE)
test1   <- subset(gameDataSet, sample1 == FALSE)

In [None]:
# Size of both parts of the split.
dim(train1)
dim(test1)

In [None]:
# Random forest with default settings on the unscaled training rows.
rf1 <- randomForest(LeagueIndex ~ ., data = train1)

In [None]:
rf1

In [None]:
install.packages("OneR")
install.packages("rfviz")

In [None]:
library(ggplot2)
library(OneR)
library(rfviz)
library(tidyverse)

In [None]:
# 70/30 split: `sample_rg` is TRUE for the training rows.
sample_rg <- sample.split(nill_outliers_gameDf_scaled$LeagueIndex,
                          SplitRatio = .7)
train_rg <- subset(nill_outliers_gameDf_scaled, subset = sample_rg)
test_rg <- subset(nill_outliers_gameDf_scaled, subset = !sample_rg)

In [None]:
# Inputs: every column but the first; response: the first column (kept as a
# one-column data frame).
train_rg_x <- train_rg[-1]
train_rg_y <- train_rg[1]

In [None]:
head(nill_outliers_gameDf_scaled, n = 6L)

In [None]:
# NA count per column (the closing parenthesis was missing, which made this
# cell a syntax error).
colSums(is.na(nill_outliers_gameDf_scaled))

In [None]:
# Class of the response vector.
class(train_rg_y[[1]])

In [None]:
# Prepare the rfviz input from the training inputs and response.
rfprep <- rf_prep(train_rg_x, train_rg_y[[1]])

In [None]:
# Variable-importance plot of the forest stored in `rfprep`.
varImpPlot(rfprep[["rf"]])

In [None]:
# Open the rfviz visualisation tool.
bcrf <- rf_viz(rfprep, input = TRUE, imp = TRUE, cmd = TRUE)

In [None]:
set.seed(4543)

In [None]:
# Forest of 1000 trees on the full filtered data set, with importance
# measures and without storing the trees.
# NOTE(review): an earlier cell converts LeagueIndex to a factor; if that cell
# has run, this fits a classification forest although the object and the
# output file are named for regression. Confirm which one is intended.
rfrg.fit <- randomForest(
  LeagueIndex ~ .,
  data = nill_outliers_gameDf_scaled,
  ntree = 1000,
  keep.forest = FALSE,
  importance = TRUE
)

In [None]:
rfrg.fit

In [None]:
# Open a PNG device so that the next plot is written to this file.
png(filename = "D:/Nav/my_projects/ML/PlayerPatterns_Recognition/randomForestRegression.png")

# Error against the number of trees.
plot(rfrg.fit)

# Close the device, which finishes writing the file.
dev.off()

In [None]:
# Best number of clusters using NbClust.
# The call is namespace-qualified because the notebook installs NbClust but
# has no library(NbClust) cell in view.
set.seed(26)
clusterNo <- NbClust::NbClust(new_input_gameDf, distance = "euclidean",
                              min.nc = 2, max.nc = 10, method = "kmeans",
                              index = "all")

In [None]:
# Best number of clusters using the elbow method (within-cluster sum of
# squares), with a dashed reference line at 2 clusters.
# factoextra is installed but has no library() cell in view, so the call is
# namespace-qualified as well.
factoextra::fviz_nbclust(new_input_gameDf, kmeans, method = "wss") +
  geom_vline(xintercept = 2, linetype = 2) +
  labs(subtitle = "Elbow method")

In [None]:
# Best number of clusters using the silhouette method.
factoextra::fviz_nbclust(new_input_gameDf, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")