## Importing The Dataset ##

In [1]:
dataset = read.csv('Data.csv')

In [2]:
dataset

Country,Age,Salary,Purchased
<chr>,<int>,<int>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


To read the Excel file, you need to install a library:

In [4]:
#install.packages("readxl")

Installing package into 'C:/Users/ASUS/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)



package 'readxl' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\ASUS\AppData\Local\Temp\RtmpO8p7ow\downloaded_packages


In [5]:
library(readxl)

"package 'readxl' was built under R version 4.4.2"


In [6]:
data <- read_excel('Data.xlsx')

In [7]:
data

Country,Age,Salary,Purchased
<chr>,<dbl>,<dbl>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


# Or

In [8]:
#install.packages("openxlsx")

In [9]:
library(openxlsx)

"package 'openxlsx' was built under R version 4.4.2"


In [10]:
data <- read.xlsx("Data.xlsx", sheet = 1)  # or specify the sheet name

In [11]:
data

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
1,France,44.0,72000.0,No
2,Spain,27.0,48000.0,Yes
3,Germany,30.0,54000.0,No
4,Spain,38.0,61000.0,No
5,Germany,40.0,,Yes
6,France,35.0,58000.0,Yes
7,Spain,,52000.0,No
8,France,48.0,79000.0,Yes
9,Germany,50.0,83000.0,No
10,France,37.0,67000.0,Yes


## Taking care of missing data

### 1. Drop missing samples

In [14]:
data_cleaned <- na.omit(dataset)

In [15]:
data_cleaned

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>
1,France,44,72000,No
2,Spain,27,48000,Yes
3,Germany,30,54000,No
4,Spain,38,61000,No
6,France,35,58000,Yes
8,France,48,79000,Yes
9,Germany,50,83000,No
10,France,37,67000,Yes


### 2. Imputation

In [11]:
dataset$Age = ifelse(is.na(dataset$Age),
                     ave(dataset$Age, FUN = function(x) mean(x, na.rm = TRUE)),
                     dataset$Age)

In [12]:
dataset

Country,Age,Salary,Purchased
<chr>,<dbl>,<int>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,,Yes
France,35.0,58000.0,Yes
Spain,38.77778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


In [13]:
dataset$Salary = ifelse(is.na(dataset$Salary),
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
                        dataset$Salary)

In [14]:
dataset

Country,Age,Salary,Purchased
<chr>,<dbl>,<dbl>,<chr>
France,44.0,72000.0,No
Spain,27.0,48000.0,Yes
Germany,30.0,54000.0,No
Spain,38.0,61000.0,No
Germany,40.0,63777.78,Yes
France,35.0,58000.0,Yes
Spain,38.77778,52000.0,No
France,48.0,79000.0,Yes
Germany,50.0,83000.0,No
France,37.0,67000.0,Yes


### 3. Regression imputation

We will cover this part after we find out what Regression is :)

## Encoding categorical data

In [15]:
dataset$Country = factor(dataset$Country,
                         levels = c('France', 'Spain', 'Germany'),
                         labels = c(1, 2, 3))

In [16]:
dataset

Country,Age,Salary,Purchased
<fct>,<dbl>,<dbl>,<chr>
1,44.0,72000.0,No
2,27.0,48000.0,Yes
3,30.0,54000.0,No
2,38.0,61000.0,No
3,40.0,63777.78,Yes
1,35.0,58000.0,Yes
2,38.77778,52000.0,No
1,48.0,79000.0,Yes
3,50.0,83000.0,No
1,37.0,67000.0,Yes


In [17]:
dataset$Purchased = factor(dataset$Purchased,
                           levels = c('No', 'Yes'),
                           labels = c(0, 1))

In [18]:
dataset

Country,Age,Salary,Purchased
<fct>,<dbl>,<dbl>,<fct>
1,44.0,72000.0,0
2,27.0,48000.0,1
3,30.0,54000.0,0
2,38.0,61000.0,0
3,40.0,63777.78,1
1,35.0,58000.0,1
2,38.77778,52000.0,0
1,48.0,79000.0,1
3,50.0,83000.0,0
1,37.0,67000.0,1


## Splitting the dataset into the Training set and Test set

In [19]:
install.packages('caTools')

Installing package into 'C:/Users/ASUS/AppData/Local/R/win-library/4.4'
(as 'lib' is unspecified)



package 'caTools' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\ASUS\AppData\Local\Temp\RtmpeqUv8W\downloaded_packages


In [20]:
library(caTools)

"package 'caTools' was built under R version 4.4.2"


In [21]:
set.seed(123)

In [25]:
split = sample.split(dataset$Purchased, SplitRatio = 0.8) #Purchased is your dependent variables

In [26]:
split

In [24]:
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

In [27]:
training_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
1,1,44.0,72000.0,0
2,2,27.0,48000.0,1
3,3,30.0,54000.0,0
4,2,38.0,61000.0,0
5,3,40.0,63777.78,1
7,2,38.77778,52000.0,0
8,1,48.0,79000.0,1
10,1,37.0,67000.0,1


In [28]:
test_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
6,1,35,58000,1
9,3,50,83000,0


## Feature Scaling

In [29]:
training_set = scale(training_set)

ERROR: Error in colMeans(x, na.rm = TRUE): 'x' must be numeric


In [31]:
test_set = scale(test_set)

ERROR: Error in colMeans(x, na.rm = TRUE): 'x' must be numeric


In [33]:
training_set[,2:3] = scale(training_set[,2:3])

In [34]:
training_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
1,1,0.90101716,0.9392746,0
2,2,-1.58847494,-1.337116,1
3,3,-1.14915281,-0.7680183,0
4,2,0.02237289,-0.1040711,0
5,3,0.31525431,0.1594,1
7,2,0.13627122,-0.9577176,0
8,1,1.48678,1.6032218,1
10,1,-0.12406783,0.4650265,1


In [35]:
test_set[,2:3] = scale(test_set[,2:3])

In [36]:
test_set

Unnamed: 0_level_0,Country,Age,Salary,Purchased
Unnamed: 0_level_1,<fct>,<dbl>,<dbl>,<fct>
6,1,-0.7071068,-0.7071068,1
9,3,0.7071068,0.7071068,0
