# Machine Learning with SVMs

Load package with SVM implementation:

In [2]:
library(e1071)

## Classification with an SVM (Support-Vector Machine)

### Read and prepare data

Select the features, set the working directory and read the data:

In [3]:
# Columns of the CSV that are kept for modelling.
# "smoking" comes first; it is the response of the model formula used later
# (`smoking ~ .`), all remaining entries act as predictors.
selected_features <- c(
  "smoking",
  "gender", "age",
  "height.cm.", "weight.kg.",
  "systolic", "relaxation",
  "fasting.blood.sugar", "triglyceride", "HDL",
  "hemoglobin", "serum.creatinine",
  "ALT", "Gtp"
)

In [4]:
# Make the project's data directory the working directory so that the relative
# file name passed to read.csv() resolves against it.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on any
# machine where this directory does not exist.
setwd('/home/steinerj/Documents/ai-b/semester-4/machine-learning/ai-b-4-ml-project/data')

In [5]:
# Read the raw table from the working directory; character columns are
# converted to factors on import.
data <- read.csv(
  file = "smoking.csv",
  header = TRUE,
  sep = ",",
  fill = TRUE,
  stringsAsFactors = TRUE
)

# Keep only the columns listed in `selected_features` for modelling.
model.data <- data[, selected_features]

In [6]:
# Turn the smoking column into a factor so that it is treated as a class label.
model.data[["smoking"]] <- as.factor(model.data[["smoking"]])

In [7]:
# Per-column overview of the modelling data: level counts for the factor
# columns, min/quartiles/mean/max for the numeric columns.
summary(model.data)

 smoking   gender         age          height.cm.      weight.kg.    
 0:35237   F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00  
 1:20455   M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00  
                     Median :40.00   Median :165.0   Median : 65.00  
                     Mean   :44.18   Mean   :164.6   Mean   : 65.86  
                     3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00  
                     Max.   :85.00   Max.   :190.0   Max.   :135.00  
    systolic       relaxation  fasting.blood.sugar  triglyceride  
 Min.   : 71.0   Min.   : 40   Min.   : 46.00      Min.   :  8.0  
 1st Qu.:112.0   1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0  
 Median :120.0   Median : 76   Median : 96.00      Median :108.0  
 Mean   :121.5   Mean   : 76   Mean   : 99.31      Mean   :126.7  
 3rd Qu.:130.0   3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0  
 Max.   :240.0   Max.   :146   Max.   :505.00      Max.   :999.0  
      HDL           hemoglobin    serum.c

### Split dataset into training and test set

In [8]:
# Shuffle the rows and split them 70 % / 30 % into a training and a test set.
#
# The seed is fixed so that the shuffle, and with it the split, is the same on
# every run. It also seeds any later random draws in this session.
set.seed(42)

n <- nrow(model.data)

# Random permutation of the row indices; sample.int() is also well defined for n = 0.
index <- sample.int(n)
model.data <- model.data[index, ]

# Number of rows that go into the training set.
seventyPercentLimit <- round(n * 0.7, 0)

# seq_len() and a logical mask avoid the reversed sequences that `1:k` and
# `(k + 1):n` produce when one side of the split is empty.
model.data.train <- model.data[seq_len(seventyPercentLimit), ]
model.data.test <- model.data[seq_len(n) > seventyPercentLimit, ]

### Classification

Define the tuning-parameter grids (exponents for cost and gamma):

In [9]:
# Exponent grids for the hyper-parameter search:
#   cc - exponents for the cost grid  (used as 2^cc)
#   cg - exponents for the gamma grid (used as 10^cg)
cc <- seq(from = -5, to = 10, by = 1)
cg <- seq(from = -5, to = 1, by = 0.5)

Compute the model:

In [10]:
# Grid search over gamma and cost for a radial-kernel C-classification SVM,
# fitted on the training split and scored with 5-fold cross-validation.
# NOTE(review): epsilon is passed as a single fixed value; it looks like it only
# matters for regression-type SVMs - confirm against the e1071 documentation.
tuning <- tune.svm(
    smoking ~ .,
    data = model.data.train,
    # searched / fixed hyper-parameters
    gamma = 10^cg,
    cost = 2^cc,
    epsilon = 0.1,
    # model settings handed on to the SVM fit
    type = "C-classification",
    kernel = "radial",
    scale = TRUE,
    # resampling scheme
    tunecontrol = tune.control(sampling = "cross", cross = 5)
)

In [None]:
# Show the outcome of the grid search.
print(tuning)

In [None]:
# Take the model stored under "best.model" in the tuning result.
model <- tuning[["best.model"]]

Calculate the predictions:

In [None]:
# All rows of the raw `data` frame, restricted to the selected columns.
# `selected_features` also lists the response column 'smoking', so X contains it
# as well. NOTE(review): these are all rows, not only the held-out test rows.
X <- data[,selected_features]

Calculate confusion matrix:

In [None]:
# Evaluate on the held-out test split only. Predicting on the complete data set
# would include the rows the model was trained on and make the confusion
# matrix look better than the model really is.
predictions <- predict(model, model.data.test)

# True labels of exactly the rows that were predicted (factor with levels 0/1).
y <- model.data.test[, "smoking"]

In [None]:
# 2x2 confusion matrix: rows are the predicted class, columns the true class.
A <- matrix(
  0,
  nrow = 2,
  ncol = 2,
  dimnames = list(
    c("Prognose: No smoker", "Prognose: Smoker"),
    c("Real: No smoker", " Real: Smoker")
  )
)

# Count the cases per (predicted, true) combination; summing a logical vector
# counts its TRUE entries.
A[1, 1] <- sum(predictions == 0 & y == 0)
A[1, 2] <- sum(predictions == 0 & y == 1)
A[2, 1] <- sum(predictions == 1 & y == 0)
A[2, 2] <- sum(predictions == 1 & y == 1)

In [None]:
# Display the confusion matrix.
A