# Logistic Regression

## With feature selection

### Read and preprocess data

In [1]:
selected_features <- c('smoking',
                       'gender',
                       'age',
                       'height.cm.',
                       'weight.kg.',
                       'systolic',
                       'relaxation',
                       'fasting.blood.sugar',
                       'triglyceride',
                       'HDL',
                       'hemoglobin',
                       'serum.creatinine',
                       'ALT',
                       'Gtp',
                       'dental.caries',
                       'tartar')

In [2]:
setwd('/home/steinerj/Documents/ai-b/semester-4/machine-learning/ai-b-4-ml-project/data')

In [3]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [4]:
model.data <- data[,selected_features]

In [5]:
summary(model.data)

    smoking       gender         age          height.cm.      weight.kg.    
 Min.   :0.0000   F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00  
 1st Qu.:0.0000   M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00  
 Median :0.0000             Median :40.00   Median :165.0   Median : 65.00  
 Mean   :0.3673             Mean   :44.18   Mean   :164.6   Mean   : 65.86  
 3rd Qu.:1.0000             3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00  
 Max.   :1.0000             Max.   :85.00   Max.   :190.0   Max.   :135.00  
    systolic       relaxation  fasting.blood.sugar  triglyceride  
 Min.   : 71.0   Min.   : 40   Min.   : 46.00      Min.   :  8.0  
 1st Qu.:112.0   1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0  
 Median :120.0   Median : 76   Median : 96.00      Median :108.0  
 Mean   :121.5   Mean   : 76   Mean   : 99.31      Mean   :126.7  
 3rd Qu.:130.0   3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0  
 Max.   :240.0   Max.   :146   Max.   :505.00      Max.   :

In [6]:
model.data[,"tartar"] <- ifelse(data[,"tartar"] == 'Y', 1, 0)
model.data[,"tartar"] <- as.factor(model.data[,"tartar"])

In [7]:
model.data[,"smoking"] <- as.factor(model.data[,"smoking"])
model.data[,"dental.caries"] <- as.factor(model.data[,"dental.caries"])

In [8]:
summary(model.data)

 smoking   gender         age          height.cm.      weight.kg.    
 0:35237   F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00  
 1:20455   M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00  
                     Median :40.00   Median :165.0   Median : 65.00  
                     Mean   :44.18   Mean   :164.6   Mean   : 65.86  
                     3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00  
                     Max.   :85.00   Max.   :190.0   Max.   :135.00  
    systolic       relaxation  fasting.blood.sugar  triglyceride  
 Min.   : 71.0   Min.   : 40   Min.   : 46.00      Min.   :  8.0  
 1st Qu.:112.0   1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0  
 Median :120.0   Median : 76   Median : 96.00      Median :108.0  
 Mean   :121.5   Mean   : 76   Mean   : 99.31      Mean   :126.7  
 3rd Qu.:130.0   3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0  
 Max.   :240.0   Max.   :146   Max.   :505.00      Max.   :999.0  
      HDL           hemoglobin    serum.c

### Split dataset into training and test set

In [26]:
n <- length(model.data[,1])
index <- sample(1:n,n,replace=FALSE)
model.data <- model.data[index,]
seventyPercentLimit <- round(length(model.data[,1]) * 0.7,0)
model.data.train <- model.data[1:seventyPercentLimit,]
model.data.test <- model.data[(seventyPercentLimit+1):n,]

### Compute model

In [27]:
glm_model <- glm(smoking ~ ., 
                 data=model.data.train,
                 binomial(link = "logit"))

In [28]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = model.data.train)

Coefficients:
        (Intercept)              genderM                  age  
         -7.2771988            2.9215834           -0.0009104  
         height.cm.           weight.kg.             systolic  
          0.0236944           -0.0114897           -0.0154107  
         relaxation  fasting.blood.sugar         triglyceride  
          0.0096962            0.0034282            0.0044089  
                HDL           hemoglobin     serum.creatinine  
          0.0013781            0.1275312           -0.8640067  
                ALT                  Gtp       dental.caries1  
         -0.0058129            0.0071391            0.3045297  
            tartarY  
          0.3198174  

Degrees of Freedom: 38983 Total (i.e. Null);  38968 Residual
Null Deviance:	    51350 
Residual Deviance: 37060 	AIC: 37090

#### Calculate prediction error

In [29]:
X.test <- model.data.test[,selected_features]
X.test <- subset(X.test,select=-smoking)

In [30]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [31]:
y <- model.data.test[,"smoking"]

##### Confusion Matrix

In [32]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [33]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8199,1778
Prognose: Smoker,2454,4277


##### True-Positive-Rate & True-Negative-Rate

In [34]:
A[2,2]/(A[1,2]+A[2,2])   # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1])   # True-Negative-Rate

## Without feature selection

In [12]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [13]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [14]:
data[,"tartar"] <- ifelse(data[,"tartar"] == 'Y', 1, 0)
data[,"tartar"] <- as.factor(data[,"tartar"])

data[,"smoking"] <- as.factor(data[,"smoking"])
data[,"dental.caries"] <- as.factor(data[,"dental.caries"])

In [15]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Split dataset into training and test set

In [16]:
n <- length(data[,1])
index <- sample(1:n,n,replace=FALSE)
data <- data[index,]
seventyPercentLimit <- round(length(data[,1]) * 0.7,0)
data.train <- data[1:seventyPercentLimit,]
data.test <- data[(seventyPercentLimit+1):n,]

### Compute model

In [17]:
glm_model <- glm(smoking ~ ., 
                 data=data.train,
                 binomial(link = "logit"))

In [18]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = data.train)

Coefficients:
        (Intercept)              genderM                  age  
         -6.7280030            2.9426096           -0.0009741  
         height.cm.           weight.kg.            waist.cm.  
          0.0232893           -0.0094401           -0.0010454  
     eyesight.left.      eyesight.right.        hearing.left.  
         -0.0230300           -0.0294431           -0.2497740  
     hearing.right.             systolic           relaxation  
          0.0149171           -0.0151060            0.0095278  
fasting.blood.sugar          Cholesterol         triglyceride  
          0.0040770           -0.0022505            0.0047754  
                HDL                  LDL           hemoglobin  
          0.0016788           -0.0003993            0.1387619  
      Urine.protein     serum.creatinine                  AST  
         -0.0090808           -0.9467679           -0.00138

#### Calculate prediction error

In [19]:
names(data.train)

In [20]:
X.test <- data.test[,c('gender',
                        'age',
                        'height.cm.',
                        'weight.kg.',
                        'waist.cm.',
                        'eyesight.left.',
                        'eyesight.right.',
                        'hearing.left.',
                        'hearing.right.',
                        'systolic',
                        'relaxation',
                        'fasting.blood.sugar',
                        'Cholesterol',
                        'triglyceride',
                        'HDL',
                        'LDL',
                        'hemoglobin',
                        'Urine.protein',
                        'serum.creatinine',
                        'AST',
                        'ALT',
                        'Gtp',
                        'dental.caries',
                        'tartar',
                        'smoking')]

In [21]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [22]:
y <- data.test[,"smoking"]

##### Confusion Matrix

In [23]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [24]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8154,1855
Prognose: Smoker,2454,4245


##### True-Positive-Rate & True-Negative-Rate

In [25]:
A[2,2]/(A[1,2]+A[2,2]) # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1]) # True-Negative-Rate

## With LASSO feature selection

In [26]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [27]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [28]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Sort data randomly

In [29]:
n <- length(data[,1])
Index <- sample(seq(1,n,1), replace=FALSE)
data <- data[Index,]
rownames(data) <- 1:n

### LASSO feature selection

In [30]:
library(glmnet)

Loading required package: Matrix

Loaded glmnet 4.1-4



In [31]:
data.lasso <- subset(data, select=-gender)
data.lasso <- subset(data.lasso, select=-tartar)
X <- model.matrix(smoking ~ ., data.lasso)
X <- X[,-1]

In [32]:
y <- data[,"smoking"]

In [33]:
m <- length(X[1,])
total.numbers <- rep(0,m)
RUNS <- 100

for (run in 1:RUNS) {
    model.lasso <- cv.glmnet(X,y)
    beta <- coef(model.lasso,s="lambda.1se")[-1,1]
    total.numbers <- total.numbers + ifelse(beta != 0, 1, 0)
}

In [34]:
total.numbers <- as.matrix(total.numbers)
total.numbers

0,1
age,0
height.cm.,100
weight.kg.,94
waist.cm.,0
eyesight.left.,0
eyesight.right.,0
hearing.left.,0
hearing.right.,0
systolic,100
relaxation,0


In [35]:
temp <- total.numbers[total.numbers[,1] >= 50, 1]
selection <- names(temp)

In [36]:
selection

In [37]:
model.data <- data[,c("smoking","gender","tartar",selection)]

In [38]:
model.data[,"smoking"] <- as.factor(model.data[,"smoking"])

In [39]:
summary(model.data)

 smoking   gender    tartar      height.cm.      weight.kg.        systolic    
 0:35237   F:20291   N:24752   Min.   :130.0   Min.   : 30.00   Min.   : 71.0  
 1:20455   M:35401   Y:30940   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.:112.0  
                               Median :165.0   Median : 65.00   Median :120.0  
                               Mean   :164.6   Mean   : 65.86   Mean   :121.5  
                               3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.:130.0  
                               Max.   :190.0   Max.   :135.00   Max.   :240.0  
 fasting.blood.sugar  Cholesterol     triglyceride        HDL        
 Min.   : 46.00      Min.   : 55.0   Min.   :  8.0   Min.   :  4.00  
 1st Qu.: 89.00      1st Qu.:172.0   1st Qu.: 74.0   1st Qu.: 47.00  
 Median : 96.00      Median :195.0   Median :108.0   Median : 55.00  
 Mean   : 99.31      Mean   :196.9   Mean   :126.7   Mean   : 57.29  
 3rd Qu.:104.00      3rd Qu.:220.0   3rd Qu.:160.0   3rd Qu.: 66.00  
 Max.   :505.00     

### Split dataset into training and test set

In [40]:
n <- length(model.data[,1])
index <- sample(1:n,n,replace=FALSE)
model.data <- model.data[index,]
seventyPercentLimit <- round(length(model.data[,1]) * 0.7,0)
model.data.train <- model.data[1:seventyPercentLimit,]
model.data.test <- model.data[(seventyPercentLimit+1):n,]

### Compute model

In [41]:
glm_model <- glm(smoking ~ ., 
                 data=model.data.train,
                 binomial(link = "logit"))

In [42]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = model.data.train)

Coefficients:
        (Intercept)              genderM              tartarY  
         -7.2187074            2.9144997            0.3377710  
         height.cm.           weight.kg.             systolic  
          0.0235753           -0.0120319           -0.0094111  
fasting.blood.sugar          Cholesterol         triglyceride  
          0.0033828           -0.0023110            0.0048387  
                HDL           hemoglobin     serum.creatinine  
          0.0016316            0.1470883           -0.8350024  
                AST                  ALT                  Gtp  
         -0.0002087           -0.0057439            0.0075258  
      dental.caries  
          0.2920498  

Degrees of Freedom: 38983 Total (i.e. Null);  38968 Residual
Null Deviance:	    51230 
Residual Deviance: 36800 	AIC: 36830

#### Calculate prediction error

In [43]:
X.test <- model.data.test[,c("gender","tartar",selection)]

In [44]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [45]:
y <- model.data.test[,"smoking"]

##### Confusion Matrix

In [46]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [47]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8200,1929
Prognose: Smoker,2340,4239


##### True-Positive-Rate & True-Negative-Rate

In [48]:
A[2,2]/(A[1,2]+A[2,2])   # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1])   # True-Negative-Rate