# Logistic Regression

## With feature selection

### Read and preprocess data

In [17]:
selected_features <- c('smoking',
                       'gender',
                       'age',
                       'height.cm.',
                       'weight.kg.',
                       'systolic',
                       'relaxation',
                       'fasting.blood.sugar',
                       'triglyceride',
                       'HDL',
                       'hemoglobin',
                       'serum.creatinine',
                       'ALT',
                       'Gtp')

In [18]:
setwd('/home/steinerj/Documents/ai-b/semester-4/machine-learning/ai-b-4-ml-project/data')

In [19]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [20]:
model.data <- data[,selected_features]

In [21]:
summary(model.data)

    smoking       gender         age          height.cm.      weight.kg.    
 Min.   :0.0000   F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00  
 1st Qu.:0.0000   M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00  
 Median :0.0000             Median :40.00   Median :165.0   Median : 65.00  
 Mean   :0.3673             Mean   :44.18   Mean   :164.6   Mean   : 65.86  
 3rd Qu.:1.0000             3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00  
 Max.   :1.0000             Max.   :85.00   Max.   :190.0   Max.   :135.00  
    systolic       relaxation  fasting.blood.sugar  triglyceride  
 Min.   : 71.0   Min.   : 40   Min.   : 46.00      Min.   :  8.0  
 1st Qu.:112.0   1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0  
 Median :120.0   Median : 76   Median : 96.00      Median :108.0  
 Mean   :121.5   Mean   : 76   Mean   : 99.31      Mean   :126.7  
 3rd Qu.:130.0   3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0  
 Max.   :240.0   Max.   :146   Max.   :505.00      Max.   :

In [22]:
model.data[,"smoking"] <- as.factor(model.data[,"smoking"])

In [23]:
summary(model.data)

 smoking   gender         age          height.cm.      weight.kg.    
 0:35237   F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00  
 1:20455   M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00  
                     Median :40.00   Median :165.0   Median : 65.00  
                     Mean   :44.18   Mean   :164.6   Mean   : 65.86  
                     3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00  
                     Max.   :85.00   Max.   :190.0   Max.   :135.00  
    systolic       relaxation  fasting.blood.sugar  triglyceride  
 Min.   : 71.0   Min.   : 40   Min.   : 46.00      Min.   :  8.0  
 1st Qu.:112.0   1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0  
 Median :120.0   Median : 76   Median : 96.00      Median :108.0  
 Mean   :121.5   Mean   : 76   Mean   : 99.31      Mean   :126.7  
 3rd Qu.:130.0   3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0  
 Max.   :240.0   Max.   :146   Max.   :505.00      Max.   :999.0  
      HDL           hemoglobin    serum.c

### Split dataset into training and test set

In [24]:
n <- length(model.data[,1])
index <- sample(1:n,n,replace=FALSE)
model.data <- model.data[index,]
seventyPercentLimit <- round(length(model.data[,1]) * 0.7,0)
model.data.train <- model.data[1:seventyPercentLimit,]
model.data.test <- model.data[(seventyPercentLimit+1):n,]

### Compute model

In [25]:
glm_model <- glm(smoking ~ ., 
                 data=model.data.train,
                 binomial(link = "logit"))

In [26]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = model.data.train)

Coefficients:
        (Intercept)              genderM                  age  
          -6.776029             2.920207            -0.003474  
         height.cm.           weight.kg.             systolic  
           0.021119            -0.010985            -0.014637  
         relaxation  fasting.blood.sugar         triglyceride  
           0.010213             0.003046             0.004277  
                HDL           hemoglobin     serum.creatinine  
          -0.000323             0.138625            -0.793938  
                ALT                  Gtp  
          -0.005693             0.007283  

Degrees of Freedom: 38983 Total (i.e. Null);  38970 Residual
Null Deviance:	    51070 
Residual Deviance: 37100 	AIC: 37130

#### Calculate prediction error

In [27]:
X.test <- model.data.test[,selected_features]
X.test <- subset(X.test,select=-smoking)

In [28]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [29]:
y <- model.data.test[,"smoking"]

##### Confusion Matrix

In [30]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [31]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8041,1990
Prognose: Smoker,2359,4318


##### True-Positive-Rate & True-Negative-Rate

In [32]:
A[2,2]/(A[1,2]+A[2,2])   # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1])   # True-Negative-Rate

## Without feature selection

In [33]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [34]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [35]:
data[,"smoking"] <- as.factor(data[,"smoking"])

In [36]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Split dataset into training and test set

In [37]:
n <- length(data[,1])
index <- sample(1:n,n,replace=FALSE)
data <- data[index,]
seventyPercentLimit <- round(length(data[,1]) * 0.7,0)
data.train <- data[1:seventyPercentLimit,]
data.test <- data[(seventyPercentLimit+1):n,]

### Compute model

In [38]:
glm_model <- glm(smoking ~ ., 
                 data=data.train,
                 binomial(link = "logit"))

In [39]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = data.train)

Coefficients:
        (Intercept)              genderM                  age  
         -6.5739825            2.9429108           -0.0007955  
         height.cm.           weight.kg.            waist.cm.  
          0.0224253           -0.0115327           -0.0003628  
     eyesight.left.      eyesight.right.        hearing.left.  
         -0.0453818           -0.0126596           -0.3088257  
     hearing.right.             systolic           relaxation  
          0.0597497           -0.0143089            0.0096465  
fasting.blood.sugar          Cholesterol         triglyceride  
          0.0039097           -0.0019718            0.0045874  
                HDL                  LDL           hemoglobin  
          0.0003954           -0.0008070            0.1374217  
      Urine.protein     serum.creatinine                  AST  
          0.0046301           -0.8634472           -0.00071

#### Calculate prediction error

In [40]:
names(data.train)

In [41]:
X.test <- data.test[,c('gender',
                        'age',
                        'height.cm.',
                        'weight.kg.',
                        'waist.cm.',
                        'eyesight.left.',
                        'eyesight.right.',
                        'hearing.left.',
                        'hearing.right.',
                        'systolic',
                        'relaxation',
                        'fasting.blood.sugar',
                        'Cholesterol',
                        'triglyceride',
                        'HDL',
                        'LDL',
                        'hemoglobin',
                        'Urine.protein',
                        'serum.creatinine',
                        'AST',
                        'ALT',
                        'Gtp',
                        'dental.caries',
                        'tartar',
                        'smoking')]

In [42]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [43]:
y <- data.test[,"smoking"]

##### Confusion Matrix

In [44]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [45]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8127,1755
Prognose: Smoker,2542,4284


##### True-Positive-Rate & True-Negative-Rate

In [46]:
A[2,2]/(A[1,2]+A[2,2]) # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1]) # True-Negative-Rate

## With LASSO feature selection

In [47]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [48]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [49]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Sort data randomly

In [50]:
n <- length(data[,1])
Index <- sample(seq(1,n,1), replace=FALSE)
data <- data[Index,]
rownames(data) <- 1:n

### LASSO feature selection

In [51]:
library(glmnet)

Loading required package: Matrix

Loaded glmnet 4.1-4



In [52]:
data.lasso <- subset(data, select=-gender)
data.lasso <- subset(data.lasso, select=-tartar)
X <- model.matrix(smoking ~ ., data.lasso)
X <- X[,-1]

In [53]:
y <- data[,"smoking"]

In [54]:
m <- length(X[1,])
total.numbers <- rep(0,m)
RUNS <- 100

for (run in 1:RUNS) {
    model.lasso <- cv.glmnet(X,y)
    beta <- coef(model.lasso,s="lambda.1se")[-1,1]
    total.numbers <- total.numbers + ifelse(beta != 0, 1, 0)
}

In [55]:
total.numbers <- as.matrix(total.numbers)
total.numbers

0,1
age,0
height.cm.,100
weight.kg.,95
waist.cm.,0
eyesight.left.,0
eyesight.right.,0
hearing.left.,0
hearing.right.,0
systolic,100
relaxation,0


In [56]:
temp <- total.numbers[total.numbers[,1] >= 50, 1]
selection <- names(temp)

In [57]:
selection

In [58]:
model.data <- data[,c("smoking","gender","tartar",selection)]

In [59]:
model.data[,"smoking"] <- as.factor(model.data[,"smoking"])

In [60]:
summary(model.data)

 smoking   gender    tartar      height.cm.      weight.kg.        systolic    
 0:35237   F:20291   N:24752   Min.   :130.0   Min.   : 30.00   Min.   : 71.0  
 1:20455   M:35401   Y:30940   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.:112.0  
                               Median :165.0   Median : 65.00   Median :120.0  
                               Mean   :164.6   Mean   : 65.86   Mean   :121.5  
                               3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.:130.0  
                               Max.   :190.0   Max.   :135.00   Max.   :240.0  
 fasting.blood.sugar  Cholesterol     triglyceride        HDL        
 Min.   : 46.00      Min.   : 55.0   Min.   :  8.0   Min.   :  4.00  
 1st Qu.: 89.00      1st Qu.:172.0   1st Qu.: 74.0   1st Qu.: 47.00  
 Median : 96.00      Median :195.0   Median :108.0   Median : 55.00  
 Mean   : 99.31      Mean   :196.9   Mean   :126.7   Mean   : 57.29  
 3rd Qu.:104.00      3rd Qu.:220.0   3rd Qu.:160.0   3rd Qu.: 66.00  
 Max.   :505.00     

### Split dataset into training and test set

In [61]:
n <- length(model.data[,1])
index <- sample(1:n,n,replace=FALSE)
model.data <- model.data[index,]
seventyPercentLimit <- round(length(model.data[,1]) * 0.7,0)
model.data.train <- model.data[1:seventyPercentLimit,]
model.data.test <- model.data[(seventyPercentLimit+1):n,]

### Compute model

In [62]:
glm_model <- glm(smoking ~ ., 
                 data=model.data.train,
                 binomial(link = "logit"))

In [63]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = model.data.train)

Coefficients:
        (Intercept)              genderM              tartarY  
          -7.182417             2.867847             0.350580  
         height.cm.           weight.kg.             systolic  
           0.023704            -0.011432            -0.009337  
fasting.blood.sugar          Cholesterol         triglyceride  
           0.003814            -0.002642             0.004712  
                HDL           hemoglobin     serum.creatinine  
           0.001664             0.147427            -0.847932  
                AST                  ALT                  Gtp  
          -0.001158            -0.006246             0.007839  
      dental.caries  
           0.334286  

Degrees of Freedom: 38983 Total (i.e. Null);  38968 Residual
Null Deviance:	    51280 
Residual Deviance: 36850 	AIC: 36880

#### Calculate prediction error

In [64]:
X.test <- model.data.test[,c("gender","tartar",selection)]

In [65]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [66]:
y <- model.data.test[,"smoking"]

##### Confusion Matrix

In [67]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [68]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8137,1812
Prognose: Smoker,2451,4308


##### True-Positive-Rate & True-Negative-Rate

In [69]:
A[2,2]/(A[1,2]+A[2,2])   # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1])   # True-Negative-Rate