# Logistic Regression

## With feature selection

### Read and preprocess data

In [17]:
selected_features <- c('smoking',
                       'height.cm.',
                       'weight.kg.',
                       'systolic',
                       'relaxation',
                       'fasting.blood.sugar',
                       'triglyceride',
                       'HDL',
                       'hemoglobin',
                       'serum.creatinine',
                       'ALT',
                       'Gtp')

In [18]:
setwd('/home/steinerj/Documents/ai-b/semester-4/machine-learning/ai-b-4-ml-project/data')

In [19]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [20]:
model.data <- data[,selected_features]

In [21]:
summary(model.data)

    smoking         height.cm.      weight.kg.        systolic    
 Min.   :0.0000   Min.   :130.0   Min.   : 30.00   Min.   : 71.0  
 1st Qu.:0.0000   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.:112.0  
 Median :0.0000   Median :165.0   Median : 65.00   Median :120.0  
 Mean   :0.3673   Mean   :164.6   Mean   : 65.86   Mean   :121.5  
 3rd Qu.:1.0000   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.:130.0  
 Max.   :1.0000   Max.   :190.0   Max.   :135.00   Max.   :240.0  
   relaxation  fasting.blood.sugar  triglyceride        HDL        
 Min.   : 40   Min.   : 46.00      Min.   :  8.0   Min.   :  4.00  
 1st Qu.: 70   1st Qu.: 89.00      1st Qu.: 74.0   1st Qu.: 47.00  
 Median : 76   Median : 96.00      Median :108.0   Median : 55.00  
 Mean   : 76   Mean   : 99.31      Mean   :126.7   Mean   : 57.29  
 3rd Qu.: 82   3rd Qu.:104.00      3rd Qu.:160.0   3rd Qu.: 66.00  
 Max.   :146   Max.   :505.00      Max.   :999.0   Max.   :618.00  
   hemoglobin    serum.creatinine       ALT            

In [22]:
model.data[,"smoking"] <- as.factor(model.data[,"smoking"])

In [23]:
summary(model.data)

 smoking     height.cm.      weight.kg.        systolic       relaxation 
 0:35237   Min.   :130.0   Min.   : 30.00   Min.   : 71.0   Min.   : 40  
 1:20455   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.:112.0   1st Qu.: 70  
           Median :165.0   Median : 65.00   Median :120.0   Median : 76  
           Mean   :164.6   Mean   : 65.86   Mean   :121.5   Mean   : 76  
           3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.:130.0   3rd Qu.: 82  
           Max.   :190.0   Max.   :135.00   Max.   :240.0   Max.   :146  
 fasting.blood.sugar  triglyceride        HDL           hemoglobin   
 Min.   : 46.00      Min.   :  8.0   Min.   :  4.00   Min.   : 4.90  
 1st Qu.: 89.00      1st Qu.: 74.0   1st Qu.: 47.00   1st Qu.:13.60  
 Median : 96.00      Median :108.0   Median : 55.00   Median :14.80  
 Mean   : 99.31      Mean   :126.7   Mean   : 57.29   Mean   :14.62  
 3rd Qu.:104.00      3rd Qu.:160.0   3rd Qu.: 66.00   3rd Qu.:15.80  
 Max.   :505.00      Max.   :999.0   Max.   :618.00   Max.   :

### Split dataset into training and test set

In [24]:
n <- length(model.data[,1])
index <- sample(1:n,n,replace=FALSE)
model.data <- model.data[index,]
seventyPercentLimit <- round(length(model.data[,1]) * 0.7,0)
model.data.train <- model.data[1:seventyPercentLimit,]
model.data.test <- model.data[(seventyPercentLimit+1):n,]

### Compute model

In [25]:
glm_model <- glm(smoking ~ ., 
                 data=model.data.train,
                 binomial(link = "logit"))

In [26]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = model.data.train)

Coefficients:
        (Intercept)           height.cm.           weight.kg.  
         -19.703326             0.081956            -0.017064  
           systolic           relaxation  fasting.blood.sugar  
          -0.008899             0.005795             0.004312  
       triglyceride                  HDL           hemoglobin  
           0.003945            -0.007901             0.433745  
   serum.creatinine                  ALT                  Gtp  
           0.245354            -0.007637             0.009412  

Degrees of Freedom: 38983 Total (i.e. Null);  38972 Residual
Null Deviance:	    51240 
Residual Deviance: 40050 	AIC: 40070

#### Calculate prediction error

In [27]:
X.test <- model.data.test[,selected_features]

In [28]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [29]:
y <- model.data.test[,"smoking"]

##### Confusion Matrix

In [30]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [31]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8526,2721
Prognose: Smoker,2022,3439


##### True-Positive-Rate & True-Negative-Rate

In [32]:
A[2,2]/(A[1,2]+A[2,2])   # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1])   # True-Negative-Rate

## Without feature selection

In [144]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [145]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [146]:
data[,"smoking"] <- as.factor(data[,"smoking"])

In [147]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Split dataset into training and test set

In [148]:
n <- length(data[,1])
index <- sample(1:n,n,replace=FALSE)
data <- data[index,]
seventyPercentLimit <- round(length(data[,1]) * 0.7,0)
data.train <- data[1:seventyPercentLimit,]
data.test <- data[(seventyPercentLimit+1):n,]

### Compute model

In [149]:
glm_model <- glm(smoking ~ ., 
                 data=data.train,
                 binomial(link = "logit"))

In [150]:
glm_model


Call:  glm(formula = smoking ~ ., family = binomial(link = "logit"), 
    data = data.train)

Coefficients:
        (Intercept)              genderM                  age  
         -6.4478823            2.9098498           -0.0001798  
         height.cm.           weight.kg.            waist.cm.  
          0.0213284           -0.0092073           -0.0025535  
     eyesight.left.      eyesight.right.        hearing.left.  
         -0.0019103           -0.0251815           -0.2967960  
     hearing.right.             systolic           relaxation  
          0.0226858           -0.0154884            0.0098499  
fasting.blood.sugar          Cholesterol         triglyceride  
          0.0036101           -0.0019763            0.0044446  
                HDL                  LDL           hemoglobin  
          0.0017690           -0.0003066            0.1464957  
      Urine.protein     serum.creatinine                  AST  
          0.0123172           -0.8665403           -0.00225

#### Calculate prediction error

In [151]:
names(data.train)

In [152]:
X.test <- data.test[,c('gender',
                        'age',
                        'height.cm.',
                        'weight.kg.',
                        'waist.cm.',
                        'eyesight.left.',
                        'eyesight.right.',
                        'hearing.left.',
                        'hearing.right.',
                        'systolic',
                        'relaxation',
                        'fasting.blood.sugar',
                        'Cholesterol',
                        'triglyceride',
                        'HDL',
                        'LDL',
                        'hemoglobin',
                        'Urine.protein',
                        'serum.creatinine',
                        'AST',
                        'ALT',
                        'Gtp',
                        'dental.caries',
                        'tartar',
                        'smoking')]

In [153]:
z <- predict(glm_model,X.test)
predictions <- round(exp(z)/(1+exp(z)))

In [154]:
y <- data.test[,"smoking"]

##### Confusion Matrix

In [155]:
A <- matrix(0,ncol=2,nrow=2)

colnames(A) <- c("Real: No smoker", " Real: Smoker")
rownames(A) <- c("Prognose: No smoker", "Prognose: Smoker") 

A[1,1] <- sum(ifelse(y == 0 & predictions == 0, 1,0))
A[1,2] <- sum(ifelse(y == 1 & predictions == 0, 1,0))
A[2,1] <- sum(ifelse(y == 0 & predictions == 1, 1,0))
A[2,2] <- sum(ifelse(y == 1 & predictions == 1, 1,0))

In [156]:
A

Unnamed: 0,Real: No smoker,Real: Smoker
Prognose: No smoker,8161,1841
Prognose: Smoker,2399,4307


##### True-Positive-Rate & True-Negative-Rate

In [157]:
A[2,2]/(A[1,2]+A[2,2]) # True-Positive-Rate
A[1,1]/(A[1,1]+A[2,1]) # True-Negative-Rate

## With LASSO feature selection

In [170]:
data <- read.csv("smoking.csv",header=TRUE,sep=",",fill=TRUE,stringsAsFactors=TRUE)

In [171]:
data <- subset(data, select=-ID)
data <- subset(data, select=-oral)

In [172]:
data[,"smoking"] <- as.factor(data[,"smoking"])

In [173]:
summary(data)

 gender         age          height.cm.      weight.kg.       waist.cm.     
 F:20291   Min.   :20.00   Min.   :130.0   Min.   : 30.00   Min.   : 51.00  
 M:35401   1st Qu.:40.00   1st Qu.:160.0   1st Qu.: 55.00   1st Qu.: 76.00  
           Median :40.00   Median :165.0   Median : 65.00   Median : 82.00  
           Mean   :44.18   Mean   :164.6   Mean   : 65.86   Mean   : 82.05  
           3rd Qu.:55.00   3rd Qu.:170.0   3rd Qu.: 75.00   3rd Qu.: 88.00  
           Max.   :85.00   Max.   :190.0   Max.   :135.00   Max.   :129.00  
 eyesight.left.  eyesight.right. hearing.left.   hearing.right. 
 Min.   :0.100   Min.   :0.100   Min.   :1.000   Min.   :1.000  
 1st Qu.:0.800   1st Qu.:0.800   1st Qu.:1.000   1st Qu.:1.000  
 Median :1.000   Median :1.000   Median :1.000   Median :1.000  
 Mean   :1.013   Mean   :1.007   Mean   :1.026   Mean   :1.026  
 3rd Qu.:1.200   3rd Qu.:1.200   3rd Qu.:1.000   3rd Qu.:1.000  
 Max.   :9.900   Max.   :9.900   Max.   :2.000   Max.   :2.000  
    sy

### Sort data randomly

In [174]:
n <- length(data[,1])
Index <- sample(seq(1,n,1), replace=FALSE)
data <- data[Index,]
rownames(data) <- 1:n

### LASSO feature selection

In [175]:
library(glmnet)

In [176]:
X <- model.matrix(smoking ~ ., data)
X <- X[,-1]

In [177]:
y <- data[,"smoking"]

In [178]:
m <- length(X[1,])
total.numbers <- rep(0,m)
RUNS <- 100

for (run in 1:RUNS) {
    model.lasso <- cv.glmnet(X,y)
    beta <- coef(model.lasso,s="lambda.1se")[-1,1]
    total.numbers <- total.numbers + ifelse(beta != 0, 1, 0)
}

ERROR: Error in storage.mode(y) <- "double": invalid to change the storage mode of a factor


In [None]:
total.numbers <- as.matrix(total.numbers)
total.numbers

In [None]:
temp <- total.numbers[total.numbers[,1] >= 50, 1]
selection <- names(temp)