# Model Selection

In [17]:

# Read the seeds measurements directly from the UCI archive. The file is read
# without a header row and with whitespace as the field separator (sep = ""),
# so the eight column names are supplied explicitly.
seeds.data <- read.csv(
  url("https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt"),
  header = FALSE,
  sep = "",
  col.names = c(
    "area",
    "perimeter",
    "compactness",
    "length_of_kernel",
    "width_of_kernel",
    "asymmetry_coefficient",
    "length_of_kernel_groove",
    "target"
  )
)
# Show the loaded data frame.
seeds.data

area,perimeter,compactness,length_of_kernel,width_of_kernel,asymmetry_coefficient,length_of_kernel_groove,target
15.26,14.84,0.8710,5.763,3.312,2.2210,5.220,1
14.88,14.57,0.8811,5.554,3.333,1.0180,4.956,1
14.29,14.09,0.9050,5.291,3.337,2.6990,4.825,1
13.84,13.94,0.8955,5.324,3.379,2.2590,4.805,1
16.14,14.99,0.9034,5.658,3.562,1.3550,5.175,1
14.38,14.21,0.8951,5.386,3.312,2.4620,4.956,1
14.69,14.49,0.8799,5.563,3.259,3.5860,5.219,1
14.11,14.10,0.8911,5.420,3.302,2.7000,5.000,1
16.63,15.46,0.8747,6.053,3.465,2.0400,5.877,1
16.44,15.25,0.8880,5.884,3.505,1.9690,5.533,1


In [18]:
# Fit the full model: `target` against an intercept plus all seven
# measurements. glm() is called without `family`, so its default is used.
# NOTE(review): `target` looks like a class label; confirm that modelling it
# as a numeric response is intended.
seeds.glm <- glm(
  "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove",
  data = seeds.data
)
# Coefficient table and fit statistics for the full model.
summary(seeds.glm)


Call:
glm(formula = "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove", 
    data = seeds.data)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.30568  -0.24785  -0.01632   0.24198   1.22362  

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)              53.44356    7.44511   7.178 1.32e-11 ***
area                      1.48907    0.26133   5.698 4.25e-08 ***
perimeter                -3.22038    0.53815  -5.984 9.77e-09 ***
compactness             -30.67744    5.24108  -5.853 1.92e-08 ***
length_of_kernel         -2.31510    0.45444  -5.094 8.01e-07 ***
width_of_kernel           0.24598    0.78571   0.313    0.755    
asymmetry_coefficient     0.11489    0.02257   5.089 8.19e-07 ***
length_of_kernel_groove   2.19260    0.20358  10.770  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion paramete

In [19]:
# Log-likelihood of the full model (displayed as the cell's first value).
logLik(seeds.glm)
# BIC of the full model, formatted as a labelled string for display.
paste0("BIC = ", BIC(seeds.glm))

'log Lik.' -109.228 (df=9)

## Models

### BIC for one feature removed (Greedy algo)

In [20]:

# Candidate formulas, kept as strings and padded so the dropped term shows up
# as a gap in the column layout.
#   model_1  : full model (all seven predictors)
#   model_2a : drops length_of_kernel_groove
#   model_2b : drops asymmetry_coefficient
#   model_2c : drops width_of_kernel
#   model_2d : drops length_of_kernel
#   model_2e : drops compactness
#   model_2f : drops perimeter
#   model_2g : drops area
# Where a term is blanked out, the leftover "+ +" parses as a unary plus on
# the next term, so the formula is still valid.
model_1  <- "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_2a <- "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient                          "
model_2b <- "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel +                         length_of_kernel_groove"
model_2c <- "target ~ 1 + area + perimeter + compactness + length_of_kernel +                 + asymmetry_coefficient + length_of_kernel_groove"
model_2d <- "target ~ 1 + area + perimeter + compactness +                  + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_2e <- "target ~ 1 + area + perimeter +             + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_2f <- "target ~ 1 + area +           + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_2g <- "target ~ 1 +      + perimeter + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"


In [21]:
# Fit one GLM per candidate formula (full model plus each single-feature
# removal), all on the same data and with glm()'s default family.
seeds.glm.1  <- glm(model_1,  data = seeds.data)
seeds.glm.2a <- glm(model_2a, data = seeds.data)
seeds.glm.2b <- glm(model_2b, data = seeds.data)
seeds.glm.2c <- glm(model_2c, data = seeds.data)
seeds.glm.2d <- glm(model_2d, data = seeds.data)
seeds.glm.2e <- glm(model_2e, data = seeds.data)
seeds.glm.2f <- glm(model_2f, data = seeds.data)
seeds.glm.2g <- glm(model_2g, data = seeds.data)


In [22]:
# BIC of the full model and of each single-feature-removed fit.
m1  <- BIC(seeds.glm.1)
m2a <- BIC(seeds.glm.2a)
m2b <- BIC(seeds.glm.2b)
m2c <- BIC(seeds.glm.2c)
m2d <- BIC(seeds.glm.2d)
m2e <- BIC(seeds.glm.2e)
m2f <- BIC(seeds.glm.2f)
m2g <- BIC(seeds.glm.2g)

In [23]:
# Gather the BIC of the full model and of every single-feature-removed model
# into one long table with a row per model.
# library() replaces require(): if reshape2 is not installed, the cell stops
# here with a clear error instead of returning FALSE and failing at melt().
library(reshape2)
# The one-row data frame has one column per model. melt() is called without id
# variables, so every column becomes a row holding the column name and its
# value.
seeds.df <- melt(data.frame(m1, m2a, m2b, m2c, m2d, m2e, m2f, m2g))
# Rename melt()'s two output columns to describe their contents.
colnames(seeds.df) <- c("model_id", "BIC")
seeds.df

No id variables; using all as measure variables


model_id,BIC
m1,266.5799
m2a,356.5257
m2b,286.5697
m2c,261.3347
m2d,286.6159
m2e,294.1331
m2f,295.5059
m2g,292.5325


### BIC for two features removed

In [24]:
# Formulas with two predictors dropped. Only pairs that sit next to each other
# in the full formula are tried, not every two-feature combination.
#   model_3a : drops asymmetry_coefficient, length_of_kernel_groove
#   model_3b : drops width_of_kernel, asymmetry_coefficient
#   model_3c : drops length_of_kernel, width_of_kernel
#   model_3d : drops compactness, length_of_kernel
#   model_3e : drops perimeter, compactness
#   model_3f : drops area, perimeter
model_3a <- "target ~ 1 + area + perimeter + compactness + length_of_kernel + width_of_kernel                                                  "
model_3b <- "target ~ 1 + area + perimeter + compactness + length_of_kernel                                           + length_of_kernel_groove"
model_3c <- "target ~ 1 + area + perimeter + compactness +                                    + asymmetry_coefficient + length_of_kernel_groove"
model_3d <- "target ~ 1 + area + perimeter +                                + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_3e <- "target ~ 1 + area +                         + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_3f <- "target ~ 1 +                  + compactness + length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"


In [25]:
# Fit one GLM per two-features-removed formula on the same data.
seeds.glm.3a <- glm(model_3a, data = seeds.data)
seeds.glm.3b <- glm(model_3b, data = seeds.data)
seeds.glm.3c <- glm(model_3c, data = seeds.data)
seeds.glm.3d <- glm(model_3d, data = seeds.data)
seeds.glm.3e <- glm(model_3e, data = seeds.data)
seeds.glm.3f <- glm(model_3f, data = seeds.data)

In [26]:
# BIC of each two-features-removed fit.
m3a <- BIC(seeds.glm.3a)
m3b <- BIC(seeds.glm.3b)
m3c <- BIC(seeds.glm.3c)
m3d <- BIC(seeds.glm.3d)
m3e <- BIC(seeds.glm.3e)
m3f <- BIC(seeds.glm.3f)


In [27]:
# Reshape the two-features-removed BIC values into the same two-column layout
# as seeds.df, then append them to the running results table.
# melt() must already be available (reshape2 is loaded in an earlier cell).
temp.df <- melt(data.frame(m3a, m3b, m3c, m3d, m3e, m3f))
names(temp.df) <- c("model_id", "BIC")
seeds.df <- rbind(seeds.df, temp.df)

No id variables; using all as measure variables


### Removing 3 Features

In [28]:
# Formulas with three predictors dropped. As with the two-feature case, only
# runs of three neighbouring terms in the full formula are removed.
#   model_4a : drops width_of_kernel, asymmetry_coefficient, length_of_kernel_groove
#   model_4b : drops length_of_kernel, width_of_kernel, asymmetry_coefficient
#   model_4c : drops compactness, length_of_kernel, width_of_kernel
#   model_4d : drops perimeter, compactness, length_of_kernel
#   model_4e : drops area, perimeter, compactness
model_4a <- "target ~ 1 + area + perimeter + compactness + length_of_kernel                                                                    "
model_4b <- "target ~ 1 + area + perimeter + compactness +                                                              length_of_kernel_groove"
model_4c <- "target ~ 1 + area + perimeter +                                                    asymmetry_coefficient + length_of_kernel_groove"
model_4d <- "target ~ 1 + area +                                              width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"
model_4e <- "target ~ 1 +                                  length_of_kernel + width_of_kernel + asymmetry_coefficient + length_of_kernel_groove"


In [29]:
# Fit one GLM per three-features-removed formula on the same data.
seeds.glm.4a <- glm(model_4a, data = seeds.data)
seeds.glm.4b <- glm(model_4b, data = seeds.data)
seeds.glm.4c <- glm(model_4c, data = seeds.data)
seeds.glm.4d <- glm(model_4d, data = seeds.data)
seeds.glm.4e <- glm(model_4e, data = seeds.data)

In [30]:
# BIC of each three-features-removed fit.
m4a <- BIC(seeds.glm.4a)
m4b <- BIC(seeds.glm.4b)
m4c <- BIC(seeds.glm.4c)
m4d <- BIC(seeds.glm.4d)
m4e <- BIC(seeds.glm.4e)


In [31]:

# Reshape the three-features-removed BIC values into the same two-column
# layout as seeds.df, then append them to the running results table.
# melt() must already be available (reshape2 is loaded in an earlier cell).
temp2.df <- melt(data.frame(m4a, m4b, m4c, m4d, m4e))
names(temp2.df) <- c("model_id", "BIC")
seeds.df <- rbind(seeds.df, temp2.df)

No id variables; using all as measure variables


In [32]:
# Show all evaluated models ranked by BIC in ascending order.
seeds.df[with(seeds.df, order(BIC)), ]

Unnamed: 0,model_id,BIC
4,m2c,261.3347
1,m1,266.5799
11,m3c,283.1582
10,m3b,283.4745
3,m2b,286.5697
5,m2d,286.6159
14,m3f,290.9959
19,m4e,292.2405
8,m2g,292.5325
6,m2e,294.1331


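Removing `width_of_kernel` alone gives the best result: m2c has the lowest BIC (261.33), which is consistent with `width_of_kernel` being the only non-significant coefficient in the full model (p = 0.755). Among the single-feature removals, dropping `length_of_kernel_groove` (m2a) is the most detrimental, raising the BIC from 266.58 for the full model to 356.53. The model with the best BIC, m2c, has the following features: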

*area , perimeter , compactness , length_of_kernel, asymmetry_coefficient , length_of_kernel_groove*

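Surprisingly, the full model (m1) has a lower BIC than every other model evaluated except m2c. Note that only a subset of the possible feature combinations was tried, so a better model may exist among those not evaluated.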
