# Filter methods

Rank the features according to some criterion, e.g. distance, information gain, dependency, or consistency. By then keeping only the top-ranked features, we hope to learn a better model.

In [20]:
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from matplotlib import pyplot as plt
from math import sqrt
import pandas as pd
import numpy as np
from sklearn.metrics import recall_score, precision_score, f1_score
from faraway.datasets import seatpos
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.discrete.discrete_model import Logit
from ml import wines_pd

In [3]:
df = wines_pd()

In [4]:
# Binarize the target: 1 when quality >= 6, 0 otherwise.
# Guarded so that re-running this cell is a no-op: thresholding the already
# binary 0/1 labels at 6 a second time would set every label to 0.
if df["quality"].max() > 1:
    df["quality"] = (df["quality"] >= 6).astype(int)

In [5]:
df.corr()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
fixed acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.095093
volatile acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.321441
citric acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.159129
residual sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,-0.00216
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.109494
free sulfur dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.061757
total sulfur dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.231963
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.15911
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.003264
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.218072


In [16]:
def data(keep=None, random_state=None):
    """
    Draw a train/validation split of the global `df` and publish it as globals.

    Only the columns named in `keep` plus the target column 'quality' are
    retained. Rows are split with test_size=0.2 and the features are
    standardized with a StandardScaler that is fitted on the training part only.

    Parameters
    ----------
    keep : iterable of str, optional
        Feature columns to retain. None (default) keeps every column of `df`
        except 'quality'.
    random_state : int or None, optional
        Forwarded to train_test_split. None (default) keeps the original
        behavior of a different random split on every call; pass an int to
        make a split reproducible.

    Side effects
    ------------
    Rebinds the globals train_df / valid_df (frames including 'quality'),
    train_X / valid_X (scaled feature matrices as returned by StandardScaler)
    and train_y / valid_y (the 'quality' columns).
    """
    global train_X, train_y, valid_X, valid_y, train_df, valid_df
    if keep is None:
        keep = df.columns.drop('quality')
    # Materialize as a list so tuples and pandas Index objects are accepted, and
    # drop a stray 'quality' entry so the target is never selected twice.
    keep = [col for col in keep if col != 'quality']
    train_df, valid_df = train_test_split(
        df[keep + ['quality']], test_size=0.2, random_state=random_state
    )
    train_X = train_df.drop(columns='quality')
    train_y = train_df.quality
    valid_X = valid_df.drop(columns='quality')
    valid_y = valid_df.quality
    # Fit the scaler on the training rows only, then apply the same
    # transformation to the validation rows.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(train_X)
    valid_X = scaler.transform(valid_X)

In [17]:
# Baseline: mean validation F1 of a logistic regression that uses every feature.
# data() draws a new random split on each call, so the score is averaged
# over 100 independent split/fit/score rounds.
f1s = []
for i in range(100):
    data()  # keep=None -> all feature columns; rebinds train_X, train_y, valid_X, valid_y
    model = LogisticRegression()
    model.fit(train_X, train_y)
    pred_y = model.predict(valid_X)
    f1s.append(f1_score(valid_y, pred_y))
# Last expression of the cell: the average F1 over the 100 rounds.
sum(f1s)/len(f1s)

0.757872596848619

In [31]:
# NOTE(review): this rebinds the global `df`, which an earlier cell set to the
# wines frame. data() and the df.corr()['quality'] cell read `df`, so on a
# top-to-bottom run they will see this frame instead — confirm this is intended
# (a separate variable name would avoid the clash).
from ml import heart_disease_pd
df = heart_disease_pd()


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 76 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        282 non-null    int64  
 1   ccf       282 non-null    int64  
 2   age       282 non-null    int64  
 3   sex       282 non-null    object 
 4   painloc   282 non-null    int64  
 5   painexer  282 non-null    int64  
 6   relrest   282 non-null    int64  
 7   pncaden   282 non-null    int64  
 8   cp        282 non-null    int64  
 9   trestbps  282 non-null    int64  
 10  htn       282 non-null    int64  
 11  chol      282 non-null    int64  
 12  smoke     282 non-null    int64  
 13  cigs      282 non-null    int64  
 14  years     282 non-null    int64  
 15  fbs       282 non-null    int64  
 16  dm        282 non-null    int64  
 17  famhist   282 non-null    int64  
 18  restecg   282 non-null    int64  
 19  ekgmo     282 non-null    int64  
 20  ekgday    282 non-null    int64 

In [10]:
df.corr()['quality']

fixed acidity           0.095093
volatile acidity       -0.321441
citric acid             0.159129
residual sugar         -0.002160
chlorides              -0.109494
free sulfur dioxide    -0.061757
total sulfur dioxide   -0.231963
density                -0.159110
pH                     -0.003264
sulphates               0.218072
alcohol                 0.434751
quality                 1.000000
Name: quality, dtype: float64

In [30]:
# Same experiment as the all-features baseline, but restricted to the two
# features 'alcohol' and 'density': mean validation F1 over 100 random splits.
f1s = []
for i in range(100):
    data(['alcohol', 'density'])  # new random split containing only these two features
    model = LogisticRegression()
    model.fit(train_X, train_y)
    pred_y = model.predict(valid_X)
    f1s.append(f1_score(valid_y, pred_y))
# Last expression of the cell: the average F1 over the 100 rounds.
sum(f1s)/len(f1s)

0.7125562397627327

The data function keeps only the given features in the dataset.

In [22]:
def model_summary():
    """
    Fit a statsmodels Logit on the current training split and print diagnostics.

    Prints the statsmodels summary (coefficients with their z statistics,
    p-values and confidence intervals), followed by a table holding the
    variance inflation factor (VIF) of every remaining feature.

    Reads the globals train_X, train_y and train_df; returns nothing.
    """
    features = train_df.drop(columns='quality').columns
    # Wrap the scaled matrix in a DataFrame so the summary lists the real
    # feature names instead of x1, x2, ...; reuse train_y's index so the row
    # labels of the response and the design matrix agree.
    X = pd.DataFrame(np.asarray(train_X), columns=features, index=train_y.index)
    m = sm.Logit(train_y, sm.add_constant(X)).fit()
    print(m.summary())
    # VIF is computed on the scaled features without the intercept column,
    # exactly as before.
    vif = pd.DataFrame({
        "vif": [variance_inflation_factor(train_X, i) for i in range(train_X.shape[1])],
        "features": list(features),
    })
    print(vif)

In [23]:
def model_validation_f1(threshold=0.5):
    """
    Train a statsmodels Logit on the training split and return its F1 score
    on the validation split.

    Parameters
    ----------
    threshold : float, optional
        Predicted probabilities strictly above this value are labelled 1,
        all others 0. Defaults to 0.5.

    Returns
    -------
    float
        f1_score of the thresholded predictions against valid_y.

    Reads the globals train_X, train_y, valid_X and valid_y.
    """
    # has_constant='add' forces the intercept column on both splits. With the
    # default ('skip') a validation set that happens to contain a constant
    # feature would get no intercept and no longer match the fitted model.
    train_exog = sm.add_constant(train_X, has_constant='add')
    # disp=False silences the per-fit optimizer report, which otherwise floods
    # the output when this function is called in a resampling loop.
    m = sm.Logit(train_y, train_exog).fit(disp=False)
    valid_exog = sm.add_constant(valid_X, has_constant='add')
    # Logit predicts probabilities, but f1_score needs class labels: passing
    # the raw probabilities raises on the mix of binary and continuous values.
    pred_y = (m.predict(valid_exog) > threshold).astype(int)
    return f1_score(valid_y, pred_y)

Because the score depends on the random train/validation split (all the more so when the dataset is small), we need to repeat the experiment to get a more or less stable estimate of the F1 score. We therefore repeat the experiment 100 times, each time resampling the split, learning a model on the training set and computing the F1 score on the validation set, and then take the average of all F1 scores.

In [25]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,1
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,0
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,1
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,1
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,0


In [26]:
data(['alcohol', 'density'])

In [27]:
model_summary()

Optimization terminated successfully.
         Current function value: 0.582705
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                quality   No. Observations:                 1279
Model:                          Logit   Df Residuals:                     1276
Method:                           MLE   Df Model:                            2
Date:                Tue, 06 Oct 2020   Pseudo R-squ.:                  0.1549
Time:                        08:39:31   Log-Likelihood:                -745.28
converged:                       True   LL-Null:                       -881.88
Covariance Type:            nonrobust   LLR p-value:                 4.710e-60
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.2724      0.064      4.230      0.000       0.146       0.399
x1             1.1891      0.

We will learn a logistic regression model using `statsmodels`, because its summary function helps us analyse which features contribute significantly to the prediction. For this, we use the significance test on each coefficient, whose p-value is the 4th column of the table below (`P>|z|`; `P>|t|` in the OLS summaries further down). This p-value indicates how likely it is to observe a contribution at least this large if the feature actually had no effect. It is common to dismiss any feature whose p-value is above 0.05.

The summary shows that although the F-test for the regression function as a whole is significant (1.31E-5), no individual feature has a significant contribution. This is caused by collinearity. Columns 5 and 6 give the 95% confidence interval for each coefficient; these intervals are very wide and all of them contain both negative and positive values. This is strange: how can a feature contribute both positively AND negatively? An interval that spans zero means that the data cannot even determine the sign of that feature's effect.

Note that the warning indicates there may be strong collinearity present!

In [None]:
def exp(keep, n_runs=100):
    """
    Estimate the validation F1 of a logistic regression on a feature subset.

    Repeats `n_runs` times: draw a new split restricted to `keep` via data(),
    fit a LogisticRegression on the training part and score it with f1_score
    on the validation part.

    Parameters
    ----------
    keep : list of str or None
        Feature columns handed to data().
    n_runs : int, optional
        Number of split/fit/score rounds to average over. Defaults to 100.

    Returns
    -------
    float
        Mean F1 score over all rounds.

    Raises
    ------
    ValueError
        If n_runs is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")
    f1s = []
    for _ in range(n_runs):
        data(keep)
        model = LogisticRegression()
        model.fit(train_X, train_y)
        pred_y = model.predict(valid_X)
        f1s.append(f1_score(valid_y, pred_y))
    # The mean used to be computed and then discarded, so the function always
    # returned None; return it so the calling cell can display or reuse it.
    return sum(f1s) / len(f1s)
    

In [237]:
# NOTE(review): 'HtShoes' is not among the wine columns listed by df.corr() above,
# and the "avg validation r2" output recorded below is not something the visible
# exp() prints — this cell looks left over from an earlier version of the
# notebook; confirm and update or remove.
exp(['HtShoes'])

avg validation r2:  0.22178214514788896


In [249]:
# NOTE(review): none of these column names appear among the wine columns listed by
# df.corr() above, and the recorded "avg validation r2" output is not produced by
# the visible exp() — presumably a leftover from an earlier version; confirm.
exp(['Seated', 'HtShoes', 'Thigh', 'Arm', 'Leg', 'Weight'])

avg validation r2:  0.4476154865175942


In [250]:
model_summary()

                            OLS Regression Results                            
Dep. Variable:              hipcenter   R-squared:                       0.700
Model:                            OLS   Adj. R-squared:                  0.678
Method:                 Least Squares   F-statistic:                     31.50
Date:                Mon, 05 Oct 2020   Prob (F-statistic):           8.74e-08
Time:                        20:22:37   Log-Likelihood:                -147.32
No. Observations:                  30   AIC:                             300.6
Df Residuals:                      27   BIC:                             304.8
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -168.4238      6.321    -26.645      0.0

In [162]:
train_df.corr()

Unnamed: 0,Age,Weight,Ht,Seated,Arm,Thigh,Leg,hipcenter
Age,1.0,0.004359,-0.223846,-0.255888,0.193044,0.057571,-0.142485,0.292628
Weight,0.004359,1.0,0.775337,0.709538,0.614127,0.552714,0.780087,-0.60863
Ht,-0.223846,0.775337,1.0,0.912519,0.700325,0.72864,0.895856,-0.797398
Seated,-0.255888,0.709538,0.912519,1.0,0.569021,0.583649,0.792619,-0.713065
Arm,0.193044,0.614127,0.700325,0.569021,1.0,0.695021,0.737951,-0.605728
Thigh,0.057571,0.552714,0.72864,0.583649,0.695021,1.0,0.6157,-0.55633
Leg,-0.142485,0.780087,0.895856,0.792619,0.737951,0.6157,1.0,-0.780044
hipcenter,0.292628,-0.60863,-0.797398,-0.713065,-0.605728,-0.55633,-0.780044,1.0


In [191]:
# NOTE(review): these column names do not appear among the wine columns listed by
# df.corr() above, and data() always appends the target column 'quality' — this
# call presumably belongs to an earlier version of the notebook; confirm.
data(['Ht', 'Seated', 'Leg', 'Thigh', 'Arm'])

In [192]:
# NOTE(review): no function named `model` or `vif` is defined in the visible
# notebook. Earlier cells bind `model` to a LogisticRegression instance, and `vif`
# only exists as a local variable inside model_summary(). Presumably
# model_summary() is the intended replacement — confirm.
model()
vif()

                            OLS Regression Results                            
Dep. Variable:              hipcenter   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.689
Method:                 Least Squares   F-statistic:                     14.27
Date:                Mon, 05 Oct 2020   Prob (F-statistic):           0.000115
Time:                        20:04:56   Log-Likelihood:                -90.692
No. Observations:                  19   AIC:                             189.4
Df Residuals:                      15   BIC:                             193.2
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -157.3286      7.391    -21.286      0.0

  "anyway, n=%i" % int(n))


Unnamed: 0,vif,features
0,1.062788,Age
1,3.847365,Weight
2,3.729984,HtShoes


In [90]:
# NOTE(review): no function named `model` is defined in the visible notebook;
# earlier cells bind `model` to a LogisticRegression instance. Presumably
# model_summary() is the intended replacement — confirm.
model().summary()

0,1,2,3
Dep. Variable:,hipcenter,R-squared:,0.674
Model:,OLS,Adj. R-squared:,0.623
Method:,Least Squares,F-statistic:,13.24
Date:,"Mon, 05 Oct 2020",Prob (F-statistic):,5.04e-07
Time:,19:24:50,Log-Likelihood:,-187.47
No. Observations:,38,AIC:,386.9
Df Residuals:,32,BIC:,396.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,314.2850,83.522,3.763,0.001,144.157,484.413
Age,0.9604,0.490,1.959,0.059,-0.038,1.959
Weight,-0.1243,0.281,-0.442,0.662,-0.697,0.449
Arm,-2.2570,3.650,-0.618,0.541,-9.692,5.178
Thigh,-2.1683,2.202,-0.984,0.332,-6.655,2.318
Leg,-9.2977,3.678,-2.528,0.017,-16.789,-1.806

0,1,2,3
Omnibus:,1.334,Durbin-Watson:,1.834
Prob(Omnibus):,0.513,Jarque-Bera (JB):,1.263
Skew:,0.408,Prob(JB):,0.532
Kurtosis:,2.635,Cond. No.,2460.0


In [91]:
# NOTE(review): no function named `vif` is defined in the visible notebook; the
# VIF table is printed by model_summary(), where `vif` is a local variable —
# confirm whether this cell should call model_summary() instead.
vif()

Unnamed: 0,vif,features
0,9.767946,Age
1,40.198227,Weight
2,394.644699,Arm
3,191.002985,Thigh
4,356.198808,Leg
