In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [3]:
# Separate the 'charges' target from the predictor columns, then preview the raw frame.
y = df['charges']
X = df.drop(columns='charges')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# LabelEncoder maps each distinct value of a column to an integer in [0, n_classes - 1].
# The single encoder is refit for every column, so after this cell `le` only holds the
# classes of the last column processed ('smoker').
# NOTE(review): 'age' is already numeric, so encoding it merely replaces each age by its
# rank among the distinct ages (18 -> 0, 19 -> 1, ...) -- confirm this is intended.
le = LabelEncoder()
X_enc = X.assign(**{
    col: le.fit_transform(X[col])
    for col in ('age', 'sex', 'region', 'smoker')
})

## 1. Removing features with low variance 
(if lower than a threshold)

In [6]:
from sklearn.feature_selection import VarianceThreshold

In [7]:
# Keep only the features whose variance exceeds 0.2. With the default threshold of 0,
# only features that have the same value in every sample would be removed.
selector = VarianceThreshold(threshold=0.2).fit(X_enc)
data_selected = selector.transform(X_enc)
# get_support() is a boolean mask over the input columns marking the retained features.
kept = selector.get_support()
X_sel = pd.DataFrame(data_selected, columns=X_enc.columns[kept])

In [8]:
# Output: 'smoker' was removed because its variance does not exceed the 0.2 threshold.
X_sel.head()

Unnamed: 0,age,sex,bmi,children,region
0,1.0,0.0,27.9,0.0,3.0
1,0.0,1.0,33.77,1.0,2.0
2,10.0,1.0,33.0,3.0,2.0
3,15.0,1.0,22.705,0.0,1.0
4,14.0,1.0,28.88,0.0,1.0


## 2. Univariate feature selection

(based on univariate statistical tests)

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, f_regression
# chi2 / f_classif / f_regression score the dependence between each feature and the target;
# SelectKBest then removes the features that are most independent of the class (target)

In [10]:
# f_regression returns one F-statistic and one p-value per column of X_enc, in column order.
f_val, p_val = f_regression(X_enc, y)
# Rounded for readability: F-scores to whole numbers, p-values to 3 decimals.
print(f'F_scores: {f_val.round()}\n','P_values: ', p_val.round(3))

F_scores: [ 131.    4.   55.    6. 2178.    0.]
 P_values:  [0.    0.036 0.    0.013 0.    0.821]


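The **p-value** is the probability of obtaining test results at least as extreme as the observed results, assuming that the null hypothesis (here: the feature has no linear relationship with the target) is true.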

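**[F score](https://stats.stackexchange.com/questions/204141/difference-between-selecting-features-based-on-f-regression-and-based-on-r2)** here is the F-statistic that `f_regression` computes for each feature from a univariate linear regression of the target on that feature; it should not be confused with the F1 score used to measure a classifier's accuracy. The **F-test** is a way of comparing the significance of the improvement of a model with respect to the addition of new variables (from Wikipedia).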

In [13]:
# SelectKBest ranks the features by their f_regression F-score and keeps the k highest.
sl = SelectKBest(f_regression, k=5)
# Only the fitted support mask is used below, so fit() is sufficient; the array that
# fit_transform() returns was previously computed and thrown away.
sl.fit(X_enc, y)
# Index the original frame with the mask so X_new keeps column names and dtypes.
cols = X_enc.columns[sl.get_support()]
X_new = X_enc[cols]

In [12]:
# An alternative is to select on p-values instead of a fixed k,
# e.g. with SelectFpr, SelectFdr or SelectFwe and a significance level alpha

## 3. Recursive feature elimination

In [14]:
from sklearn.feature_selection import RFE