In [1]:
#Load dataset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_diabetes
import pandas as pd
import numpy as np

diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
X = df.copy()
y = pd.DataFrame(diabetes.target.copy(), columns=['target'])

In [2]:
#Prepare train and test sets for feature selection
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)
print(f'Train sets shapes: {X_train.shape}, {y_train.shape}')

Train sets shapes: (375, 10), (375, 1)


In [3]:
#Import necessary components

from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_validate

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

estimators =  {
           'LinearRegression': LinearRegression(),
           'RandomForestRegressor': RandomForestRegressor(random_state=123),
           'GradientBoostRegressor': GradientBoostingRegressor(random_state=123)
           }

models = {
           'LinearRegression': LinearRegression(),
           'RandomForestRegressor': RandomForestRegressor(max_depth=10, max_features=2, min_samples_leaf=2, n_estimators=50, random_state=123),
           'GradientBoostRegressor': GradientBoostingRegressor(max_depth=5, max_features=5, min_samples_leaf=5, n_estimators=10, random_state=123)
           }


In [5]:
#Feature engineering for dimensionality reduction
#Prepare datasets and store for research
datasets = []

#original dataset
datasets.append(('All_features', X_train, X_test))

#Calculating the principal components
pca = PCA(n_components=0.95)
PCA_train = pca.fit_transform(X_train)
PCA_test = pca.transform(X_test)
datasets.append(('PCA_features', PCA_train, PCA_test))
print(f'PCA features: {PCA_train.shape}, {PCA_test.shape}')

#For dimension reduction use number of features calculated by PCA
n_features = PCA_train.shape[1]

#Recursive Feature Elimination with estimators
for estimator_name, estimator in estimators.items():
    rfe = RFE(estimator=estimator, n_features_to_select=n_features, step=2, verbose=0)
    RFE_train = rfe.fit_transform(X_train, y_train)
    RFE_test = rfe.transform(X_test)
    datasets.append(('RFE_' + estimator_name, RFE_train, RFE_test))
    print(f'RFE with {estimator_name} esimator features: {X_train.columns[rfe.support_].values}')
    #print(dict(zip(X_train.columns, rfe_lr.ranking_)))

#Lasso feature selection
lcv = LassoCV()
lcv.fit(X_train, y_train)
lcv_alpha = lcv.alpha_
lcv_mask = lcv.coef_ > 0.1
datasets.append(('LassoCV_features', X_train.loc[:, lcv_mask], X_test.loc[:, lcv_mask]))
print(f'LassoCV features: {X_train.columns[lcv_mask].values}')

#Other dataset
datasets.append(('myFeatures', X_train.loc[:, ['bmi', 'bp', 's1', 's4', 's5']], X_test.loc[:, ['bmi', 'bp', 's1', 's4', 's5']]))

PCA features: (375, 8), (67, 8)
RFE with LinearRegression esimator features: ['sex' 'bmi' 'bp' 's1' 's2' 's3' 's4' 's5']
RFE with RandomForestRegressor esimator features: ['age' 'bmi' 'bp' 's1' 's2' 's3' 's5' 's6']
RFE with GradientBoostRegressor esimator features: ['age' 'bmi' 'bp' 's1' 's2' 's3' 's5' 's6']
LassoCV features: ['bmi' 'bp' 's2' 's3' 's4' 's5' 's6']


In [6]:
#Baseline score
cv = cross_validate(LinearRegression(), X_train, y_train, cv=5, scoring='r2', return_train_score=True)
print('Validation test_score: ', cv['test_score'].mean(), ' train_score: ', cv['train_score'].mean())

#Fit model with datasets and calculate scores
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate

dataset_scores = pd.DataFrame(columns=['model', 'dataset', 'test_score', 'train_score'])

for i, (dataset_name, X_train_tmp, X_test_tmp) in enumerate(datasets):
    for model_name, model in models.items():
        
        model.fit(X_train_tmp, y_train)
        
        test_score= model.score(X_test_tmp, y_test)
        train_score= model.score(X_train_tmp, y_train)

        row = [model_name, dataset_name, test_score, train_score]
        dataset_scores.loc[len(dataset_scores)+1] = row
        dataset_scores = dataset_scores.reset_index(drop=True)

dataset_scores.sort_values('test_score', ascending=False).head(10)

Validation test_score:  0.4629812448078895  train_score:  0.503872956635694


Unnamed: 0,model,dataset,test_score,train_score
6,LinearRegression,RFE_LinearRegression,0.597933,0.498515
0,LinearRegression,All_features,0.59105,0.501008
3,LinearRegression,PCA_features,0.588152,0.49678
18,LinearRegression,myFeatures,0.56844,0.482131
12,LinearRegression,RFE_GradientBoostRegressor,0.56689,0.484918
9,LinearRegression,RFE_RandomForestRegressor,0.56689,0.484918
15,LinearRegression,LassoCV_features,0.562205,0.482197
1,RandomForestRegressor,All_features,0.547009,0.818995
7,RandomForestRegressor,RFE_LinearRegression,0.538936,0.809992
16,RandomForestRegressor,LassoCV_features,0.537971,0.822744
