In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_selection import (VarianceThreshold, SelectKBest, f_regression, mutual_info_regression, 
    RFE, RFECV)
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures

# Load the data
df = pd.read_csv('diabetes.tab.txt', sep='\t')
df.head()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135


In [3]:
df=pd.get_dummies(df, columns= ["SEX"], drop_first=True,dtype=int)
df.head

<bound method NDFrame.head of      AGE   BMI      BP   S1     S2    S3    S4      S5   S6    Y  SEX_2
0     59  32.1  101.00  157   93.2  38.0  4.00  4.8598   87  151      1
1     48  21.6   87.00  183  103.2  70.0  3.00  3.8918   69   75      0
2     72  30.5   93.00  156   93.6  41.0  4.00  4.6728   85  141      1
3     24  25.3   84.00  198  131.4  40.0  5.00  4.8903   89  206      0
4     50  23.0  101.00  192  125.4  52.0  4.00  4.2905   80  135      0
..   ...   ...     ...  ...    ...   ...   ...     ...  ...  ...    ...
437   60  28.2  112.00  185  113.8  42.0  4.00  4.9836   93  178      1
438   47  24.9   75.00  225  166.0  42.0  5.00  4.4427  102  104      1
439   60  24.9   99.67  162  106.6  43.0  3.77  4.1271   95  132      1
440   36  30.0   95.00  201  125.2  42.0  4.79  5.1299   85  220      0
441   36  19.6   71.00  250  133.2  97.0  3.00  4.5951   92   57      0

[442 rows x 11 columns]>

In [5]:
target = df['Y']
features = df.drop(['Y'], axis=1)
features.head()


Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,0


In [6]:
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [9]:
#scaling the data 
# Initialize the scaler
scaler = StandardScaler()

# Scale every feature except the binary column - female
transformed_training_features = scaler.fit_transform(x_train.iloc[:,:-1])
transformed_testing_features = scaler.transform(x_test.iloc[:,:-1])

# Convert the scaled features into a DataFrame
X_train_transformed = pd.DataFrame(scaler.transform(x_train.iloc[:,:-1]), 
                                   columns=x_train.columns[:-1], 
                                   index=x_train.index)
X_test_transformed = pd.DataFrame(scaler.transform(x_test.iloc[:,:-1]), 
                                  columns=x_train.columns[:-1], 
                                  index=x_test.index)

# Add binary column back in
X_train_transformed['female'] = features["SEX_2"]
X_test_transformed['female'] = features["SEX_2"]

# use the correct variable name (capital X) created above
X_train_transformed.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,female
17,1.498365,0.219902,1.138874,0.728473,1.055893,-0.824451,0.711038,0.547482,-0.061449,1
66,-0.228858,-0.419366,-0.710591,-0.424929,0.272425,-1.529791,1.484286,-0.019757,0.367236,1
137,0.085182,1.018987,1.992473,-0.309589,-0.326699,-0.119111,-0.06221,0.331237,-0.31866,0
245,-0.621409,-0.784662,-0.639458,-1.17464,-1.215508,0.6646,-0.835458,-1.069682,-2.719299,0
31,-0.542899,-1.42393,-1.706457,-0.799784,-1.110167,1.291569,-1.608706,-0.802859,-0.91882,0


In [10]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly_train = pd.DataFrame(poly.fit_transform(X_train_transformed),
                            columns=poly.get_feature_names_out(X_train_transformed.columns))
X_poly_test = pd.DataFrame(poly.transform(X_test_transformed),
                           columns=poly.get_feature_names_out(X_test_transformed.columns))
X_poly_train.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,female,...,S4^2,S4 S5,S4 S6,S4 female,S5^2,S5 S6,S5 female,S6^2,S6 female,female^2
0,1.498365,0.219902,1.138874,0.728473,1.055893,-0.824451,0.711038,0.547482,-0.061449,1.0,...,0.505575,0.38928,-0.043693,0.711038,0.299737,-0.033642,0.547482,0.003776,-0.061449,1.0
1,-0.228858,-0.419366,-0.710591,-0.424929,0.272425,-1.529791,1.484286,-0.019757,0.367236,1.0,...,2.203104,-0.029324,0.545084,1.484286,0.00039,-0.007255,-0.019757,0.134863,0.367236,1.0
2,0.085182,1.018987,1.992473,-0.309589,-0.326699,-0.119111,-0.06221,0.331237,-0.31866,0.0,...,0.00387,-0.020606,0.019824,-0.0,0.109718,-0.105552,0.0,0.101544,-0.0,0.0
3,-0.621409,-0.784662,-0.639458,-1.17464,-1.215508,0.6646,-0.835458,-1.069682,-2.719299,0.0,...,0.697991,0.893675,2.271861,-0.0,1.14422,2.908785,-0.0,7.394585,-0.0,0.0
4,-0.542899,-1.42393,-1.706457,-0.799784,-1.110167,1.291569,-1.608706,-0.802859,-0.91882,0.0,...,2.587936,1.291564,1.478111,-0.0,0.644582,0.737682,-0.0,0.84423,-0.0,0.0


In [12]:
def run_model(model, X_train, X_test, y_train, y_test, display=True):
    
    train_r2 = model.score(X_train, y_train)
    y_pred_train = model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    
    test_r2 = model.score(X_test, y_test)
    y_pred_test = model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    if (display):
        print('Training R^2:', train_r2)
        print('Training Root Mean Squared Error:', train_rmse)
        print('\n----------------\n')
        print('Testing R^2:', test_r2)
        print('Testing Root Mean Squared Error:', test_rmse)
        
    return test_r2, test_rmse

In [13]:
lr_poly = LinearRegression()
lr_poly.fit(X_poly_train, y_train)

poly_r2, poly_rmse = run_model(lr_poly, X_poly_train, X_poly_test, y_train, y_test)

Training R^2: 0.6061583502354682
Training Root Mean Squared Error: 48.919716046393646

----------------

Testing R^2: 0.4156399336407993
Testing Root Mean Squared Error: 55.6419653440124


In [14]:
#filter method
# Using Variance Threshold
selector = VarianceThreshold()
reduced_feature_train = pd.DataFrame(selector.fit_transform(X_poly_train), columns=X_poly_train.columns, index=X_poly_train.index)
reduced_feature_test = pd.DataFrame(selector.transform(X_poly_test), columns=X_poly_test.columns, index=X_poly_test.index)

lr = LinearRegression()
lr.fit(reduced_feature_train, y_train)
reduced_r2, reduced_rmse = run_model(lr, reduced_feature_train, reduced_feature_test, y_train, y_test)

print('\n----------------\n')
print(f"{reduced_feature_train.shape[1]} out of {X_poly_train.shape[1]} features used")
print('Baseline R-Squared:', round(poly_r2, 2))
print('Reduced R-Squared: ', round(reduced_r2, 2))

Training R^2: 0.6061583502354682
Training Root Mean Squared Error: 48.919716046393646

----------------

Testing R^2: 0.41563993364079876
Testing Root Mean Squared Error: 55.64196534401242

----------------

65 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.42


In [15]:
np.linspace(np.percentile(selector.variances_,10), np.percentile(selector.variances_, 90), 10)

array([0.48337857, 0.64896045, 0.81454232, 0.98012419, 1.14570606,
       1.31128794, 1.47686981, 1.64245168, 1.80803355, 1.97361543])

In [18]:
# Evaluate model performance for different variance thresholds
threshold_ranges = np.linspace(
    np.percentile(selector.variances_, 10),
    np.percentile(selector.variances_, 90),
    10
)

reduced_r2s = []

for thresh in threshold_ranges:
    # use a new selector variable so we don't overwrite the fitted 'selector' used above
    sel = VarianceThreshold(thresh)
    reduced_feature_train = sel.fit_transform(X_poly_train)
    reduced_feature_test = sel.transform(X_poly_test)

    lr = LinearRegression()
    lr.fit(reduced_feature_train, y_train)

    reduced_r2, reduced_rmse = run_model(
        lr, reduced_feature_train, reduced_feature_test, y_train, y_test, display=False
    )
    reduced_r2s.append(reduced_r2)

    print(f"Variance threshold: {thresh}")
    print(f"{reduced_feature_train.shape[1]} out of {X_poly_train.shape[1]} features used")
    print("Baseline R-Squared:", round(poly_r2, 2))
    print("Reduced R-Squared: ", round(reduced_r2, 2))
    print("\n--------------------------------------------------------------------\n")

Variance threshold: 0.48337857377380306
58 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.39

--------------------------------------------------------------------

Variance threshold: 0.6489604462415886
54 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.4

--------------------------------------------------------------------

Variance threshold: 0.8145423187093739
54 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.4

--------------------------------------------------------------------

Variance threshold: 0.9801241911771594
36 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.43

--------------------------------------------------------------------

Variance threshold: 1.1457060636449448
15 out of 65 features used
Baseline R-Squared: 0.42
Reduced R-Squared:  0.05

--------------------------------------------------------------------

Variance threshold: 1.3112879361127303
12 out of 65 features used
