In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [2]:
# Read the drought training set from its Excel workbook
# (the file name suggests outliers were already removed upstream).
training_path = 'drought_training_df_without_outliers.xlsx'
training_df = pd.read_excel(io=training_path)

# Render the loaded frame as the cell output
training_df

Unnamed: 0,339,341,342,344,345,346,348,349,351,352,...,2499,2501,2503,2505,2507,2509,2511,2514,2516,Water Potential (Mpa)
0,0.1481,0.1818,0.1748,0.1486,0.1045,0.0782,0.0904,0.083,0.1101,0.1168,...,0.0332,0.0347,0.0323,0.0308,0.036,0.0346,0.032,0.0328,0.0344,-0.089632
1,0.1422,0.136,0.1384,0.1283,0.1103,0.0967,0.1014,0.104,0.0934,0.0819,...,0.056,0.0589,0.056,0.0551,0.0563,0.0554,0.0533,0.0547,0.0528,-0.148237
2,0.0978,0.092,0.1038,0.0981,0.0793,0.0613,0.0477,0.052,0.0545,0.0698,...,0.028,0.0252,0.0269,0.0275,0.0293,0.0311,0.0284,0.0279,0.0294,-0.128243
3,0.1422,0.136,0.1538,0.1283,0.1241,0.1161,0.1074,0.0876,0.0831,0.0722,...,0.0456,0.0473,0.0484,0.0452,0.0428,0.045,0.0486,0.0425,0.0479,-0.164785
4,0.1333,0.128,0.1307,0.0981,0.1034,0.0838,0.0776,0.0767,0.0779,0.0722,...,0.0373,0.0347,0.0323,0.033,0.036,0.0369,0.0344,0.034,0.0405,-0.326122
5,0.0889,0.104,0.1,0.1056,0.0896,0.0839,0.0836,0.0877,0.0831,0.0723,...,0.0394,0.0368,0.0344,0.033,0.0349,0.0334,0.0332,0.034,0.0356,-0.147548
6,0.0978,0.08,0.0769,0.0604,0.0552,0.0967,0.0776,0.0712,0.0779,0.0482,...,0.0238,0.0252,0.0226,0.0242,0.0225,0.0208,0.0225,0.0231,0.0245,-0.252348
7,0.1052,0.0733,0.1128,0.1072,0.0985,0.1134,0.1044,0.0855,0.0549,0.0608,...,0.0278,0.0261,0.0245,0.0251,0.0232,0.0225,0.022,0.0201,0.0292,-0.474359
8,0.1626,0.1466,0.1476,0.1072,0.1062,0.1064,0.0718,0.1099,0.0989,0.0913,...,0.0353,0.0347,0.0345,0.0365,0.036,0.032,0.0317,0.0314,0.0342,-0.649486
9,0.1486,0.1351,0.1282,0.1176,0.1505,0.1414,0.1603,0.1304,0.1056,0.0763,...,0.0382,0.0387,0.0395,0.0395,0.0393,0.0401,0.04,0.0373,0.035,-0.408859


In [3]:
# ---------------------------------
# Split the frame into predictors (X) and target (y)
# ---------------------------------
# The target is the single 'Water Potential (Mpa)' column; every other
# column of the training frame is kept as a predictor.
target = training_df['Water Potential (Mpa)']
predictors = training_df.drop('Water Potential (Mpa)', axis=1)

# <font color="orange">KBest Feature Selection</font>

## Linear Feature Selection

In [5]:
# Univariate linear feature selection.
# f_regression scores each feature by the degree of its linear dependency
# on the target; SelectKBest keeps the k highest-scoring features.
selector = SelectKBest(score_func=f_regression)
model = LinearRegression()

# Chain selection and regression in one pipeline so that, during the grid
# search, the selector is fitted on each training fold only.
pipeline = Pipeline(steps=[('feature_selection', selector), ('m', model)])

# Candidate values of k: every count from 1 up to the full number of predictors
grid = {'feature_selection__k': range(1, predictors.shape[1] + 1)}

# Cross-validated search over k, scored by negative mean squared error
search = GridSearchCV(pipeline, grid, scoring='neg_mean_squared_error', n_jobs=-1)
result = search.fit(predictors, target)

# Report the best cross-validated score and the k that achieved it
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# GridSearchCV refits the best pipeline on the full data (refit=True is the
# default), so reuse its already-fitted selector instead of fitting a second,
# separate SelectKBest with the same k.
selector = result.best_estimator_.named_steps['feature_selection']

# Reduced feature matrix for the selected columns
X_new = selector.transform(predictors)

# get_support() returns a boolean mask over the predictor columns; slicing
# the original frame with it keeps the selected columns with their original
# names and values, without an inverse_transform round trip through a
# zero-padded full-width copy.
selected_features = predictors.loc[:, selector.get_support()]

# Display the first few rows of the DataFrame with the selected features
selected_features.head()

Best Score: -1.2939494898678365
Best Hyperparameters: {'feature_selection__k': 13}


Unnamed: 0,1891,1894,1898,1902,1905,1907,1910,1968,1971,1979,1982,1985,1988
0,0.0814,0.0678,0.0574,0.0311,0.0284,0.0269,0.0259,0.0336,0.0347,0.0395,0.041,0.043,0.0448
1,0.0983,0.0855,0.0757,0.0515,0.049,0.0464,0.0446,0.0522,0.0534,0.058,0.0598,0.0615,0.0631
2,0.0633,0.0533,0.0459,0.0282,0.0251,0.0238,0.023,0.028,0.029,0.0321,0.0333,0.0347,0.0359
3,0.0969,0.0831,0.0734,0.0493,0.0456,0.0433,0.0421,0.0475,0.0487,0.0531,0.055,0.0564,0.0579
4,0.0784,0.0662,0.0573,0.0336,0.0317,0.0301,0.0286,0.0343,0.0354,0.0391,0.0406,0.0421,0.0438


In [8]:
# Attach the target column to the linearly selected features by aligning
# both objects on their row index.
linear_selected_features = pd.merge(
    selected_features,
    target,
    left_index=True,
    right_index=True,
)

linear_selected_features

Unnamed: 0,1891,1894,1898,1902,1905,1907,1910,1968,1971,1979,1982,1985,1988,Water Potential (Mpa)
0,0.0814,0.0678,0.0574,0.0311,0.0284,0.0269,0.0259,0.0336,0.0347,0.0395,0.041,0.043,0.0448,-0.089632
1,0.0983,0.0855,0.0757,0.0515,0.049,0.0464,0.0446,0.0522,0.0534,0.058,0.0598,0.0615,0.0631,-0.148237
2,0.0633,0.0533,0.0459,0.0282,0.0251,0.0238,0.023,0.028,0.029,0.0321,0.0333,0.0347,0.0359,-0.128243
3,0.0969,0.0831,0.0734,0.0493,0.0456,0.0433,0.0421,0.0475,0.0487,0.0531,0.055,0.0564,0.0579,-0.164785
4,0.0784,0.0662,0.0573,0.0336,0.0317,0.0301,0.0286,0.0343,0.0354,0.0391,0.0406,0.0421,0.0438,-0.326122
5,0.0731,0.0624,0.054,0.0347,0.0311,0.0282,0.0271,0.0326,0.0338,0.037,0.0381,0.0394,0.0409,-0.147548
6,0.0592,0.0502,0.0426,0.0247,0.0236,0.0225,0.0214,0.0247,0.0254,0.0286,0.0296,0.0304,0.0317,-0.252348
7,0.0622,0.053,0.0455,0.028,0.0261,0.0248,0.0239,0.0292,0.0302,0.0336,0.035,0.0363,0.0374,-0.474359
8,0.0862,0.0726,0.0619,0.0359,0.0331,0.0308,0.029,0.0377,0.0393,0.0441,0.0459,0.0472,0.0489,-0.649486
9,0.0945,0.0801,0.0686,0.0414,0.0381,0.0358,0.0335,0.0416,0.0431,0.0478,0.0491,0.051,0.0531,-0.408859


In [9]:
# Write the linear selection (features + target) to an Excel file,
# leaving the row index out of the sheet.
linear_selected_features.to_excel(excel_writer='linear_selected_features.xlsx', index=False)

## Non-linear Feature Selection

In [10]:
# Univariate non-linear feature selection.
# Features are ranked by their mutual information with the target, which can
# capture dependencies between a feature and the target that are not linear.


def seeded_mutual_info(X, y):
    """Return mutual-information scores computed with a fixed random seed.

    mutual_info_regression adds small random noise to continuous variables,
    so without a fixed random_state the feature ranking (and therefore the
    selected columns) can change between calls and between notebook runs.
    """
    return mutual_info_regression(X, y, random_state=0)


selector = SelectKBest(score_func=seeded_mutual_info)
model = LinearRegression()

# Chain selection and regression in one pipeline so that, during the grid
# search, the selector is fitted on each training fold only.
pipeline = Pipeline(steps=[('feature_selection', selector), ('m', model)])

# Candidate values of k: every count from 1 up to the full number of predictors
grid = {'feature_selection__k': range(1, predictors.shape[1] + 1)}

# Cross-validated search over k, scored by negative mean squared error
search = GridSearchCV(pipeline, grid, scoring='neg_mean_squared_error', n_jobs=-1)
result = search.fit(predictors, target)

# Report the best cross-validated score and the k that achieved it
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# GridSearchCV refits the best pipeline on the full data (refit=True is the
# default). Reusing that fitted selector guarantees the reported columns are
# the ones belonging to the winning pipeline, rather than the result of a
# second, independent mutual-information estimate.
selector = result.best_estimator_.named_steps['feature_selection']

# Reduced feature matrix for the selected columns
X_new = selector.transform(predictors)

# get_support() returns a boolean mask over the predictor columns; slicing
# the original frame with it keeps the selected columns with their original
# names and values.
selected_features = predictors.loc[:, selector.get_support()]

# Display the first few rows of the DataFrame with the selected features
selected_features.head()

Best Score: -1.6933375549110523
Best Hyperparameters: {'feature_selection__k': 9}


Unnamed: 0,355,510,669,1894,1898,2023,2025,2028,2031
0,0.1073,0.0976,0.0513,0.0678,0.0574,0.0702,0.0717,0.074,0.0763
1,0.0666,0.0691,0.0522,0.0855,0.0757,0.0868,0.0887,0.0904,0.0919
2,0.0624,0.0388,0.0239,0.0533,0.0459,0.0534,0.0549,0.0561,0.0573
3,0.0874,0.0659,0.0448,0.0831,0.0734,0.0808,0.0827,0.0846,0.0861
4,0.0666,0.0394,0.0328,0.0662,0.0573,0.0653,0.0669,0.0683,0.0702


In [11]:
# Attach the target column to the mutual-information selection by aligning
# both objects on their row index.
nonlinear_selected_features = pd.merge(
    selected_features,
    target,
    left_index=True,
    right_index=True,
)

nonlinear_selected_features.head()

Unnamed: 0,355,510,669,1894,1898,2023,2025,2028,2031,Water Potential (Mpa)
0,0.1073,0.0976,0.0513,0.0678,0.0574,0.0702,0.0717,0.074,0.0763,-0.089632
1,0.0666,0.0691,0.0522,0.0855,0.0757,0.0868,0.0887,0.0904,0.0919,-0.148237
2,0.0624,0.0388,0.0239,0.0533,0.0459,0.0534,0.0549,0.0561,0.0573,-0.128243
3,0.0874,0.0659,0.0448,0.0831,0.0734,0.0808,0.0827,0.0846,0.0861,-0.164785
4,0.0666,0.0394,0.0328,0.0662,0.0573,0.0653,0.0669,0.0683,0.0702,-0.326122


In [12]:
# Write the non-linear selection (features + target) to an Excel file,
# leaving the row index out of the sheet.
nonlinear_selected_features.to_excel(excel_writer='nonlinear_selected_features.xlsx', index=False)

# <font color="red">Recursive Feature Elimination (RFE)</font>

In [5]:
# Recursive feature elimination driven by a linear regression.
model = LinearRegression()
rfe = RFE(estimator=model)

# Chain the eliminator and the regressor in one pipeline so that, during the
# grid search, RFE is fitted on each training fold only.
pipeline = Pipeline(steps=[('s', rfe), ('m', model)])

# Candidate feature counts: every count from 1 up to the full number of predictors
grid = {'s__n_features_to_select': range(1, predictors.shape[1] + 1)}

# Cross-validated search over the feature count, scored by negative mean squared error
search = GridSearchCV(pipeline, grid, scoring='neg_mean_squared_error', n_jobs=-1)
result = search.fit(predictors, target)

# Report the best cross-validated score and the feature count that achieved it
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# GridSearchCV refits the best pipeline on the full data (refit=True is the
# default), so reuse its already-fitted RFE step instead of running the whole
# elimination a second time with the same n_features_to_select.
rfe = result.best_estimator_.named_steps['s']

# Reduced feature matrix for the selected columns
X_rfe = rfe.transform(predictors)

# support_ is the boolean mask of retained features; slicing the original
# frame with it keeps the selected columns with their original names and
# values, without an inverse_transform round trip.
selected_features = predictors.loc[:, rfe.support_]

selected_features.head()


Best Score: -1.0827576901166185
Best Hyperparameters: {'s__n_features_to_select': 2}


Unnamed: 0,2102,2117
0,0.1191,0.1275
1,0.126,0.133
2,0.0846,0.0893
3,0.1205,0.1281
4,0.1049,0.111


In [7]:
# Attach the target column to the RFE selection by aligning both objects
# on their row index.
rfe_selected_features = pd.merge(
    selected_features,
    target,
    left_index=True,
    right_index=True,
)

rfe_selected_features.head()

Unnamed: 0,2102,2117,Water Potential (Mpa)
0,0.1191,0.1275,-0.089632
1,0.126,0.133,-0.148237
2,0.0846,0.0893,-0.128243
3,0.1205,0.1281,-0.164785
4,0.1049,0.111,-0.326122


In [8]:
# Write the RFE selection (features + target) to an Excel file,
# leaving the row index out of the sheet.
rfe_selected_features.to_excel(excel_writer='rfe_selected_features.xlsx', index=False)

# <font color="green">Sequential Feature Selection (SFS)</font>

In [None]:
# Sequential forward selection driven by a linear regression.
model = LinearRegression()

# Forward selection: features are added one at a time, each step scored by
# cross-validated negative mean squared error.
sfs = SequentialFeatureSelector(estimator=model,
                                direction='forward',  # Sequential Forward Selection
                                scoring='neg_mean_squared_error',  # Scoring metric
                                n_jobs=-1)

# Chain the selector and the regressor in one pipeline so that, during the
# grid search, the selector is fitted on each training fold only.
pipeline = Pipeline(steps=[('s', sfs), ('m', model)])

# Candidate feature counts. SequentialFeatureSelector requires
# n_features_to_select to be strictly smaller than the number of features,
# so the grid stops at n_features - 1 (range's upper bound is exclusive);
# including n_features would make that candidate fail on every fold.
grid = {'s__n_features_to_select': range(1, predictors.shape[1])}

# Cross-validated search over the feature count, scored by negative mean squared error
search = GridSearchCV(pipeline, grid, scoring='neg_mean_squared_error', n_jobs=-1)
result = search.fit(predictors, target)

# Report the best cross-validated score and the feature count that achieved it
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# GridSearchCV refits the best pipeline on the full data (refit=True is the
# default). Reusing its fitted selector keeps the final selection consistent
# with the search: a freshly built selector without `scoring` would fall back
# to the estimator's default scorer instead of negative MSE and could pick
# different features.
sfs = result.best_estimator_.named_steps['s']

# Reduced feature matrix for the selected columns
X_sfs = sfs.transform(predictors)

# Positional indices of the selected features
selected_feature_indices = sfs.get_support(indices=True)

# Filter the predictors to keep only the selected features
selected_features = predictors.iloc[:, selected_feature_indices]

selected_features.head()

In [None]:
# Attach the target column to the SFS selection by aligning both objects
# on their row index.
sfs_selected_features = pd.merge(
    selected_features,
    target,
    left_index=True,
    right_index=True,
)

sfs_selected_features.head()

In [None]:
# Write the SFS selection (features + target) to an Excel file,
# leaving the row index out of the sheet.
sfs_selected_features.to_excel(excel_writer='sfs_selected_features.xlsx', index=False)