In [1]:
# import necessary libraries
from sklearn.feature_selection import SelectKBest  
from sklearn.feature_selection import f_classif
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold  

In [2]:
# read the CSV file into a DataFrame and discard the 'Date' column in the same step
df = pd.read_csv('Bias_correction_ucl.csv').drop(columns='Date')

# report the dimensions before any cleaning takes place
print('Dataset shape before cleaning:', df.shape)

# cleaning step 1: from the third column onward, turn every negative entry into NaN
# NOTE(review): the slice starts at position 2 *after* 'Date' was removed, so the first
# two remaining columns are never checked for negatives -- confirm this is intended.
tail = df.iloc[:, 2:]
is_negative = tail < 0
df.iloc[:, 2:] = tail.mask(is_negative)

# cleaning step 2: discard every row that contains at least one NaN
df = df.dropna()

# report the dimensions once cleaning is done
print('Dataset shape after cleaning:', df.shape)

# feature matrix: every column except the last two, as a NumPy array
feature_columns = df.columns[:-2]
X = df[feature_columns].to_numpy()

# target vector: the second-to-last column, as a NumPy array
target_column = df.columns[-2]
y = df[target_column].to_numpy()

Dataset shape before cleaning: (7752, 24)
Dataset shape after cleaning: (7585, 24)


In [3]:
# apply univariate feature selection

# number of top-scoring features to keep; shared by the selector and the ranking below
k_best = 5

# initialize SelectKBest with the ANOVA F-value test to select the k_best best features
# NOTE(review): f_classif treats every distinct value in y as a class label. If the
# target column is a continuous quantity, f_regression is probably the more appropriate
# scoring function -- confirm before relying on this ranking.
selector = SelectKBest(f_classif, k=k_best)

# fit the selector and reduce the feature matrix to the selected columns
X_uni = selector.fit_transform(X, y)

# show dimensions of the feature matrix after feature selection
print('Dataset shape after univariate feature selection:', X_uni.shape)

# show the selected columns in ranking order
scores = selector.scores_  # F-scores for all features
# take the column positions from the selector itself so the reported names always match
# the columns that are actually present in X_uni (no second, independent top-k computation)
selected = selector.get_support(indices=True)
indices = selected[np.argsort(scores[selected])[::-1]]  # kept columns, highest score first
features = df.columns[:-2][indices]  # get the names of the features
print('Five highest scoring columns after univariate feature selection (highest scores first):\n  ',
      features.tolist())

Dataset shape after univariate feature selection: (7585, 5)
Five highest scoring columns after univariate feature selection (highest scores first):
   ['LDAPS_Tmax_lapse', 'Present_Tmax', 'LDAPS_Tmin_lapse', 'LDAPS_CC3', 'LDAPS_CC2']


In [4]:
# apply low variance feature selection

# instantiate the VarianceThreshold object
sel = VarianceThreshold(threshold=0.2)

# fit the selector and remove every feature whose variance does not exceed the threshold
X_low = sel.fit_transform(X)

# positions (within the feature columns) of the features that passed the threshold
kept = sel.get_support(indices=True)

# pick the top five features among those that passed, ordered by descending variance;
# the variances come from the fitted selector so the ranking and the filter agree
# (raw variance depends on each feature's scale, so this ranking is scale-sensitive)
variances = sel.variances_
indices = kept[variances[kept].argsort()[::-1]][:5]
columns = df.columns[:-2][indices]

# print the selected columns
print('Five highest scoring columns after low variance feature selection (highest scores first):\n  ',
      columns.tolist())

Five highest scoring columns after low variance feature selection (highest scores first):
   ['Solar radiation', 'DEM', 'LDAPS_LH', 'LDAPS_RHmin', 'station']


In [5]:
# Results:
#
# I used univariate and low variance feature selection because both methods can
# process the continuous values in this dataset. Low variance feature selection
# did not need to be modified; univariate feature selection, however, had to switch from
# the 'chi2' scoring function to 'f_classif', because 'chi2' requires non-negative
# feature values whereas 'f_classif' can process continuous values.
#
# The two feature selection methods chose different sets of five features. This is an
# interesting result: univariate feature selection scores each feature using both the
# feature and the target variable, i.e. (as I understand it) it looks at the relationship
# between them, whereas low variance feature selection looks only at the feature
# variables. The two selected feature sets therefore differ, but together they can offer
# new insight into which features should be selected when going on to train a model.