<a href="https://colab.research.google.com/github/KenzaxTazi/Agri-Risk/blob/master/RFECV_damons_way_Raghul_for_Mala.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recursive Feature Selection
This notebook describes how to run recursive feature elimination with k-fold cross-validation (RFECV) on a data set.
It was used to reduce the number of features in the training sets.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import time
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import feature_selection
import time
import xgboost as xgb
%matplotlib inline

In [None]:
# Load the monthly/seasonal climate CSV into a DataFrame.
# The path points at a mounted Google Drive folder, so the drive has to be
# mounted before this cell is run.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Team Plants/data/climate_monthly_seasonal_2005_2010.csv'
)

In [None]:
# Interpolate NaNs away and sample the data set (the full data set caused
# time and memory issues).
# NOTE(review): `interpolate(axis=1)` fills gaps across the columns of each
# row, so the sort by 'lon' does not change the interpolated values (it only
# changes which rows the seeded sample picks). Confirm whether axis=0
# (interpolating between neighbouring rows) was intended.
SAMPLE_SIZE = 50000  # maximum number of rows kept for feature selection

df = df.sort_values(by=['lon'])
df = df.interpolate(axis=1)
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the frame length.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42)

In [None]:
# Split the frame into the regression target (the 'maize_a_2010' column)
# and the feature matrix (every other column).
y = df['maize_a_2010']
X = df.drop(columns=['maize_a_2010'])

In [None]:
# Create the Pipeline and fit the feature selector
class PipelineRFE(Pipeline):
    """Pipeline that exposes the feature importances of its last step.

    After ``fit`` the ``feature_importances_`` attribute of the final step is
    copied onto the pipeline object itself, so code that reads
    ``estimator.feature_importances_`` can be handed the pipeline directly.
    """

    def fit(self, X, y=None, **fit_params):
        """Fit every step, then mirror the last step's ``feature_importances_``.

        Parameters
        ----------
        X, y : training data and target, forwarded unchanged to ``Pipeline.fit``.
        **fit_params : extra fit parameters, forwarded unchanged.

        Returns
        -------
        self
        """
        super().fit(X, y, **fit_params)
        # Each entry of ``steps`` is a (name, estimator) pair; take the estimator
        # of the final step.
        _step_name, last_estimator = self.steps[-1]
        self.feature_importances_ = last_estimator.feature_importances_
        return self

# Build the estimator that RFECV refits at every elimination step: a
# single-step pipeline around an XGBoost regressor.
# `error_score` is a scikit-learn cross-validation option, not an XGBoost
# hyper-parameter, so it is not passed to XGBRegressor.
# NOTE(review): tree_method='gpu_hist' needs a GPU runtime; newer XGBoost
# releases spell this tree_method='hist' with device='cuda' — confirm against
# the installed version.
pipe = PipelineRFE([("RF", xgb.XGBRegressor(tree_method='gpu_hist',
                                            objective='reg:squarederror',
                                            random_state=42,
                                            gamma=1,
                                            max_depth=8,
                                            min_child_weight=20,
                                            n_estimators=80))])

# Four contiguous, unshuffled folds. `random_state` only has an effect when
# shuffle=True, and recent scikit-learn raises ValueError if it is set while
# shuffle=False, so it is omitted here.
kfolds = KFold(n_splits=4, shuffle=False)

# Recursive feature elimination, removing one feature per step and scoring
# each candidate feature count with cross-validated negative MSE.
feature_selector_cv = feature_selection.RFECV(pipe, cv=kfolds, step=1, scoring="neg_mean_squared_error", verbose=3)
feature_selector_cv.fit(X, y)

In [None]:
# Plot the cross-validated RMSE as a function of the number of features kept.
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# mean fold score now lives in `cv_results_['mean_test_score']`. Fall back to
# `grid_scores_` only on older releases that lack `cv_results_`.
if hasattr(feature_selector_cv, 'cv_results_'):
    mean_neg_mse = np.asarray(feature_selector_cv.cv_results_['mean_test_score'])
    n_features_tested = feature_selector_cv.cv_results_.get('n_features')
else:
    mean_neg_mse = np.asarray(feature_selector_cv.grid_scores_)
    n_features_tested = None

# The scorer is negative MSE, so negate before taking the square root.
cv_grid_rmse = np.sqrt(-mean_neg_mse)

if n_features_tested is None:
    # With step=1 and the default minimum of one feature, the i-th score
    # (0-based) belongs to the subset containing i + 1 features.
    n_features_tested = np.arange(1, len(cv_grid_rmse) + 1)

plt.plot(n_features_tested, cv_grid_rmse)
plt.title('RMSE versus number of features')
plt.xlabel('Number of features')
plt.ylabel('Cross-validated RMSE')
plt.show()

In [None]:
# Based on analysing the graph and on computational considerations, decide on
# the number of features to keep.
number_of_features_selected = 150

# RFECV assigns rank 1 to every feature in its own optimal subset and ranks
# 2, 3, ... to the eliminated features (later elimination = lower rank).
# A fixed threshold on the rank therefore does not yield a fixed number of
# features, so take the `number_of_features_selected` best-ranked columns.
# The stable sort breaks ties by column position; if fewer features are
# requested than RFECV's optimal subset holds, the rank-1 features are tied and
# the leftmost ones are kept.
feature_ranking = np.asarray(feature_selector_cv.ranking_)
best_ranked_positions = np.argsort(feature_ranking, kind='stable')[:number_of_features_selected]

# Report the chosen features in their original column order.
selected_features = [X.columns[position] for position in sorted(best_ranked_positions)]
print(selected_features)