In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, VarianceThreshold,SequentialFeatureSelector
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression
import math
from scipy.stats.stats import pearsonr
# Lấy dữ liệu
import os
train_file = '/content/drive/MyDrive/StormSurgeFS/StormVNTraining.csv'
test_file = '/content/drive/MyDrive/StormSurgeFS/StormVNTesting.csv'
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
X_train=train.drop('output', axis=1)
y_train=train['output']
X_test=test.drop('output', axis=1)
y_test=test['output']

# 1. Feature Selection bằng phương pháp Filter (SelectKBest với f_regression)
k_best_f = SelectKBest(score_func=f_regression, k=7)
X_train_kbest_f = k_best_f.fit_transform(X_train, y_train)
X_test_kbest_f = k_best_f.transform(X_test)

# 2. Feature Selection bằng phương pháp Filter (SelectKBest với mutual_info_regression)
k_best_mi = SelectKBest(score_func=mutual_info_regression, k=7)
X_train_kbest_mi = k_best_mi.fit_transform(X_train, y_train)
X_test_kbest_mi = k_best_mi.transform(X_test)

# 3. Feature Selection bằng phương pháp Filter (VarianceThreshold)
var_thresh = VarianceThreshold(threshold=0.0005)
X_train_var = var_thresh.fit_transform(X_train)
X_test_var = var_thresh.transform(X_test)

# 4. Feature Selection bằng Information Gain (mutual_info_regression)
info_gain = mutual_info_regression(X_train, y_train)
selected_features_ig = np.argsort(info_gain)[-7:]
X_train_ig = X_train.iloc[:, selected_features_ig]
X_test_ig = X_test.iloc[:, selected_features_ig]


# 5. Feature Selection bằng Forward Feature Selection
sfs = SequentialFeatureSelector(MLPRegressor(), n_features_to_select=7, direction='forward')
X_train_sfs_forward = sfs.fit_transform(X_train, y_train)
X_test_sfs_forward = sfs.transform(X_test)
# 6. Feature Selection bằng Backward Feature Selection
sfs_backward = SequentialFeatureSelector(MLPRegressor(), n_features_to_select=7, direction='backward')
X_train_sfs_backward = sfs_backward.fit_transform(X_train, y_train)
X_test_sfs_backward = sfs_backward.transform(X_test)
# 7. Feature Selection bằng Random Forest Feature Importance
rf = RandomForestRegressor(n_estimators=70, random_state=42)
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_
selected_features_rf = np.argsort(feature_importances)[-7:]
X_train_rf = X_train.iloc[:, selected_features_rf]
X_test_rf = X_test.iloc[:, selected_features_rf]
# 8. Feature Selection bằng Lasso Regression
lasso = Lasso(alpha=0.02)
lasso.fit(X_train, y_train)
selected_features_lasso = np.argsort(np.abs(lasso.coef_))[-8:]
X_train_lasso = X_train.iloc[:, selected_features_lasso]
X_test_lasso = X_test.iloc[:, selected_features_lasso]

# Huấn luyện mô hình và đánh giá
#for name, X_train_sel, X_test_sel in zip([
#    "KBest f_regression", "KBest mutual_info", "VarianceThreshold", "Information Gain", "Forward Selection", "Backward Selection", "Random Forest Importance", "Lasso Regression"],
#    [X_train_kbest_f, X_train_kbest_mi, X_train_var, X_train_ig, X_train_sfs_forward, X_train_sfs_backward, X_train_rf, X_train_lasso],
#    [X_test_kbest_f, X_test_kbest_mi, X_test_var, X_test_ig, X_test_sfs_forward, X_test_sfs_backward, X_test_rf, X_test_lasso]):
#    #model = LinearRegression()
#    model= MLPRegressor()
#    model.fit(X_train_sel, y_train)
#    y_pred = model.predict(X_test_sel)
#    mse = math.sqrt(mean_squared_error(y_test, y_pred))/(max(y_test)-min(y_test))
#    #nr2=r2_score(y_pred,y_test)/
#    nr2=pearsonr(y_pred, y_test)
#    print(f"{name} - MSE: {mse:.4f}")
#    print(nr2)

  from scipy.stats.stats import pearsonr


In [16]:
# Huấn luyện mô hình và đánh giá
for name, X_train_sel, X_test_sel, selector  in zip([
    "KBest f_regression", "KBest mutual_info", "VarianceThreshold", "Information Gain", "Forward Selection", "Backward Selection", "Random Forest Importance", "Lasso Regression"],
    [X_train_kbest_f, X_train_kbest_mi, X_train_var, X_train_ig, X_train_sfs_forward, X_train_sfs_backward, X_train_rf, X_train_lasso],
    [X_test_kbest_f, X_test_kbest_mi, X_test_var, X_test_ig, X_test_sfs_forward, X_test_sfs_backward, X_test_rf, X_test_lasso],
    [k_best_f, k_best_mi, var_thresh, selected_features_ig, sfs, sfs_backward, selected_features_rf, selected_features_lasso]
                                                    ):
    #model = LinearRegression()
    model= MLPRegressor()
    model.fit(X_train_sel, y_train)
     # Get feature names based on the selector type
    if name in ["KBest f_regression", "KBest mutual_info",  "Lasso Regression","Random Forest Importance"]:
        # These selectors return feature indices or scores, not transformed data
        if isinstance(selector, (SelectKBest)):
            selected_features = selector.get_support(indices=True)
        else:
            selected_features = selector
        feature_names = X_train.columns[selected_features].tolist()
    elif name in ["Forward Selection", "Backward Selection"]:
        # SequentialFeatureSelector has get_feature_names_out method in newer versions
        try:
            feature_names = selector.get_feature_names_out()
            feature_names = feature_names.tolist()  # Convert to list if necessary
        except AttributeError:
            # For older versions, get selected feature indices
            selected_features = selector.get_support(indices=True)
            feature_names = X_train.columns[selected_features].tolist()
    elif name == "VarianceThreshold":  # Add this condition for VarianceThreshold
        feature_names = X_train.columns[selector.get_support()].tolist()
    else: # This will handle 'Information Gain'
        feature_names = X_train.columns[selector].tolist() # Directly use selected_features_ig
    y_pred = model.predict(X_test_sel)
    mse = math.sqrt(mean_squared_error(y_test, y_pred))/(max(y_test)-min(y_test))
    #nr2=r2_score(y_pred,y_test)/
    nr2=pearsonr(y_pred, y_test)
    print(f"{name} - MSE: {mse:.4f}")
    print(nr2)
    print(feature_names) # Moved print statement here to show feature names

KBest f_regression - MSE: 0.1806
PearsonRResult(statistic=0.7147154170224024, pvalue=2.8755425565641744e-13)
['WS', 'SLP', 'DSLP', 'SSL', 'CAP', 'HWS', 'SS']
KBest mutual_info - MSE: 0.1832
PearsonRResult(statistic=0.7010778932707569, pvalue=1.2506143713857203e-12)
['WS', 'WD', 'SLP', 'DSLP', 'SSL', 'LT', 'SS']
VarianceThreshold - MSE: 0.1784
PearsonRResult(statistic=0.7269834458595211, pvalue=7.111039173970798e-14)
['WS', 'WD', 'DSLP', 'SSL', 'LG', 'LT', 'HWS', 'SS']
Information Gain - MSE: 0.1847
PearsonRResult(statistic=0.6968775137426029, pvalue=1.9346097855094963e-12)
['SSL', 'WS', 'WD', 'LT', 'DSLP', 'SS', 'SLP']
Forward Selection - MSE: 0.1777
PearsonRResult(statistic=0.7261054782262567, pvalue=7.878443954691422e-14)
['WS', 'WD', 'LG', 'LT', 'CAP', 'HWS', 'SS']
Backward Selection - MSE: 0.1777
PearsonRResult(statistic=0.7248606564373554, pvalue=9.104417081842465e-14)
['WS', 'WD', 'SLP', 'LG', 'LT', 'HWS', 'SS']
Random Forest Importance - MSE: 0.1823
PearsonRResult(statistic=0.71