# Setup

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression


In [2]:
import warnings

# Silence every warning category (any Warning subclass) raised from modules whose
# name starts with "sklearn". This covers ConvergenceWarning but is not limited to it.
warnings.filterwarnings("ignore", category=Warning, module="sklearn")

# Helper function

## Hàm hiệu chỉnh tham số khởi tạo dữ liệu


In [3]:
# Build a synthetic regression data set made only of informative features.
def generate_data(n_samples=1000, n_informant_features=20):
    """Draw a random design matrix and a noiseless linear target.

    Args:
        n_samples: number of rows to generate.
        n_informant_features: number of (informative) columns to generate.

    Returns:
        (X, y): X has shape (n_samples, n_informant_features) with standard
        normal entries; y = X . w, where the weights w (the coefficients in
        y = wx + b, with b = 0) are drawn from U[0.8, 1.3).
    """
    shape = (n_samples, n_informant_features)
    features = np.random.randn(*shape)
    weights = np.random.uniform(0.8, 1.3, size=n_informant_features)
    return features, np.dot(features, weights)

# Add noise to regression data.
def addNoise(data, noise_level):
  """Return ``data`` plus element-wise uniform noise.

  Existing values are perturbed; no new samples are added.

  Args:
    data: array to perturb (may be X or y).
    noise_level: a real number; noise is drawn from U(-noise_level, noise_level).

  Returns:
    A new array with the same shape as ``data``.
  """
  jitter = np.random.uniform(low=-noise_level, high=noise_level, size=data.shape)
  return data + jitter

# Add label noise to classification targets.
def addNoise_class(array, noise_level):
    """Flip a random fraction of binary (0/1) labels.

    Existing labels are changed; no new samples are added. The input is left
    untouched: the flips are applied to a copy, so callers that keep a
    reference to the clean labels are not affected.

    Args:
        array: 1-D array-like of 0/1 labels (y).
        noise_level: fraction of labels to flip, between 0 and 1.

    Returns:
        A new array in which ``int(len(array) * noise_level)`` randomly chosen
        labels (sampled without replacement) have been switched.
    """
    noisy = np.array(array, copy=True)

    # Determine the number of elements to switch based on the noise fraction
    num_elements_to_switch = int(len(noisy) * noise_level)

    # Randomly choose distinct indices to switch
    switch_indices = np.random.choice(len(noisy), num_elements_to_switch, replace=False)

    # 0 -> 1 and 1 -> 0 at the chosen indices
    noisy[switch_indices] = 1 - noisy[switch_indices]

    return noisy

# Append new features that are correlated with the existing ones
# (feature collinearity: features that influence each other).
def add_multicollinearity(informant_features, multicollinearity):
    """Append ``multicollinearity`` noisy, rescaled copies of existing columns.

    Args:
        informant_features: 2-D feature matrix (usually X).
        multicollinearity: number of correlated columns to append.

    Returns:
        A new matrix with the original columns followed by the generated ones.
        Each generated column is a randomly chosen original column multiplied
        by a per-column scale from U[0.5, 1) and a per-sample factor from
        U[0.8, 1.3).
    """
    n_samples, n_informant_features = informant_features.shape
    source_columns = np.random.randint(0, n_informant_features, multicollinearity)
    padding = np.zeros((n_samples, multicollinearity))
    informant_features = np.concatenate((informant_features, padding), axis=1)

    for offset, source in enumerate(source_columns):
        scaling_factor = np.random.uniform(0.5, 1)

        # One multiplier per sample, so the copy is correlated but not identical
        random_numbers = np.random.uniform(0.8, 1.3, size=(n_samples, 1)).T

        # Write the collinear feature into its reserved slot after the originals
        informant_features[:, n_informant_features + offset] = (
            informant_features[:, source] * (scaling_factor * random_numbers)
        )
    return informant_features

# Append redundant (noise) features.
def addRedundant(informant_features, n_redundant):
    """Return ``informant_features`` with ``n_redundant`` standard-normal columns appended.

    Args:
        informant_features: 2-D feature matrix.
        n_redundant: number of random columns to add on the right.

    Returns:
        A new matrix of shape (n_samples, n_features + n_redundant).
    """
    row_count, _ = informant_features.shape
    noise_columns = np.random.randn(row_count, n_redundant)
    return np.concatenate((informant_features, noise_columns), axis=1)

In [4]:
def sigmoid(x):
    """Logistic function followed by a hard 0.5 threshold.

    Despite the name, this returns class labels rather than probabilities:
    entries whose logistic value is >= 0.5 become 1 and entries below 0.5
    become 0.

    Args:
        x: numeric scalar or numpy array of logits.

    Returns:
        numpy array of 0/1 values with the shape of ``x``.
    """
    probabilities = np.array(1 / (1 + np.exp(-x)))
    # Both masks are taken before any value is overwritten.
    at_or_above = probabilities >= 0.5
    below = probabilities < 0.5
    probabilities[at_or_above] = 1
    probabilities[below] = 0
    return probabilities

In [5]:
def applyNoise(X_train, X_test, y_train, y_test, X_noise, y_noise):
    """Add uniform noise to a regression split.

    Both feature matrices get noise of level ``X_noise`` and the training
    target gets noise of level ``y_noise``; ``y_test`` is returned unchanged.

    Returns:
        (X_train, X_test, y_train, y_test) after perturbation.
    """
    noisy_X_train = addNoise(X_train, X_noise)
    noisy_X_test = addNoise(X_test, X_noise)
    noisy_y_train = addNoise(y_train, y_noise)
    return (noisy_X_train, noisy_X_test, noisy_y_train, y_test)
def applyNoise_class(X_train, X_test, y_train, y_test,X_noise,y_noise):
    """Add noise to a classification split.

    Both feature matrices get uniform noise of level ``X_noise``; a fraction
    ``y_noise`` of the training labels is flipped; ``y_test`` is returned
    unchanged.

    Returns:
        (X_train, X_test, y_train, y_test) after perturbation.
    """
    noisy_X_train = addNoise(X_train, X_noise)
    noisy_X_test = addNoise(X_test, X_noise)
    noisy_y_train = addNoise_class(y_train, y_noise)
    return (noisy_X_train, noisy_X_test, noisy_y_train, y_test)

def Split_applyNoise(X, y, X_noise, y_noise):
    """Split a regression data set 80/20 (fixed seed 42) and add noise to the split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by ``applyNoise``.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    return applyNoise(*parts, X_noise, y_noise)
def Split_applyNoise_class(X, y, X_noise, y_noise):
    """Split a classification data set 80/20 (fixed seed 42) and add noise to the split.

    Returns:
        (X_train, X_test, y_train, y_test) as produced by ``applyNoise_class``.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=42)
    return applyNoise_class(*parts, X_noise, y_noise)


## Evaluate Function

In [6]:
# Not meant to be called directly -- use r2score / r2score_2.
# Regression
def _r2score(X_train, X_test, y_train, y_test, modi, parameter):
  """Print the R^2 of a linear regression before and after feature selection.

  Args:
    X_train, X_test, y_train, y_test: the (already noised) split.
    modi: feature-selection function called as
      ``modi(X_train, X_test, y_train, y_test, parameter)`` and returning the
      reduced ``(X_train, X_test)``.
    parameter: the single tuning argument forwarded to ``modi``.

  Side effects:
    Prints both scores and saves the split to ``<modi name>_regression.npz``
    in the current working directory.
  """
  X_train_sel, X_test_sel = modi(X_train, X_test, y_train, y_test, parameter)
  regressor = LinearRegression()

  baseline = r2_score(y_test, regressor.fit(X_train, y_train).predict(X_test))
  print("default  :", baseline)

  reduced = r2_score(y_test, regressor.fit(X_train_sel, y_train).predict(X_test_sel))
  print("after FS :", reduced)

  archive_name = '{}_regression.npz'.format(modi.__name__)
  np.savez(archive_name, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
# Classification
def _f1score(X_train, X_test, y_train, y_test, modi, parameter):
  """Print the F1 score of a logistic regression before and after feature selection.

  Args:
    X_train, X_test, y_train, y_test: the (already noised) split.
    modi: feature-selection function called as
      ``modi(X_train, X_test, y_train, y_test, parameter)`` and returning the
      reduced ``(X_train, X_test)``.
    parameter: the single tuning argument forwarded to ``modi``.

  Side effects:
    Prints both scores and saves the split to ``<name>.npz`` in the current
    working directory, where ``<name>`` always ends in ``_classification``.
  """
  X_train_corr, X_test_corr = modi(X_train, X_test, y_train, y_test, parameter)
  model = LogisticRegression(random_state=42)
  model.fit(X_train, y_train)
  print("default  :", f1_score(y_test, model.predict(X_test)))
  model.fit(X_train_corr, y_train)
  print("after FS :", f1_score(y_test, model.predict(X_test_corr)))
  # Selectors shared with the regression experiments (ccFS, lasso, pca, ...)
  # have no task suffix in their name; add it so the archive is named
  # consistently with the "<name>_regression.npz" files of _r2score.
  archive_stem = modi.__name__
  if not archive_stem.endswith('_classification'):
    archive_stem += '_classification'
  np.savez('{}.npz'.format(archive_stem), X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [7]:
# Regression
def r2score(X,y,modi,parameter,X_noise,y_noise):
  """Split ``X, y``, add noise, and report R^2 before/after feature selection.

  Args:
    X, y: full regression data set.
    modi: the feature-selection function to evaluate.
    parameter: the tuning parameter of that selection method.
    X_noise: uniform noise level applied to the features.
    y_noise: uniform noise level applied to the training target.
  """
  noisy_split = Split_applyNoise(X, y, X_noise, y_noise)
  X_train, X_test, y_train, y_test = noisy_split
  _r2score(X_train, X_test, y_train, y_test, modi, parameter)

# Classification
def f1score(X,y,modi,parameter,X_noise,y_noise):
  """Split ``X, y``, add noise, and report F1 before/after feature selection.

  Args:
    X, y: full classification data set.
    modi: the feature-selection function to evaluate.
    parameter: the tuning parameter of that selection method.
    X_noise: uniform noise level applied to the features.
    y_noise: fraction of training labels to flip.
  """
  noisy_split = Split_applyNoise_class(X, y, X_noise, y_noise)
  X_train, X_test, y_train, y_test = noisy_split
  _f1score(X_train, X_test, y_train, y_test, modi, parameter)


In [8]:
# Regression
def r2score_2( X_train, X_test, y_train, y_test,modi,parameter,X_noise,y_noise):
  """Variant of ``r2score`` for data that is already split into train/test.

  Args:
    X_train, X_test, y_train, y_test: pre-built regression split.
    modi: the feature-selection function to evaluate.
    parameter: the tuning parameter of that selection method.
    X_noise: uniform noise level applied to the features.
    y_noise: uniform noise level applied to the training target.
  """
  noisy_split = applyNoise(X_train, X_test, y_train, y_test, X_noise, y_noise)
  _r2score(*noisy_split, modi, parameter)

# Classification
def f1score_2( X_train, X_test, y_train, y_test,modi,parameter,X_noise,y_noise):
  """Variant of ``f1score`` for data that is already split into train/test.

  Args:
    X_train, X_test, y_train, y_test: pre-built classification split.
    modi: the feature-selection function to evaluate.
    parameter: the tuning parameter of that selection method.
    X_noise: uniform noise level applied to the features.
    y_noise: fraction of training labels to flip.
  """
  noisy_split = applyNoise_class(X_train, X_test, y_train, y_test, X_noise, y_noise)
  _f1score(*noisy_split, modi, parameter)


## Hàm cho từng mô hình Feature Selection

In [9]:
#1. Correlation coefficient (Pearson)
def ccFS(X_train, X_test, y_train, y_test,threshhold=0.5):
    """Drop every feature that is strongly correlated with an earlier feature.

    The absolute Pearson correlation matrix is computed on ``X_train`` only.
    Column j is removed when |corr(i, j)| > ``threshhold`` for any i < j.
    ``y_train`` and ``y_test`` are unused.

    Returns:
        (X_train_corr, X_test_corr) restricted to the kept columns.
    """
    abs_corr = np.abs(pd.DataFrame(X_train).corr().values)
    n_features = abs_corr.shape[0]
    # Only pairs above the diagonal (i < j) count; the later column of a pair is dropped.
    dropped = np.triu(abs_corr > threshhold, k=1).any(axis=0)
    kept_columns = [col for col in range(n_features) if not dropped[col]]
    return (X_train[:, kept_columns], X_test[:, kept_columns])

In [10]:
from sklearn.feature_selection import VarianceThreshold
# 2. Variance Threshold
def varianceThreshold(X_train, X_test, y_train, y_test,threshhold):
    """Keep the columns selected by ``VarianceThreshold(threshold=threshhold)``.

    The selector is fitted on ``X_train`` only; ``y_train`` and ``y_test`` are
    unused.

    Returns:
        (X_train, X_test) restricted to the selected columns.
    """
    selector = VarianceThreshold(threshold=threshhold)
    selector.fit(X_train)
    keep_mask = selector.get_support()
    return X_train[:, keep_mask], X_test[:, keep_mask]

In [11]:
# 3. Mutual Information
from sklearn.feature_selection import (SelectKBest
                                      ,mutual_info_regression,mutual_info_classif)
def mutualInformation(X_train, X_test, y_train, y_test,k):
  """Keep the ``k`` best features ranked by mutual information (regression target).

  The selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_mi, X_test_mi) reduced to the selected columns.
  """
  top_k = SelectKBest(mutual_info_regression, k=k)
  top_k.fit(X_train, y_train)
  return (top_k.transform(X_train), top_k.transform(X_test))
def mutualInformation_classification(X_train, X_test, y_train, y_test,k):
  """Keep the ``k`` best features ranked by mutual information (class target).

  The selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_mi, X_test_mi) reduced to the selected columns.
  """
  top_k = SelectKBest(mutual_info_classif, k=k)
  top_k.fit(X_train, y_train)
  return (top_k.transform(X_train), top_k.transform(X_test))

In [12]:
# 4. Forward Selection
from sklearn.feature_selection import  SequentialFeatureSelector
def forwardSelector(X_train, X_test, y_train, y_test,n_features_to_select):
  """Forward sequential feature selection driven by a linear regression.

  The selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_forward, X_test_forward) reduced to the selected columns.
  """
  sfs = SequentialFeatureSelector(
      LinearRegression(),
      direction='forward',
      n_features_to_select=n_features_to_select,
  )
  sfs.fit(X_train, y_train)
  return (sfs.transform(X_train), sfs.transform(X_test))

def forwardSelector_classification(X_train, X_test, y_train, y_test,n_features_to_select):
  """Forward sequential feature selection driven by a logistic regression.

  The estimator uses ``max_iter=1000`` and ``random_state=42``. The selector is
  fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_forward, X_test_forward) reduced to the selected columns.
  """
  estimator = LogisticRegression(max_iter=1000, random_state=42)
  sfs = SequentialFeatureSelector(
      estimator,
      direction='forward',
      n_features_to_select=n_features_to_select,
  )
  sfs.fit(X_train, y_train)
  return (sfs.transform(X_train), sfs.transform(X_test))


In [47]:
#5. Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE as RFe
def RFE(X_train,X_test, y_train, y_test,threshhold=0.5):
  """Recursive feature elimination with a linear regression estimator.

  ``threshhold`` is forwarded as ``n_features_to_select`` of scikit-learn's
  RFE (imported here under the alias ``RFe``, since this function takes the
  name ``RFE``); 10 features are removed per iteration. ``y_test`` is unused.

  Returns:
    (X_train_rfe, X_test_rfe) reduced to the selected columns.
  """
  eliminator = RFe(estimator=LinearRegression(), n_features_to_select=threshhold, step=10)
  eliminator.fit(X_train, y_train)
  return (eliminator.transform(X_train), eliminator.transform(X_test))

def RFE_classification(X_train,X_test, y_train, y_test,threshhold=0.5):
  """Recursive feature elimination with a logistic regression estimator.

  ``threshhold`` is forwarded as ``n_features_to_select`` of scikit-learn's
  RFE (alias ``RFe``); 10 features are removed per iteration. ``y_test`` is
  unused.

  Returns:
    (X_train_rfe, X_test_rfe) reduced to the selected columns.
  """
  estimator = LogisticRegression(random_state=42)
  eliminator = RFe(estimator=estimator, n_features_to_select=threshhold, step=10)
  eliminator.fit(X_train, y_train)
  return (eliminator.transform(X_train), eliminator.transform(X_test))

In [14]:
# 6. LASSO
from sklearn.linear_model import  LassoCV
def lasso(X_train,X_test, y_train, y_test,threshhold=5):
  """Select features with a cross-validated LASSO model.

  Args:
    X_train, y_train: training split used to fit the selector.
    X_test: test features, reduced with the fitted selector.
    y_test: unused; present so all selectors share one signature.
    threshhold: despite the name, this is passed to ``LassoCV`` as ``cv``
      (the cross-validation setting, e.g. the number of folds). The default
      is 5, scikit-learn's own default; the previous default of 0.5 was not a
      valid ``cv`` value and made the call fail when the argument was omitted.

  Returns:
    (X_train_lasso, X_test_lasso) reduced to the columns kept by
    ``SelectFromModel``.
  """
  lasso_selector = SelectFromModel(LassoCV(cv=threshhold, max_iter=1000))
  X_train_lasso = lasso_selector.fit_transform(X_train, y_train)
  X_test_lasso = lasso_selector.transform(X_test)
  return (X_train_lasso,X_test_lasso)

In [15]:
# 7. Ridge Regression
from sklearn.linear_model import Ridge,RidgeCV,RidgeClassifier
def ridge(X_train,X_test, y_train, y_test,alpha):
  """Select features with a ridge regression model.

  Args:
    X_train, y_train: training split used to fit the selector.
    X_test: test features, reduced with the fitted selector.
    y_test: unused; present so all selectors share one signature.
    alpha: regularisation strength of the ridge model. It is now actually
      applied (previously the argument was accepted but ignored because a
      default ``RidgeCV()`` was used), matching ``ridge_classification``.

  Returns:
    (X_train_ridge, X_test_ridge) reduced to the columns kept by
    ``SelectFromModel``.
  """
  ridge_selector = SelectFromModel(Ridge(alpha=alpha))
  X_train_ridge = ridge_selector.fit_transform(X_train, y_train)
  X_test_ridge = ridge_selector.transform(X_test)
  return (X_train_ridge,X_test_ridge)
def ridge_classification(X_train,X_test, y_train, y_test,alpha):
  """Select features with ``SelectFromModel`` around ``RidgeClassifier(alpha=alpha)``.

  The selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_ridge, X_test_ridge) reduced to the selected columns.
  """
  selector = SelectFromModel(RidgeClassifier(alpha=alpha))
  selector.fit(X_train, y_train)
  return (selector.transform(X_train), selector.transform(X_test))

In [16]:
#8. Elastic Net
from sklearn.linear_model import ElasticNetCV
def elastic(X_train,X_test, y_train, y_test,cv):
    """Select features with ``SelectFromModel`` around ``ElasticNetCV(cv=cv)``.

    The selector is fitted on the training split; ``y_test`` is unused.

    Returns:
        (X_train_elastic_net, X_test_elastic_net) reduced to the selected columns.
    """
    selector = SelectFromModel(ElasticNetCV(cv=cv))
    selector.fit(X_train, y_train)
    return (selector.transform(X_train), selector.transform(X_test))


In [17]:
# 9. Random Forest
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
def randomForest(X_train,X_test, y_train, y_test,n_estimators):
    """Select features with ``SelectFromModel`` around a random-forest regressor.

    The forest uses ``n_estimators`` trees and ``random_state=42``. The
    selector is fitted on the training split; ``y_test`` is unused.

    Returns:
        (X_train_rf, X_test_rf) reduced to the selected columns.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    selector = SelectFromModel(forest)
    selector.fit(X_train, y_train)
    return (selector.transform(X_train), selector.transform(X_test))
def randomForest_classification(X_train,X_test, y_train, y_test,n_estimators):
    """Select features with ``SelectFromModel`` around a random-forest classifier.

    The forest uses ``n_estimators`` trees and ``random_state=42``. The
    selector is fitted on the training split; ``y_test`` is unused.

    Returns:
        (X_train_rf, X_test_rf) reduced to the selected columns.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    selector = SelectFromModel(forest)
    selector.fit(X_train, y_train)
    return (selector.transform(X_train), selector.transform(X_test))


In [18]:
# 10. GBM
from sklearn.ensemble import  GradientBoostingClassifier,GradientBoostingRegressor
def gbm(X_train,X_test, y_train, y_test,n_estimators):
  """Select features with ``SelectFromModel`` around a gradient-boosting regressor.

  The booster uses ``n_estimators`` stages and ``random_state=42``. The
  selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_gbm, X_test_gbm) reduced to the selected columns.
  """
  booster = GradientBoostingRegressor(n_estimators=n_estimators, random_state=42)
  selector = SelectFromModel(booster)
  selector.fit(X_train, y_train)
  return (selector.transform(X_train), selector.transform(X_test))
def gbm_classification(X_train,X_test, y_train, y_test,n_estimators):
  """Select features with ``SelectFromModel`` around a gradient-boosting classifier.

  The booster uses ``n_estimators`` stages and ``random_state=42``. The
  selector is fitted on the training split; ``y_test`` is unused.

  Returns:
    (X_train_gbm, X_test_gbm) reduced to the selected columns.
  """
  booster = GradientBoostingClassifier(n_estimators=n_estimators, random_state=42)
  selector = SelectFromModel(booster)
  selector.fit(X_train, y_train)
  return (selector.transform(X_train), selector.transform(X_test))

In [19]:
# 11. Factor Analysis
from sklearn.decomposition import FactorAnalysis
def factorAnalysis(X_train,X_test, y_train, y_test,n_components):
  """Project the features onto ``n_components`` latent factors.

  ``FactorAnalysis`` (``random_state=42``) is fitted on ``X_train`` only; the
  targets are unused.

  Returns:
    (X_train_fa, X_test_fa): both splits in factor space.
  """
  reducer = FactorAnalysis(n_components=n_components, random_state=42)
  train_factors = reducer.fit_transform(X_train)
  test_factors = reducer.transform(X_test)
  return (train_factors, test_factors)

In [20]:
# 12. PCA
from sklearn.decomposition import PCA
def pca(X_train,X_test, y_train, y_test,n_components):
  """Project the features onto the first ``n_components`` principal components.

  ``PCA`` (``random_state=42``) is fitted on ``X_train`` only; the targets are
  unused.

  Returns:
    (X_train_pca, X_test_pca): both splits in component space.
  """
  reducer = PCA(n_components=n_components, random_state=42)
  train_components = reducer.fit_transform(X_train)
  test_components = reducer.transform(X_test)
  return (train_components, test_components)

In [21]:
# 13. ICA
from sklearn.decomposition import FastICA
def ica(X_train,X_test, y_train, y_test,n_components):
  """Project the features onto ``n_components`` independent components.

  ``FastICA`` (``random_state=42``) is fitted on ``X_train`` only; the targets
  are unused.

  Returns:
    (X_train_ica, X_test_ica): both splits in component space.
  """
  reducer = FastICA(n_components=n_components, random_state=42)
  train_sources = reducer.fit_transform(X_train)
  test_sources = reducer.transform(X_test)
  return (train_sources, test_sources)

# Khởi tạo dữ liệu và đánh giá

##1. Correlation Coefficient (Pearson)

In [22]:
# Build the data set for the regression model:
# 50 informative features plus 100 collinear columns derived from them.
X,y = generate_data(n_samples=1000, n_informant_features=50)
X=add_multicollinearity(X,multicollinearity=100)
# ccFS with correlation threshold 0.3; X noise level 0.25, y noise level 10.
r2score(X,y,ccFS,0.3,0.25,10)

default  : 0.8834797067089832
after FS : 0.9534578724841646


In [23]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=50,   # Number of features
    n_informative=50,  # Number of informative features
    n_redundant=0,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Append 100 columns that are collinear with the 50 generated ones.
X=add_multicollinearity(X,100)

# ccFS with correlation threshold 0.3; X noise level 1, 20% of training labels flipped.
f1score(X,y,ccFS,0.3,1,0.2)

default  : 0.6564102564102564
after FS : 0.6829268292682927


## 2. Variance Threshold

In [24]:
def idTypeValue(true_value,shape):
  """Generate random categorical ids together with the value each id stands for.

  Args:
    true_value: sequence of numbers; ``true_value[k]`` is the value that id
      ``k`` represents.
    shape: shape of the arrays to generate.

  Returns:
    (ids, ids_value): ``ids`` holds integers drawn uniformly from
    ``range(len(true_value))``; ``ids_value`` has the same shape and integer
    dtype, with every id ``k`` replaced by ``true_value[k]``.
  """
  ids=np.random.randint(0, len(true_value), shape)
  # Map all ids in a single lookup. Rewriting the array in place one id at a
  # time corrupts entries whenever a mapped value equals a later id
  # (e.g. true_value=[1, 5] would turn every 0 into 5).
  # The cast keeps the integer dtype the result has always had.
  lookup=np.asarray(true_value).astype(ids.dtype)
  ids_value=lookup[ids]
  return (ids,ids_value)

In [25]:
# Variance-threshold regression experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients  =np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); y is computed from the values those ids stand for (10 / 6).
X_train,X_train_value=idTypeValue([10,6],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
y_train = np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients)

#Create Test dataset
# Test ids stand for different values (3 / 6), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([3,6],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients)

# varianceThreshold with threshold 0.9; no noise added to X or y.
r2score_2(X_train,X_test,y_train,y_test,varianceThreshold,0.9,0,0)

default  : -0.9239270353504541
after FS : 0.852196990809972


In [26]:
# Variance-threshold classification experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients  =np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); the labels come from the values those ids stand for (-30 / 30).
X_train,X_train_value=idTypeValue([-30,30],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
# sigmoid() here returns hard 0/1 labels, not probabilities.
y_train = sigmoid(np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients))

#Create Test dataset
# Test ids stand for different values (60 / -60), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([60,-60],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = sigmoid( np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients))

# varianceThreshold with threshold 0.9; no feature noise, no label flips.
f1score_2(X_train,X_test,y_train,y_test,varianceThreshold,0.9,0,0)

default  : 0.6666666666666666
after FS : 0.7254901960784313


## 3. Mutual Information

In [27]:
# Mutual-information regression experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients = np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); y is computed from the values those ids stand for (10 / 6).
X_train,X_train_value=idTypeValue([10,6],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
y_train = np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients)

#Create Test dataset
# Test ids stand for different values (3 / 6), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([3,6],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients)

# mutualInformation keeping k=30 features; no noise added to X or y.
r2score_2(X_train, X_test, y_train, y_test, mutualInformation,30,0,0)

default  : -1.220590587215435
after FS : 0.37825464595848923


In [28]:
# Classification data set for the mutual-information experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=100,   # Number of features
    n_informative=50,  # Number of informative features
    n_redundant=50,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Append 100 collinear columns (200 columns in total).
X=add_multicollinearity(X,100)

# Keep k=50 features; X noise level 0.1, 15% of training labels flipped.
f1score(X,y,mutualInformation_classification,50,0.1,0.15)

default  : 0.7142857142857143
after FS : 0.75


## 4. Forward Selection

In [29]:
# Build the data set for the regression model: 20 informative + 180 noise features.

X,y=  generate_data(n_samples=1000, n_informant_features=20)
X=addRedundant(X,180)
# Forward selection of 20 features; X noise level 1, y noise level 1.
r2score(X,y,forwardSelector,20,1,1)

default  : 0.727738380216751
after FS : 0.7638232702464407


In [30]:
# Classification data set for the forward-selection experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=20,  # Number of informative features
    n_redundant=180,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Forward selection of 20 features; X noise level 1, 5% of training labels flipped.
f1score(X,y,forwardSelector_classification,20,1,0.05)


default  : 0.6994535519125683
after FS : 0.7624309392265194


##5. Recursive Feature Elimination (RFE)

In [48]:
# Build the data set for the regression model: 20 informative + 180 noise features.

X,y = generate_data(n_samples=1000, n_informant_features=20)
X=addRedundant(X,180)
# RFE down to 20 features; X noise level 1, y noise level 1.
r2score(X,y,RFE,20,1,1)

default  : 0.7207058720588579
after FS : 0.7803162823583533


In [50]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=20,  # Number of informative features
    n_redundant=180,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# RFE down to 20 features; X noise level 0.5, 10% of training labels flipped.
f1score(X,y,RFE_classification, 20,0.5,0.1)



default  : 0.6979166666666666
after FS : 0.7668393782383419


## 6. Lasso

In [72]:
# Build the data set for the regression model: 40 informative + 160 noise features.
X,y=  generate_data(n_samples=1000, n_informant_features=40)
X=addRedundant(X,160)
# The value 2 is the parameter handed to lasso(); X noise level 0.75, y noise level 2.
r2score(X, y, lasso, 2,0.75,2)

default  : 0.7172857564972426
after FS : 0.758852658194994


In [52]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=40,  # Number of informative features
    n_redundant=160,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# The same lasso() selector as in the regression run is applied to the class labels;
# the value 40 is the parameter handed to it. X noise level 0.75, 10% of labels flipped.
f1score(X,y,lasso,40,0.75,0.1)

default  : 0.7539267015706806
after FS : 0.8085106382978724


## 7. Ridge

In [53]:
# Build the data set for the regression model: 10 informative + 40 noise features.
X,y = generate_data(n_samples=1000, n_informant_features=10)
X = addRedundant(X,n_redundant=40)
# ridge selector called with parameter 0.9; X noise level 0.25, y noise level 8.
r2score(X, y, ridge, 0.9,0.25,8)


default  : 0.7326050213267283
after FS : 0.8297030991790406


In [68]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=100,   # Number of features
    n_informative=40,  # Number of informative features
    n_redundant=60,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)

# RidgeClassifier-based selection with alpha=0.5; X noise level 0.5, 20% of labels flipped.
f1score(X,y,ridge_classification, 0.5,0.5,0.2)

default  : 0.7684210526315789
after FS : 0.7853403141361256


## 8. Elastic Net

In [55]:
# Elastic-net regression experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients = np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); y is computed from the values those ids stand for (10 / 6).
X_train,X_train_value=idTypeValue([10,6],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
y_train = np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients)

#Create Test dataset
# Test ids stand for different values (3 / 6), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([3,6],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients)

# Elastic-net selection with cv=10; no noise added to X or y.
r2score_2(X_train,X_test,y_train,y_test,elastic,10,0,0)

default  : -0.733043125286065
after FS : 0.9140132680122369


In [56]:
# Classification data set for the elastic-net experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=40,  # Number of informative features
    n_redundant=160,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# The regression-style elastic() selector is applied to the class labels with cv=10;
# X noise level 0.75, 10% of training labels flipped.
f1score(X,y,elastic,10,0.75,0.1)

default  : 0.7000000000000001
after FS : 0.8167539267015707


## 9. Random Forest

In [57]:
# Random-forest regression experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients  =np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); y is computed from the values those ids stand for (10 / 6).
X_train,X_train_value=idTypeValue([10,6],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
y_train = np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients)

#Create Test dataset
# Test ids stand for different values (3 / 6), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([3,6],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients)

# Random-forest selection with 100 trees; no noise added to X or y.
r2score_2(X_train,X_test,y_train,y_test,randomForest,100,0,0)

default  : -0.7816656851779029
after FS : 0.8585368720118308


In [58]:
# Classification data set for the random-forest experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=40,  # Number of informative features
    n_redundant=0,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Appends 160 more random columns to the 200 generated above (360 columns in total).
X=addRedundant(X,160)
# Random-forest selection with 100 trees; X noise level 0.75, 20% of labels flipped.
f1score(X,y,randomForest_classification,100,0.75,0.2)

default  : 0.6792452830188679
after FS : 0.7281553398058253


## 10. GBM

In [59]:
# Gradient-boosting regression experiment: 40 id-type columns + 20 float columns.
n_id_data_type= 40
n_float_data_type=20

true_coefficients  =np.concatenate((np.random.uniform(0.1,0.2, size=n_id_data_type),np.random.uniform(0.8,1.3, size=n_float_data_type))) # coefficients w in y = wx + b
#Create Train dataset
# X stores the raw ids (0/1); y is computed from the values those ids stand for (10 / 6).
X_train,X_train_value=idTypeValue([10,6],(800,n_id_data_type))
random_floats = np.random.uniform(-10, 10, (800, n_float_data_type))
X_train = np.concatenate((X_train,random_floats),axis=1)
y_train = np.dot(np.concatenate((X_train_value,random_floats),axis=1), true_coefficients)

#Create Test dataset
# Test ids stand for different values (3 / 6), the ids stored in X are shifted by +4 and
# the floats span a wider range. NOTE(review): looks like a deliberate train/test shift -- confirm.
X_test,X_test_value=idTypeValue([3,6],(200,n_id_data_type))
random_floats = np.random.uniform(-20, 20, (200, n_float_data_type))
X_test = np.concatenate((X_test+4,random_floats),axis=1)
y_test = np.dot(np.concatenate((X_test_value,random_floats),axis=1), true_coefficients)

# Gradient-boosting selection with 100 estimators; no noise added to X or y.
r2score_2(X_train,X_test,y_train,y_test,gbm,100,0,0)

default  : -0.861312711670335
after FS : 0.850827042714563


In [60]:
# Classification data set for the gradient-boosting experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=200,   # Number of features
    n_informative=40,  # Number of informative features
    n_redundant=160,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Gradient-boosting selection with 40 estimators; X noise level 0.75, 10% of labels flipped.
f1score(X,y,gbm_classification,40,0.75,0.1)

default  : 0.7486631016042781
after FS : 0.8397790055248618


## 11. Factor Analysis

In [61]:
# Regression data set: 20 informative features plus 80 collinear columns.
X,y=  generate_data(n_samples=1000, n_informant_features=20)
X=add_multicollinearity(X,multicollinearity=80)
# Factor analysis with 20 components; X noise level 1, y noise level 10.
r2score(X,y,factorAnalysis,20,1,10)

default  : 0.7063078562280405
after FS : 0.8089911586314348


In [62]:

# Classification data set for the factor-analysis experiment.
X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=100,   # Number of features
    n_informative=20,  # Number of informative features
    n_redundant=0,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Append 80 collinear columns (180 columns in total).
X=add_multicollinearity(X,80)
# Factor analysis with 40 components; X noise level 3, 30% of training labels flipped.
f1score(X,y,factorAnalysis,40,3,0.3)

default  : 0.6137566137566138
after FS : 0.7333333333333334


## 12. PCA

In [63]:
# Build the data set for the regression model: 20 informative + 180 collinear columns.

X,y=  generate_data(n_samples=1000, n_informant_features=20)
X=add_multicollinearity(X,multicollinearity=180)
#X=addRedundant(X,20)
print(X.shape)
# PCA with 20 components; X noise level 2, y noise level 1.
r2score(X,y,pca,20,2,1)



(1000, 200)
default  : 0.7369638733490765
after FS : 0.8142097326971622


In [64]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=100,   # Number of features
    n_informative=100,  # Number of informative features
    n_redundant=0,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Append 100 collinear columns (200 columns in total).
X=add_multicollinearity(X,100)

# PCA with 100 components; X noise level 2, 10% of training labels flipped.
f1score(X,y,pca,100,2,0.1)


default  : 0.7171717171717172
after FS : 0.7653061224489796


## 13. ICA

In [65]:
# Build the data set for the regression model: 100 informative + 100 collinear columns.

X,y=  generate_data(n_samples=1000, n_informant_features=100)
X=add_multicollinearity(X,multicollinearity=100)
# ICA with 100 components; X noise level 1, y noise level 2.
r2score(X,y,ica,100,1,2)

default  : 0.7721475272160343
after FS : 0.8008790634811767


In [66]:
# Build the data set for the classification model.

X, y = make_classification(
    n_samples=1000,  # Number of samples
    n_features=100,   # Number of features
    n_informative=100,  # Number of informative features
    n_redundant=0,    # Number of redundant features
    n_classes=2,      # Number of classes
    random_state=42   # Random seed for reproducibility
)
# Append 100 collinear columns (200 columns in total).
X=add_multicollinearity(X,100)
# ICA with 100 components; X noise level 1, 10% of training labels flipped.
f1score(X,y,ica,100,1,0.1)

default  : 0.7609756097560975
after FS : 0.7759562841530053


In [76]:
# Colab-only cell: archive the folder /content/z and download the zip through the browser.
# NOTE(review): the evaluation helpers write their .npz files with a relative path, so this
# presumably relies on those files having been moved into /content/z by hand -- confirm.
!zip -r /content/z.zip /content/z
from google.colab import files
files.download("/content/z.zip")

  adding: content/z/ (stored 0%)
  adding: content/z/elastic_regression.npz (deflated 63%)
  adding: content/z/ridge_regression.npz (deflated 4%)
  adding: content/z/RFE_regression.npz (deflated 4%)
  adding: content/z/mutualInformation_regression.npz (deflated 63%)
  adding: content/z/ica_regression.npz (deflated 4%)
  adding: content/z/forwardSelector_classification.npz (deflated 4%)
  adding: content/z/ccFS_classification.npz (deflated 4%)
  adding: content/z/varianceThreshold_regression.npz (deflated 63%)
  adding: content/z/gbm_regression.npz (deflated 63%)
  adding: content/z/forwardSelector_regression.npz (deflated 4%)
  adding: content/z/ridge_classification.npz (deflated 4%)
  adding: content/z/lasso_regression.npz (deflated 4%)
  adding: content/z/varianceThreshold_classification.npz (deflated 64%)
  adding: content/z/RFE_classification.npz (deflated 4%)
  adding: content/z/mutualInformation_classification.npz (deflated 4%)
  adding: content/z/ccFS_regression.npz (deflated 4%

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>