In [None]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor  # Example model

from sklearn.linear_model import Lasso,ElasticNet
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error


In [None]:
from sklearn.metrics import mean_absolute_error,make_scorer
import numpy as np

def mae_exp(y_true_log, y_pred_log):
    """Mean absolute error computed after exponentiating both inputs.

    Both arguments are passed through ``np.exp`` before the error is
    measured, so the returned value lives on the exponentiated scale.

    Parameters
    ----------
    y_true_log : array-like
        Reference values; exponentiated before scoring.
    y_pred_log : array-like
        Predicted values; exponentiated before scoring.

    Returns
    -------
    The result of ``mean_absolute_error`` on the exponentiated inputs.
    """
    return mean_absolute_error(np.exp(y_true_log), np.exp(y_pred_log))

# Expose the metric as a scorer object usable via ``scoring=``.
# greater_is_better=False declares it a loss (smaller is better).
mae_exp_scorer = make_scorer(mae_exp, greater_is_better=False)

In [None]:
class TargetCorrelation(BaseEstimator, TransformerMixin):
    """Drop features that are only weakly correlated with the target.

    ``fit`` computes the absolute correlation between every column of ``X``
    and ``y``. Columns whose value is strictly below ``threshold`` are
    recorded in ``correlated_features`` and removed by ``transform``.

    Parameters
    ----------
    threshold : float, default=0.01
        Minimum absolute correlation with the target a feature needs in order
        to be kept.

    Attributes
    ----------
    correlated_features : set
        Column labels that ``transform`` removes. Despite the name these are
        the *weakly* correlated columns; the name is kept for compatibility.
    feature_names_ : list or None
        Column labels kept by the fitted transformer, ``None`` before ``fit``.
    """

    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.correlated_features = set()
        self.feature_names_ = None

    def fit(self, X, y):
        """Record which columns of ``X`` fall below the correlation threshold.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix (must not contain the target).
        y : pandas.Series or array-like
            Target values. A Series is aligned with ``X`` on the index; any
            other 1-D input is matched to the rows of ``X`` by position.

        Returns
        -------
        self
        """
        self.feature_names_ = None
        self.correlated_features.clear()

        if isinstance(y, pd.Series):
            target = y
        else:
            # Plain arrays carry no index, so pair them with X row by row.
            target = pd.Series(np.asarray(y).ravel(), index=X.index)

        # Correlate each feature with the target directly. This avoids the
        # full feature-by-feature matrix, does not depend on the target being
        # named 'age', and evaluates every column of X (the previous
        # ``X.columns[:-1]`` loop never tested the last feature).
        strength = X.corrwith(target).abs()

        # NaN correlations (e.g. constant columns) compare False and are kept.
        weak = strength[strength < self.threshold].index
        self.correlated_features.update(weak)
        self.feature_names_ = [
            col for col in X.columns if col not in self.correlated_features
        ]
        return self

    def transform(self, X):
        """Return ``X`` without the columns flagged during ``fit``.

        Flagged columns that are absent from ``X`` are ignored. The fitted
        state is not modified.
        """
        to_drop = [col for col in X.columns if col in self.correlated_features]
        return X.drop(columns=to_drop)

    def get_feature_names_out(self, input_features=None):
        """Return the kept column labels (``None`` before ``fit``).

        ``input_features`` is accepted for scikit-learn API compatibility
        (``Pipeline.get_feature_names_out`` passes it) and is otherwise unused.
        """
        return self.feature_names_

In [None]:
class CorrelationThreshold(BaseEstimator, TransformerMixin):
    """Drop features that are highly correlated with an earlier feature.

    For every pair of columns whose absolute correlation exceeds
    ``threshold``, the column that appears later in ``X`` is flagged and
    removed by ``transform``.

    Parameters
    ----------
    threshold : float, default=0.8
        Absolute correlation strictly above which the later column of a pair
        is dropped.

    Attributes
    ----------
    correlated_features : set
        Column labels that ``transform`` removes.
    feature_names_ : list or None
        Column labels kept by the fitted transformer, ``None`` before ``fit``.
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.correlated_features = set()
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Find the columns to drop.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Feature matrix; wrapped in a DataFrame to compute correlations.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.feature_names_ = None
        self.correlated_features.clear()

        frame = pd.DataFrame(data=X)
        corr_matrix = frame.corr()

        # Vectorised form of "for i: for j < i: |corr[i, j]| > threshold":
        # keep only the strict lower triangle and flag every row (column i)
        # that has at least one hit. NaN entries compare False.
        exceeds = np.abs(corr_matrix.to_numpy()) > self.threshold
        redundant = np.tril(exceeds, k=-1).any(axis=1)
        self.correlated_features.update(corr_matrix.columns[redundant])

        self.feature_names_ = [
            col for col in frame.columns if col not in self.correlated_features
        ]
        return self

    def transform(self, X):
        """Return ``X`` without the columns flagged during ``fit``.

        Flagged columns that are absent from ``X`` are ignored. The fitted
        state is not modified.
        """
        to_drop = [col for col in X.columns if col in self.correlated_features]
        return X.drop(columns=to_drop)

    def get_feature_names_out(self, input_features=None):
        """Return the kept column labels (``None`` before ``fit``).

        ``input_features`` is accepted for scikit-learn API compatibility
        (``Pipeline.get_feature_names_out`` passes it) and is otherwise unused.
        """
        return self.feature_names_

In [None]:
class DataFrameVarianceThreshold(BaseEstimator, TransformerMixin):
    """``VarianceThreshold`` wrapper that returns a labelled DataFrame.

    Parameters
    ----------
    threshold : float, default=0.0
        Passed to ``VarianceThreshold``.

    Attributes
    ----------
    selector : VarianceThreshold
        The wrapped selector; rebuilt on every ``fit``.
    feature_names_ : list or None
        Column labels selected by the fitted selector, ``None`` before ``fit``.
    """

    def __init__(self, threshold=0.0):
        self.threshold = threshold
        # Kept so the attribute exists before fitting; fit() replaces it.
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.feature_names_ = None

    def fit(self, X, y=None):
        """Fit the variance selector on the DataFrame ``X``.

        ``y`` is ignored and present for scikit-learn API compatibility.
        """
        self.feature_names_ = None
        # Rebuild the selector from the *current* parameter value.
        # set_params(threshold=...) -- which is how GridSearchCV applies a
        # candidate -- only updates self.threshold, so a selector created
        # once in __init__ would keep using the constructor value.
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selector.fit(X)
        # Record the columns that survive, not the full input column list.
        self.feature_names_ = X.columns[self.selector.get_support()].tolist()
        return self

    def transform(self, X):
        """Return the selected columns of ``X`` as a DataFrame.

        The index of ``X`` and the labels of the kept columns are preserved.
        """
        kept = X.columns[self.selector.get_support()]
        return pd.DataFrame(self.selector.transform(X), index=X.index, columns=kept)

    def get_feature_names_out(self, input_features=None):
        """Return the selected column labels (``None`` before ``fit``).

        ``input_features`` is accepted for scikit-learn API compatibility
        (``Pipeline.get_feature_names_out`` passes it) and is otherwise unused.
        """
        return self.feature_names_

In [None]:
class DataFrameStandardScaler(BaseEstimator, TransformerMixin):
    """``StandardScaler`` wrapper whose ``transform`` returns a DataFrame.

    The output keeps the index and column labels of the input frame.
    """

    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X``; ``y`` is ignored."""
        self.scaler.fit(X)
        return self

    def transform(self, X):
        """Scale ``X`` and re-attach its index and column labels."""
        return pd.DataFrame(
            self.scaler.transform(X),
            index=X.index,
            columns=X.columns,
        )
    
    

In [None]:
from sklearn.linear_model import LassoCV,LinearRegression,Ridge

class LassoFeatureSelector(BaseEstimator, TransformerMixin):
    """Keep the features that receive a non-zero coefficient from a Lasso fit.

    Parameters
    ----------
    alphas : float, default=1
        Regularisation strength, passed to ``Lasso`` as ``alpha``.
    cv : int, default=5
        Stored but not used by the plain ``Lasso`` model; kept so existing
        callers that pass it keep working.

    Attributes
    ----------
    model : Lasso
        The wrapped estimator; rebuilt on every ``fit``.
    support_ : ndarray of bool or None
        Mask of the selected features, ``None`` before ``fit``.
    feature_names_ : list or None
        Labels of the selected features, ``None`` before ``fit``.
    """

    def __init__(self, alphas=1, cv=5):
        self.alphas = alphas
        self.cv = cv
        # Kept so the attribute exists before fitting; fit() replaces it.
        self.model = Lasso(alpha=alphas, random_state=42)
        self.support_ = None
        self.feature_names_ = None

    def fit(self, X, y):
        """Fit the Lasso model and record which features it keeps.

        Parameters
        ----------
        X : pandas.DataFrame or ndarray
            Feature matrix.
        y : array-like
            Target values.

        Returns
        -------
        self
        """
        self.feature_names_ = None
        self.support_ = None

        # Rebuild the model from the *current* parameter value:
        # set_params(alphas=...) only updates self.alphas, so a model created
        # once in __init__ would ignore grid-search candidates.
        self.model = Lasso(alpha=self.alphas, random_state=42)
        self.model.fit(X, y)

        # A feature is selected when its coefficient is not exactly zero.
        self.support_ = self.model.coef_ != 0

        if isinstance(X, pd.DataFrame):
            self.feature_names_ = X.columns[self.support_].tolist()
        else:
            # transform() accepts arrays, so fit() does too; unnamed inputs
            # get positional names in scikit-learn's "x<i>" style.
            self.feature_names_ = [f"x{i}" for i in np.flatnonzero(self.support_)]
        return self

    def transform(self, X):
        """Return only the selected columns of ``X`` (DataFrame or ndarray)."""
        if isinstance(X, pd.DataFrame):
            return X.loc[:, self.support_]
        return X[:, self.support_]

    def get_feature_names_out(self, input_features=None):
        """Return the selected feature labels (``None`` before ``fit``).

        ``input_features`` is accepted for scikit-learn API compatibility
        (``Pipeline.get_feature_names_out`` passes it) and is otherwise unused.
        """
        return self.feature_names_

In [None]:
# Baseline pipeline: correlation filter -> variance filter -> scaling -> linear regression.
# The two filter thresholds are starting values and can be tuned.
pipeline = Pipeline(
    steps=[
        ('correlation_threshold', CorrelationThreshold(threshold=0.8)),
        ('variance_threshold', DataFrameVarianceThreshold(threshold=0.0001)),
        # ('target_correlation',TargetCorrelation()),
        ('standard_scaler', DataFrameStandardScaler()),
        ('model', LinearRegression()),
    ]
)



In [None]:
# Candidate values per pipeline step, keyed as "<step name>__<parameter>".
param_grid = dict(
    correlation_threshold__threshold=[0.8, 0.9, 1],
    variance_threshold__threshold=[0.0001, 0.001],
    # 'lasso_feature__alphas': np.logspace(-4, -0.5, 20),
    # 'target_correlation__threshold' : [0.001,0.01,0.0001]
)

In [None]:
# dataframe = pd.read_csv(r'C:\Users\User\OneDrive\Desktop\毕业论文2024\Notebook\Data\before_shuffle\transformed_merged_hemisphere.csv')
# dataframe.head()

In [None]:
from sklearn.model_selection import train_test_split

# Available dataset variants; the entry at index 0 is the one loaded below.
filename = [
    'original',
    'transformed_original',
    'merged_hemisphere',
    'transformed_merged_hemisphere',
    'grouped_merged',
]

# Read the selected CSV.
dataframe = pd.read_csv(fr'C:\Users\User\OneDrive\Desktop\毕业论文2024\Notebook\Data\without_fake\{filename[0]}.csv')

# 'age' is the regression target; every other column is a feature.
y = dataframe['age']
X = dataframe.drop(columns='age')

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# %store -r lr_merged_selected
# x_train = x_train[lr_merged_selected]
# x_test = x_test[lr_merged_selected]    


In [None]:
# Columns whose name matches the regex 'interaction'.
cat_features = dataframe.filter(regex='interaction').columns.tolist()

dataframe = dataframe.drop(cat_features, axis=1)

# X and the train/test splits were derived from `dataframe` before this cell
# runs, so dropping from `dataframe` alone never reached the model inputs.
# Apply the same drop to them; errors='ignore' keeps the cell safe to re-run.
X = X.drop(columns=cat_features, errors='ignore')
x_train = x_train.drop(columns=cat_features, errors='ignore')
x_test = x_test.drop(columns=cat_features, errors='ignore')
# from sklearn.preprocessing import OneHotEncoder

# encoder = OneHotEncoder(sparse_output=False,drop='first')
# x_train_encoded = encoder.fit_transform(x_train[cat_features])

# encoded_df = pd.DataFrame(x_train_encoded,columns=encoder.get_feature_names_out())
# encoded_df

# x_train = x_train.reset_index(drop=True)
# x_train = pd.concat([x_train,encoded_df],axis = 1).drop(cat_features,axis = 1)

# x_test_encoded = encoder.transform(x_test[cat_features])
# encoded_df = pd.DataFrame(x_test_encoded,columns = encoder.get_feature_names_out())

# x_test = x_test.reset_index(drop=True)
# x_test = pd.concat([x_test,encoded_df],axis = 1).drop(cat_features,axis = 1)
# x_test

In [None]:
from sklearn.metrics import mean_absolute_error,r2_score
from  scipy.stats import pearsonr

# pipeline.fit(x_train,y_train)
# # Predict using the best model
# y_pred = pipeline.predict(x_test)

# cv_lr = cross_val_score(pipeline, x_train, y_train, cv=8, scoring='neg_mean_absolute_error')


# # Evaluate the model
# mae = mean_absolute_error(y_test, y_pred)
# print("lienar regression : ",np.mean((-cv_lr)))

# print("Test Mean absolute Error:", mae)



In [None]:
# Fit the baseline pipeline on the training split.
pipeline.fit(x_train, y_train)

# 10-fold cross-validated MAE on the training split. The scorer returns
# negated MAE, hence the leading minus sign.
baseline_mae = -cross_val_score(
    pipeline, x_train, y_train, cv=10, scoring='neg_mean_absolute_error'
).mean()
print(baseline_mae)

# Hold-out evaluation on the test split.
y_pred = pipeline.predict(x_test)
mae = mean_absolute_error(y_test, y_pred)
print("Test Mean absolute Error:", mae)
print('linear regression  r2: ', r2_score(y_test, y_pred))
print('linear regression pearson: ', pearsonr(y_test, y_pred))

Currently determining the best hyperparameters for the variance and correlation thresholds, evaluated using vanilla linear regression.

In [None]:
# Exhaustive search over param_grid with 10-fold CV, scored by negated MAE.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='neg_mean_absolute_error', verbose=1)

# Fit the grid search on the training split.
grid_search.fit(x_train, y_train)

# Best parameters and score. best_score_ is the negated MAE, so flip the sign.
# The label previously said "MSE" although the scorer is mean absolute error.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score (MAE):", -grid_search.best_score_)

# Predict on the test split with the best estimator found by the search.
y_pred = grid_search.predict(x_test)

# Hold-out evaluation.
mae = mean_absolute_error(y_test, y_pred)
print("Test Mean absolute Error:", mae)
print('linear regression  r2: ',(r2_score(y_test,y_pred)))
print('linear regression pearson: ',(pearsonr(y_test,y_pred)))

In [None]:
# Hold-out MAE for the most recent predictions stored in `y_pred`.
mae = mean_absolute_error(y_test, y_pred)
print("Test Mean absolute Error:", mae)

In [None]:
# Second pipeline: the baseline filters plus a target-correlation filter,
# followed by scaling and an ElasticNet model with default settings.
pipeline2 = Pipeline(
    steps=[
        ('correlation_threshold', CorrelationThreshold(threshold=0.8)),
        ('variance_threshold', DataFrameVarianceThreshold(threshold=0.0001)),
        ('target_correlation', TargetCorrelation(threshold=0.001)),
        ('standard_scaler', DataFrameStandardScaler()),
        # ('lasso_feature' , LassoFeatureSelector(alphas = 0.0001)),
        ('model', ElasticNet()),
    ]
)


# pipeline2.fit(x_train,y_train)
# print(len(pipeline2.named_steps['correlation_threshold'].get_feature_names_out()))
# print(len(pipeline2.named_steps['variance_threshold'].get_feature_names_out()))
# print(len(pipeline2.named_steps['target_correlation'].get_feature_names_out()))

# # print((pipeline2.named_steps['lasso_feature'].get_feature_names_out()))
# print(pipeline2.named_steps['model'].n_features_in_)

# 10-fold CV of pipeline2 scored with mae_exp_scorer. The scorer negates the
# metric, hence the leading minus sign.
# NOTE(review): mae_exp_scorer exponentiates targets and predictions, which
# presumes `y_train` is on a log scale -- confirm for the dataset loaded above.
baseline_mae = -cross_val_score(
    pipeline2, x_train, y_train, cv=10, scoring=mae_exp_scorer
).mean()
print(baseline_mae)

In [None]:
# Bare last expression: the notebook shows the `pipeline` object as this cell's output.
pipeline