In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, ElasticNetCV

# Elastic-net feature-selection pipeline:
#   1. load the combined train/test table and split it by row position,
#   2. standardise, then drop constant and highly correlated features,
#   3. fit an elastic net and keep the features with non-zero coefficients,
#   4. write the ranked coefficients and the reduced feature matrix to CSV.

# Load the data from CSV
data_ = pd.read_csv('Train_test_combined.csv')

# Extract feature data. The first column is set aside here and reused at the
# end of the script as the row identifier of the exported feature matrix.
data = data_.iloc[:, 1:]

# Define labels. The class sizes are hard-coded, so the rows of the CSV are
# assumed to be ordered as: train ones, train zeros, test ones, test zeros
# -- TODO confirm against how Train_test_combined.csv was assembled.
label1 = np.ones((1288, 1))  # Value can be changed
label2 = np.zeros((1133, 1))
label3 = np.ones((258, 1))   # Value can be changed
label4 = np.zeros((227, 1))

y_train = np.concatenate((label1, label2))
y_test = np.concatenate((label3, label4))

# Derive the split point from the labels instead of repeating it as a magic
# number, and fail loudly if the table does not have one row per label.
n_train = len(y_train)
n_test = len(y_test)
if len(data) != n_train + n_test:
    raise ValueError(
        "Expected {} rows ({} train + {} test) but the CSV contains {}".format(
            n_train + n_test, n_train, n_test, len(data)
        )
    )

# Split the data into training and testing sets (purely positional)
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

# Scale the data; the scaler is fitted on the training rows only so that no
# test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled data to DataFrames. The default integer column labels
# (0..n_features-1) are the column positions within `data`; they survive the
# filtering below and are used later to trace features back to the input.
df_train = pd.DataFrame(X_train_scaled)
df_test = pd.DataFrame(X_test_scaled)

# Remove features with zero standard deviation in the training data
train_std = df_train.std()
zero_std_features = df_train.columns[train_std == 0].tolist()
df_train = df_train.loc[:, train_std != 0]

# Apply the same feature exclusion to the test data
df_test = df_test.loc[:, df_train.columns]

# Drop highly correlated features (correlation > 0.8) in the training data.
# Only the strict upper triangle is inspected, so each pair is seen once and
# the later column of every correlated pair is the one that gets dropped.
threshold = 0.8
cor = df_train.corr().abs()
upper_mask = np.triu(cor.values > threshold, k=1)
high_corr_var = [
    (cor.columns[row], cor.columns[col])
    for row, col in zip(*np.nonzero(upper_mask))
]

to_drop = {later for _, later in high_corr_var}

# Sorted list rather than the raw set so the call is deterministic.
drop_labels = sorted(to_drop)
df_train = df_train.drop(columns=drop_labels)
df_test = df_test.drop(columns=drop_labels)

# Convert the feature-selected training and testing data back to numpy arrays
X_train_selected = df_train.values
X_test_selected = df_test.values

# The estimators expect a 1-D target; flattening here avoids the
# DataConversionWarning while leaving `y_train` itself as an (n, 1) array.
y_train_flat = y_train.ravel()

# Fit ElasticNetCV to the training data for feature selection
model_cv = ElasticNetCV(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    alphas=[0.01, 0.1, 1.0, 10.0],
    cv=10,
)
model_cv.fit(X_train_selected, y_train_flat)

# Train the ElasticNet model with the best alpha and l1_ratio on the training data
model = ElasticNet(alpha=model_cv.alpha_, l1_ratio=model_cv.l1_ratio_)
model.fit(X_train_selected, y_train_flat)

# Get the importance scores (coefficients) from the trained model
importance = model.coef_

# Identify the selected features (non-zero coefficients). These are positions
# within X_train_selected, NOT within the original feature table.
selected_indices = np.where(importance != 0)[0]

# Translate those positions back to the column positions / names of `data`,
# using the integer labels that df_train carried through the filtering steps.
original_positions = np.asarray(df_train.columns)[selected_indices]
original_names = np.asarray(data.columns)[original_positions]

# Create a DataFrame with feature importance, original indices, and absolute
# importance scores. Its default index (0..k-1, written to the CSV below) is
# the column label of the same feature in 'Clathrin10_EN_features.csv'.
importance_df = pd.DataFrame({
    'feature_index': original_positions,
    'feature_name': original_names,
    'importance_score': importance[selected_indices],
    'absolute_importance': np.abs(importance[selected_indices]),
})

# Rank the importance scores based on the absolute values
importance_df['rank'] = importance_df['absolute_importance'].rank(ascending=False)

# Sort the DataFrame by the rank
importance_df = importance_df.sort_values(by='rank')

# Save the importance scores, indices, and ranks to a CSV file
importance_df.to_csv('Clathrin10_EN_importance_ranked.csv', index=True)

# Select the important features based on the selected indices for both training and testing sets
X_train_important = X_train_selected[:, selected_indices]
X_test_important = X_test_selected[:, selected_indices]

# Stack training rows on top of testing rows, i.e. back in the original row order
combined_data = np.concatenate((X_train_important, X_test_important), axis=0)

# Create a DataFrame with the combined data
df_combined = pd.DataFrame(combined_data)

# The train and test slices together cover every row, so the identifiers are
# simply the whole first column of the original table.
combined_indices = data_.iloc[:, 0]

# Assign the correct index to the combined DataFrame
df_combined.index = combined_indices

# Save the combined DataFrame with selected features and the correct index to a CSV file
df_combined.to_csv('Clathrin10_EN_features.csv', index=True, header=True)

print("Shape of the combined feature set:", df_combined.shape)
