### Load the dataset

In [None]:
from load_dataset import load_dataset

# Location of the input dataset. The original cell left the keyword argument
# empty (`file_path=`), which is a SyntaxError; a named placeholder keeps the
# cell parseable while still making it obvious what has to be filled in.
DATASET_PATH = ""  # Enter the file path here

if not DATASET_PATH:
    raise ValueError("Set DATASET_PATH to the dataset file before running this cell.")

# NOTE(review): load_dataset is assumed to return a pandas DataFrame with a
# "label" column, since the code below indexes df["label"] and reads df.shape.
df = load_dataset(file_path=DATASET_PATH)

# Quick sanity check: per-class row counts and overall (rows, columns).
label_counts = df["label"].value_counts()
print(label_counts)
print(df.shape)

### Split the dataset

In [None]:
import pandas as pd

# Partition the rows by class so that each class can be sampled on its own.
df_label_1 = df.loc[df["label"] == 1]
df_label_0 = df.loc[df["label"] == 0]

# Hold out 20% of the label=1 rows, then draw the *same number* of label=0
# rows (not 20% of them) so the test set has equal counts of both classes.
df_label_1_test = df_label_1.sample(frac=0.2, random_state=123)
n_test_per_class = len(df_label_1_test)
df_label_0_test = df_label_0.sample(n=n_test_per_class, random_state=123)

# Every row that was not held out (matched by index label) stays in training.
df_label_1_train = df_label_1.drop(index=df_label_1_test.index)
df_label_0_train = df_label_0.drop(index=df_label_0_test.index)

# Stack the per-class pieces back into one test frame and one training frame.
test_data = pd.concat([df_label_1_test, df_label_0_test])
train_data = pd.concat([df_label_1_train, df_label_0_train])

# Feature matrix / target vector for each split.
# X_train and y_train are reassigned later from the balanced training data.
X_train, y_train = train_data.drop(columns=["label"]), train_data["label"]
X_test, y_test = test_data.drop(columns=["label"]), test_data["label"]

# Optional: persist the raw (unfiltered) splits.
# train_data.to_csv("train.csv", index=False)
# test_data.to_csv("test.csv", index=False)

### Do Feature Selection on the Balanced Train Dataset

In [None]:
from load_dataset import balanced_dataset_downsampling
# Build the class-balanced training frame that the Lasso feature selection
# below is fitted on. Only the training split is passed in, so the test rows
# play no part in the balancing.
# NOTE(review): presumably this downsamples the majority class until both
# labels have equal counts — confirm against balanced_dataset_downsampling.
balanced_train_data_downsampling =  balanced_dataset_downsampling(train_data)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

# Replace the earlier X_train / y_train with the class-balanced training data.
y_train = balanced_train_data_downsampling["label"]
X_train = balanced_train_data_downsampling.drop(columns=["label"])

# L1-regularised linear regression on the 0/1 label; the penalty can drive
# individual coefficients to exactly zero, which is what the feature
# selection relies on.
LASSO_ALPHA = 0.01
lasso = Lasso(alpha=LASSO_ALPHA)
lasso.fit(X_train, y_train)

# Score the fitted model on the held-out test features.
y_pred = lasso.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Lasso Regression Mean Squared Error:", test_mse)

# One coefficient per input feature (some may be zero).
print("Lasso Coefficients:", lasso.coef_)

In [None]:
import numpy as np

feature_names = X_train.columns
coefficients = lasso.coef_

# Positions of the coefficients that Lasso left different from zero.
non_zero_indices = np.nonzero(coefficients)[0]
important_features = [
    (feature_names[pos], coefficients[pos]) for pos in non_zero_indices
]

# Report every surviving feature together with its coefficient.
print("Important Features and their Coefficients:")
for feature, coef in important_features:
    print(f"Feature: {feature}, Coefficient: {coef}")

# Keep only the features whose coefficient magnitude exceeds the threshold;
# this drops values that are non-zero but numerically negligible.
threshold = 1e-8
filtered_features = [
    name for name, weight in important_features if abs(weight) > threshold
]

print("\nFeatures with coefficients > 1e-8:")
print(filtered_features)

In [None]:
# Columns to keep: the Lasso-selected features plus the target column.
selected_columns = filtered_features + ['label']

# Apply the same column selection to the full training split and the test split.
train_filtered_df = train_data[selected_columns]
test_filtered_df = test_data[selected_columns]

print(f"Filtered train dataset shape: {train_filtered_df.shape}")
print(f"Filtered test dataset shape: {test_filtered_df.shape}")

In [None]:
# Output locations for the filtered splits; fill these in before running.
train_output_path = ""  # Specify where to save the train df
test_output_path = ""  # Specify where to save the test df

# Write both frames without the index column.
train_filtered_df.to_csv(train_output_path, index=False)
test_filtered_df.to_csv(test_output_path, index=False)