# Feature creation and selection

In [38]:
######################## LOAD IN FILES #############################
import os
import pandas as pd
import sys
import importlib

# Absolute path of the project root (the parent of the current working directory)
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Directory holding the BRIGHTEN data files
brighten_dir = os.path.join(project_root, 'BRIGHTEN_data')

# Make the project root importable so `scripts.*` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom scripts and reload them so on-disk edits are picked up
from scripts import preprocessing as pre
from scripts import visualization as vis
from scripts import feature_selection as fs
from scripts import clustering as cl
from scripts import variables as var
# Reload each custom module, in the same order they were imported
for _module in (pre, vis, fs, cl, var):
    importlib.reload(_module)

# Daily-level table; 'dt' is parsed to datetime and unparseable values become NaT
df_alldays_int70 = pd.read_csv(os.path.join(brighten_dir, 'df_alldays_int70.csv')).assign(
    dt=lambda frame: pd.to_datetime(frame['dt'], errors='coerce')
)
# Weekly-level table, loaded as-is
week_df_int70 = pd.read_csv(os.path.join(brighten_dir, 'week_df_int70.csv'))

############ LOAD in PACKAGES  #############
import numpy as np
from sklearn.linear_model import LogisticRegression


################ DEFINE column variables from data ###################
from scripts.variables import id_columns, daily_cols_v1, daily_v2_common 
from scripts.variables import phq2_cols, phq9_cols, weekly_cols, passive_cols, survey_cols
from scripts.variables import df_names, df_names_with_mis


## Create wide versions of each vars list
# Per-variable suffixes; each base variable contributes one name per suffix, in this order.
WIDE_SUFFIXES = ('avg_w1', 'avg_w2', 'slope_w2', 'avg_w4', 'slope_w4')

# NOTE: the iteration variable is deliberately not named `var`, because `var` is the
# alias of the imported `scripts.variables` module and must not be overwritten.
wide_v1_cols = [
    f'{base_col}_{suffix}'
    for base_col in daily_cols_v1
    for suffix in WIDE_SUFFIXES
]

wide_v2_cols = [
    f'{base_col}_{suffix}'
    for base_col in daily_v2_common
    for suffix in WIDE_SUFFIXES
]




  df_alldays_int70 = pd.read_csv(os.path.join(brighten_dir, 'df_alldays_int70.csv'))
  week_df_int70 = pd.read_csv(os.path.join(brighten_dir, 'week_df_int70.csv'))


In [39]:
# One clustered wide table per dataset name, read from 'wide_<name>_clusters.csv'
wide_dfs_clusters = {
    name: pd.read_csv(os.path.join(brighten_dir, f'wide_{name}_clusters.csv'))
    for name in df_names_with_mis
}


In [1]:

from sklearn.feature_selection import VarianceThreshold

# Define feature and target columns.
# Predictors exclude identifier columns and every column whose name contains
# 'depression' or 'phq2' -- which also excludes the target itself.
y_col = 'phq2_bin_avg_w4'
X_cols = [
    col for col in wide_dfs_clusters['v1_day']
    if 'depression' not in col and 'phq2' not in col and col not in id_columns
]

# Create DataFrame with relevant columns
lr_df = wide_dfs_clusters['v1_day'][['num_id', y_col] + X_cols]

# Display feature variances in ascending order (lowest-variance features first)
feature_variances = lr_df[X_cols].var()
print(feature_variances.sort_values())

# Apply Variance Threshold to drop low-variance features
selector = VarianceThreshold(threshold=0.005)  # Adjust as needed
X_selected = selector.fit_transform(lr_df[X_cols])  # Returns a NumPy array

# Names of kept / dropped columns, both in their original column order
support_mask = selector.get_support()
selected_columns = [col for col, keep in zip(X_cols, support_mask) if keep]
dropped_columns = [col for col, keep in zip(X_cols, support_mask) if not keep]

print(f"Dropped low-variance features: {dropped_columns}")

# Convert selected features back to a DataFrame. The index of lr_df is reused so
# the rows stay aligned with the target column when correlating below.
X_filtered = pd.DataFrame(X_selected, columns=selected_columns, index=lr_df.index)

# Absolute correlation of every remaining feature with the target.
# The target is not a column of X_filtered, so it is correlated separately
# instead of being looked up inside the feature-feature matrix.
target_corr = X_filtered.corrwith(lr_df[y_col]).abs().sort_values(ascending=False)
print(target_corr)

# Feature-feature absolute correlation matrix with the diagonal (self-correlation)
# zeroed. Work on an explicit copy rather than mutating `DataFrame.values` in place.
corr_values = X_filtered.corr().abs().to_numpy(copy=True)
np.fill_diagonal(corr_values, 0)
corr_matrix = pd.DataFrame(corr_values, index=selected_columns, columns=selected_columns)
corr_matrix.to_csv(os.path.join(brighten_dir, 'wide_v1Day_highVar_corrMatrix.csv'))

# Define threshold for high correlation (e.g., 0.9)
threshold = 0.9

# Undefined target correlations are treated as 0 so such features lose every comparison
target_strength = target_corr.fillna(0)

# Walk each unordered feature pair exactly once. For a highly correlated pair, drop
# the feature that is less correlated with the target; on a tie the earlier column
# is kept. Features already dropped are skipped so one pair never removes both.
to_drop = set()
for position, col in enumerate(selected_columns):
    if col in to_drop:
        continue
    for feature in selected_columns[position + 1:]:
        if feature in to_drop:
            continue
        # NaN correlations compare False and are therefore never treated as "high"
        if not corr_matrix.at[col, feature] > threshold:
            continue
        if target_strength[col] >= target_strength[feature]:  # Keep the stronger predictor
            to_drop.add(feature)
        else:
            to_drop.add(col)
            break  # `col` is gone; no need to compare it against anything else

# Drop selected features
df_selected = X_filtered.drop(columns=list(to_drop))

print(f"Dropped {len(to_drop)} highly correlated features: {to_drop}")

display(df_selected)

NameError: name 'wide_dfs_clusters' is not defined

In [None]:
# Randomized logistic regression on lr_df, using the current X_cols / y_col

print(f'df is shape {lr_df.shape}')

# Work on copies so the selection routine cannot modify lr_df
X_scaled = lr_df.loc[:, X_cols].copy()
y = lr_df.loc[:, y_col].copy()

# Apply Randomized Logistic Regression
selected_features, feature_importances = fs.randomized_logistic_regression(X_scaled, y)

# Report what the selection returned
print(f"Selected Features: {selected_features}")
print(f"Feature Importances: {feature_importances}")


df is shape (57, 102)


KeyboardInterrupt: 

In [None]:
# Randomized logistic regression on the 'v1_week' table with target 'phq9_bin_avg_w4'

y_cols = 'phq9_bin_avg_w4'
# Predictors: every column that is not an identifier and whose name does not
# contain 'depression', 'phq2' or 'phq9'
X_cols = [
    col for col in wide_dfs_clusters['v1_week']
    if not ('depression' in col or 'phq2' in col or 'phq9' in col or col in id_columns)
]
# Keep only complete rows across id, target and predictors
rlr_df = wide_dfs_clusters['v1_week'][['num_id', y_cols, *X_cols]].dropna()
print(f'df is shape {rlr_df.shape}')

X_scaled = rlr_df.loc[:, X_cols].copy()
y = rlr_df.loc[:, y_cols].copy()

# Apply Randomized Logistic Regression
selected_features, feature_importances = fs.randomized_logistic_regression(X_scaled, y)

# Report what the selection returned
print(f"Selected Features: {selected_features}")
print(f"Feature Importances: {feature_importances}")


KeyError: "['aggregate_communication_avg_w2', 'aggregate_communication_slope_w2', 'aggregate_communication_slope_w4', 'call_count_avg_w2', 'call_count_slope_w2', 'call_count_slope_w4', 'call_duration_avg_w2', 'call_duration_slope_w2', 'call_duration_slope_w4', 'interaction_diversity_avg_w2', 'interaction_diversity_slope_w2', 'interaction_diversity_slope_w4', 'missed_interactions_avg_w2', 'missed_interactions_slope_w2', 'missed_interactions_slope_w4', 'mobility_avg_w2', 'mobility_slope_w2', 'mobility_slope_w4', 'mobility_radius_avg_w2', 'mobility_radius_slope_w2', 'mobility_radius_slope_w4', 'sms_count_avg_w2', 'sms_count_slope_w2', 'sms_count_slope_w4', 'sms_length_avg_w2', 'sms_length_slope_w2', 'sms_length_slope_w4', 'unreturned_calls_avg_w2', 'unreturned_calls_slope_w2', 'unreturned_calls_slope_w4'] not in index"