In [None]:
# packages
import os
import sys
from itertools import product
from typing import Any, Callable, Dict, List, Optional, Union

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

# Colab-only: mount Google Drive at /content/drive so the dataset stored there can be read.
# NOTE(review): later cells use the relative path "drive/My Drive/...", which presumably
# relies on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'keras_tuner'

In [None]:
# Plan:

# Split the data into data[train, test, validation]
# remove WLB score from data
# Create an MLP using the train and test data
# use the MLP to get embeddings of the validation data
# use that to make the similarity graph (size is same as validation set)
# Clustering on similarity graph (louvain and spectral methods) (GNN)
# Analyze the results of the clusters: Get the WLB scores of each cluster, see similarities in each cluster

# research:
# graph clustering based prediction tasks
# node classification
# node property prediction
# interpretable prediction for graphs
# FAISS -- pairwise similarity computation


# expectation:
# subset of factors to analyze


In [None]:
# TODO: The folder below MIGHT differ per user -- adjust it to wherever the data lives in your Drive.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = "576_Project/data"
GOOGLE_DRIVE_PATH = os.path.join("drive", "My Drive", GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
sys.path.append(GOOGLE_DRIVE_PATH)

# The listing should contain Wellbeing_and_lifestyle_data_Kaggle.csv
drive_files = os.listdir(GOOGLE_DRIVE_PATH)
print(drive_files)

assert 'Wellbeing_and_lifestyle_data_Kaggle.csv' in drive_files, "Data not found, check your file paths!"

# Read the raw CSV and preview its first rows
full_data_path = os.path.join(GOOGLE_DRIVE_PATH, "Wellbeing_and_lifestyle_data_Kaggle.csv")
df = pd.read_csv(full_data_path)
print(df.head())

In [None]:
def load_and_preprocess_data(google_drive_path: str, filename: str) -> pd.DataFrame:
    '''
    Load the dataset CSV and convert every column to a numeric dtype.

    Inputs:
    - google_drive_path: str, path to the dataset directory.
    - filename: str, name of the dataset file.

    Outputs:
    - DataFrame: cleaned and preprocessed dataset. AGE and GENDER are mapped to
      integer codes (unmapped values become 0); every other value that cannot be
      parsed as a number becomes NaN.

    Raises:
    - FileNotFoundError: if the file does not exist in the given directory.
    - pandas.errors.EmptyDataError: if the file is empty.

    Errors are raised instead of being printed and swallowed: returning None here
    only postpones the failure to the first use of the result, with a less helpful
    message.
    '''
    # Combine paths to form the full data path
    full_data_path = os.path.join(google_drive_path, filename)

    # Check that the file exists before handing it to pandas
    if not os.path.isfile(full_data_path):
        raise FileNotFoundError(f"Data file '{filename}' not found in the specified directory.")

    # Load the dataset
    try:
        df = pd.read_csv(full_data_path)
    except pd.errors.EmptyDataError:
        print("Error: File is empty.")
        raise
    print("Dataset loaded successfully.")

    # Preprocessing: encode the two categorical columns as integer codes.
    # Categories missing from the mapping fall back to 0.
    age_codes = {"Less than 20": 0, "21 to 35": 1, "36 to 50": 2, "51 or more": 3}
    gender_codes = {"Female": 0, "Male": 1}
    df["AGE"] = df["AGE"].map(age_codes).fillna(0)
    df["GENDER"] = df["GENDER"].map(gender_codes).fillna(0)

    # Convert columns to numeric where possible, coercing errors to NaN.
    # Any remaining text column therefore ends up entirely NaN.
    df = df.apply(pd.to_numeric, errors='coerce')

    print("Preprocessing completed.")
    return df

In [None]:
# Reuse GOOGLE_DRIVE_PATH from the configuration cell instead of rebinding
# GOOGLE_DRIVE_PATH_AFTER_MYDRIVE to a full path, which contradicted both its name
# and the value assigned in that cell.
FILENAME = "Wellbeing_and_lifestyle_data_Kaggle.csv"

df = load_and_preprocess_data(GOOGLE_DRIVE_PATH, FILENAME)

# correlation matrix (Timestamp is excluded from the plotted features)
plt.figure(figsize=(15,8))
data = df.drop('Timestamp', axis=1)
sns.heatmap(data=data.corr(), annot=True, fmt='0.3f', cmap='GnBu');

In [None]:
def split_data(df, target_column, test_size=0.15, val_size=0.15, random_state=None):
    """
    Splits the data into training, validation, and test sets.

    Parameters:
    - df (pd.DataFrame): The DataFrame containing the data.
    - target_column (str): The name of the column representing the target variable.
    - test_size (float): The proportion of the dataset to include in the test split (default is 0.15).
    - val_size (float): The proportion of the dataset to include in the validation split (default is 0.15).
    - random_state (int): Random state for reproducibility (default is None).

    Returns:
    - X_train, X_val, X_test, y_train, y_val, y_test: Split data ready for model training.

    Raises:
    - ValueError: if test_size + val_size is not greater than 0.
    """
    holdout_size = test_size + val_size
    if holdout_size <= 0:
        raise ValueError("test_size + val_size must be greater than 0.")

    X = df.drop(columns=[target_column])
    y = df[target_column]

    # First carve off the combined validation + test portion.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state
    )

    # train_test_split's `test_size` sets the share of its SECOND output. X_test is
    # the second output below, so it must receive the test share of the holdout;
    # passing the validation share here would swap the two set sizes.
    test_proportion = test_size / holdout_size

    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_proportion, random_state=random_state
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


In [None]:
# load_and_preprocess_data coerces every non-numeric value to NaN, so text-only columns
# (presumably including Timestamp -- it is also dropped before the correlation heatmap)
# end up entirely NaN. Remove those columns and any rows that still have gaps, so the
# scaler and the MLP are not fed NaNs.
model_df = (
    df.drop(columns=['Timestamp'], errors='ignore')
    .dropna(axis=1, how='all')
    .dropna(axis=0)
)

X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    model_df,
    target_column='WORK_LIFE_BALANCE_SCORE',
    test_size=0.15,
    val_size=0.15,
    random_state=1,
)


In [None]:
def create_mlp_model(X_train, y_train, X_val, y_val, input_dim, epochs=50, batch_size=32, return_scaler=False):
    """
    Creates and trains a Multi-Layer Perceptron (MLP) regression model.

    Features are standardized with a StandardScaler fitted on the training set only;
    the same fitted scaler is applied to the validation set.

    Parameters:
    - X_train (np.array or pd.DataFrame): Training data features.
    - y_train (np.array or pd.Series): Training data labels.
    - X_val (np.array or pd.DataFrame): Validation data features.
    - y_val (np.array or pd.Series): Validation data labels.
    - input_dim (int): Number of input features.
    - epochs (int): Number of epochs for training (default is 50).
    - batch_size (int): Size of training batches (default is 32).
    - return_scaler (bool): If True, also return the fitted StandardScaler so the same
      scaling can be applied to any data later passed to the model (default is False).

    Returns:
    - model (tf.keras.Model): Trained MLP model, or the tuple (model, scaler) when
      return_scaler is True.
    """
    # Standardize the features. Binary columns are standardized as well.
    # The inputs are not overwritten, so the caller's objects stay untouched.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Declare the input shape with an explicit Input object rather than the
    # `input_dim` layer argument; the model is then built as soon as it is created.
    model = Sequential([
        tf.keras.Input(shape=(input_dim,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # Single linear output for regression
    ])

    # Mean squared error loss, with mean absolute error reported as a metric
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=2
    )

    if return_scaler:
        return model, scaler
    return model


In [None]:
# Train the MLP; the input width is taken from the number of feature columns.
model = create_mlp_model(
    X_train,
    y_train,
    X_val,
    y_val,
    input_dim=X_train.shape[1],
)

In [None]:
# Partition rules: one entry per column used to slice the dataset.
# Each rule has a "type" and a list of "bins":
#   - values:     bins are the boundary values between groups
#   - percentile: bins are the boundary percentiles between groups
#   - exact:      bins list the exact values a category can take
partition_rules = {
    "WORK_LIFE_BALANCE_SCORE": {
        "type": "percentile",
        "bins": [0, 10, 90, 100],
    },
    # Exact values for gender
    "GENDER": {
        "type": "exact",
        "bins": [0, 1],
    },
    # Exact values for BMI because there are only 2, not sure why
    "BMI_RANGE": {
        "type": "exact",
        "bins": [1, 2],
    },
}

# NOTE(review): partition_data is not defined in the visible part of this notebook;
# presumably it returns a mapping of partition name -> DataFrame -- confirm it is
# defined before this cell runs.
partitions = partition_data(df, partition_rules)

# Report the size of every partition
for partition_name, partition_df in partitions.items():
    print(f"Partition: {partition_name}")
    print(f"{partition_name}: {len(partition_df)} items")

In [None]:
## 3. Correlation Calculation
def calculate_correlation(data: pd.DataFrame, method: str = "pearson") -> pd.DataFrame:
    '''
    Compute the pairwise correlation matrix of the columns of `data`.

    Inputs:
    - data: DataFrame whose columns are correlated with each other.
    - method: str, correlation method ("pearson", "spearman", "kendall").

    Outputs:
    - DataFrame: correlation matrix.
    '''
    correlation_matrix = data.corr(method=method)
    return correlation_matrix

In [None]:
def is_significant_correlation(correlation_value, sample_size, alpha=0.05):
    """
    Runs a two-tailed t-test to determine if a correlation value is significantly
    different from 0.

    Parameters:
    - correlation_value: The correlation coefficient to test.
    - sample_size: The number of samples in the partition.
    - alpha: Significance level (default is 0.05).

    Returns:
    - 1 if the correlation is significant, otherwise 0. Returns 0 when the test is
      undefined (NaN correlation, or fewer than 3 samples).
    """
    # Degrees of freedom for the t-test; the test needs at least one.
    dof = sample_size - 2
    if dof <= 0 or np.isnan(correlation_value):
        return 0

    r = float(correlation_value)
    if abs(r) >= 1.0:
        # Perfect correlation: the t-statistic is infinite, so the p-value is 0.
        # Handled explicitly to avoid dividing by zero below.
        p_value = 0.0
    else:
        # Calculate the t-statistic
        t_statistic = r * np.sqrt(dof / (1.0 - r ** 2))

        # Two-tailed p-value. The survival function is used instead of 1 - cdf
        # because it keeps precision for very small tail probabilities.
        p_value = 2 * stats.t.sf(abs(t_statistic), dof)

    # Return 1 if p-value is less than alpha, indicating significance
    return 1 if p_value < alpha else 0

def generate_significance_graph(correlation_matrix, sample_size):
    """
    Builds a matrix of 1s and 0s flagging which correlations are significant.

    Parameters:
    - correlation_matrix: DataFrame of correlation coefficients.
    - sample_size: Number of samples, passed to is_significant_correlation for
      every tested entry.

    Returns:
    - A DataFrame with the same index and columns as correlation_matrix, holding 1
      where the correlation is significant and 0 elsewhere. Diagonal entries and
      NaN correlations are always 0.
    """
    coefficients = correlation_matrix.to_numpy()

    # Start from all zeros so the diagonal and NaN entries need no special handling.
    flags = np.zeros(coefficients.shape, dtype=float)

    for (row, col), coefficient in np.ndenumerate(coefficients):
        if row == col or np.isnan(coefficient):
            continue
        flags[row, col] = is_significant_correlation(coefficient, sample_size)

    return pd.DataFrame(
        flags,
        index=correlation_matrix.index,
        columns=correlation_matrix.columns,
    )


In [None]:
def visualize_graph(adj_matrix):
    """
    Draws the graph described by an adjacency matrix.

    Parameters:
    - adj_matrix: A 2D dataframe where 1s represent edges and 0s represent no edges.
    """
    # Build the graph from the adjacency DataFrame
    network = nx.from_pandas_adjacency(adj_matrix)

    # Appearance of nodes, labels and edges
    draw_options = dict(
        with_labels=True,
        node_color='skyblue',
        node_size=1000,
        font_size=8,
        font_weight='bold',
        edge_color='gray',
    )

    # Render on a fresh figure and show it
    plt.figure(figsize=(15, 8))
    nx.draw(network, **draw_options)
    plt.title("Graph Visualization")
    plt.show()


In [None]:
# Correlation heatmap and significance graph per partition.
# The trailing `break` stops after the first partition.
for partition_name, partition_df in partitions.items():
    print(f"Partition: {partition_name}")
    print(f"{partition_name}: {len(partition_df)} items")

    partition_columns = list(partition_rules)

    # Remove the partitioning columns (where present) and the Timestamp column
    filtered_partition_df = (
        partition_df
        .drop(columns=partition_columns, errors='ignore')
        .drop('Timestamp', axis=1)
    )
    sample_size = len(filtered_partition_df)

    # Heatmap of the pairwise correlations
    plt.figure(figsize=(15,8))
    correlation_matrix = calculate_correlation(filtered_partition_df)
    sns.heatmap(data=correlation_matrix, annot=True, fmt='0.3f', cmap='GnBu')

    # Graph whose edges are the significant correlations
    significance_graph = generate_significance_graph(correlation_matrix, sample_size)
    visualize_graph(significance_graph)
    break


In [None]:
## 6. Clustering Execution
def perform_clustering(significance_graph: pd.DataFrame, method: Callable, params: Dict[str, Any]) -> Dict[int, List[str]]:
    '''
    Placeholder: not implemented yet, currently returns None.

    Inputs:
    - significance_graph: DataFrame, adjacency matrix of the correlation graph.
    - method: Callable, clustering method function.
    - params: Dict, parameters for the chosen clustering method.

    Outputs:
    - Intended: dictionary mapping cluster IDs to lists of factor names.
    '''
    # TODO: implement; the annotations above describe the intended contract.
    pass


In [None]:
## 7. Graph Visualization
def visualize_graph(
    graph: Union["nx.Graph", pd.DataFrame],
    clusters: Optional[Dict[int, List[str]]] = None,
    output_file: Optional[str] = None,
) -> None:
    '''
    Draw a correlation graph, optionally colored by cluster and saved to a file.

    This definition replaces the earlier one-argument visualize_graph(adj_matrix)
    once this cell runs, so `clusters` and `output_file` are optional and an
    adjacency DataFrame is still accepted; existing one-argument calls keep working.

    Inputs:
    - graph: networkx Graph, or an adjacency DataFrame (1 = edge, 0 = no edge).
    - clusters: Dict mapping cluster IDs to lists of node names. When given, nodes
      are colored by cluster; nodes not listed in any cluster share one extra color.
    - output_file: str, path for saving the visualization. Nothing is saved when
      omitted.

    Outputs:
    - None (shows the figure and, if output_file is given, saves it).
    '''
    # Accept the adjacency-matrix form used by the earlier cells
    network = nx.from_pandas_adjacency(graph) if isinstance(graph, pd.DataFrame) else graph

    draw_options = dict(
        with_labels=True,
        node_color='skyblue',
        node_size=1000,
        font_size=8,
        font_weight='bold',
        edge_color='gray',
    )
    if clusters:
        # Invert cluster -> members into node -> cluster id; -1 marks unclustered nodes
        cluster_of = {node: cluster_id for cluster_id, members in clusters.items() for node in members}
        draw_options['node_color'] = [cluster_of.get(node, -1) for node in network.nodes()]
        draw_options['cmap'] = 'tab20'

    plt.figure(figsize=(15, 8))
    nx.draw(network, **draw_options)
    plt.title("Graph Visualization")
    if output_file:
        # Save before show(): showing may clear the current figure
        plt.savefig(output_file, bbox_inches='tight')
    plt.show()

In [None]:
## 8. Cluster Analysis and Comparison
def analyze_clusters(clusters: Dict[int, List[str]], partitions: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    '''
    Placeholder: not implemented yet, currently returns None.

    Inputs:
    - clusters: Dict, cluster results.
    - partitions: Dict, different subsets of the dataset.

    Outputs:
    - Intended: DataFrame summarizing key findings from clusters across partitions.
    '''
    # TODO: implement; the annotations above describe the intended contract.
    pass

In [None]:
## 9. Evaluation of Correlation Methods
def evaluate_correlation_methods(data: pd.DataFrame, methods: List[str], partitions: Dict[str, pd.DataFrame]) -> pd.DataFrame:
    '''
    Placeholder: not implemented yet, currently returns None.

    Inputs:
    - data: DataFrame, original dataset.
    - methods: List of correlation methods.
    - partitions: Dict of data partitions.

    Outputs:
    - Intended: DataFrame summarizing how correlation structures vary across methods.
    '''
    # TODO: implement; the annotations above describe the intended contract.
    pass

In [None]:
## 10. Evaluation of Clustering Methods
def louvain_clustering(graph: "nx.Graph", params: Dict[str, Any], weighted=False) -> Dict[int, List[str]]:
    '''
    Placeholder for Louvain clustering: not implemented yet, currently returns None.

    Inputs:
    - graph: Graph, correlation graph.
    - params: Dict, parameters for the clustering method.
    - weighted: bool, selects the weighted variant (default False). Both variants
      are still unimplemented.

    Outputs:
    - Intended: dictionary mapping cluster IDs to lists of factor names.
    '''
    # TODO: implement the weighted and unweighted variants.
    pass
def kmeans_clustering(graph: "nx.Graph", params: Dict[str, Any]) -> Dict[int, List[str]]:
    '''
    Placeholder for k-means clustering: not implemented yet, currently returns None.

    Inputs:
    - graph: Graph, correlation graph.
    - params: Dict, parameters for the clustering method.

    Outputs:
    - Intended: dictionary mapping cluster IDs to lists of factor names.
    '''
    # TODO: implement.
    pass
def gn_clustering(graph: "nx.Graph", params: Dict[str, Any]) -> Dict[int, List[str]]:
    '''
    Placeholder for GN clustering: not implemented yet, currently returns None.

    Inputs:
    - graph: Graph, correlation graph.
    - params: Dict, parameters for the clustering method.

    Outputs:
    - Intended: dictionary mapping cluster IDs to lists of factor names.
    '''
    # TODO: implement.
    pass

def evaluate_clustering_methods(graph: "nx.Graph", methods: List[str], params: Dict[str, Dict[str, Any]]) -> pd.DataFrame:
    '''
    Run the requested clustering methods on a graph and tabulate their output.

    Inputs:
    - graph: Graph, correlation graph.
    - methods: List of clustering methods to test; supported names are
      "louvain", "GN" and "k-means".
    - params: Dict, parameters for each clustering method, keyed by method name.
      A method without an entry is called with an empty parameter dict.

    Outputs:
    - DataFrame with one row per requested method and the columns "method",
      "n_clusters" and "clusters". A method that returns None is reported with
      0 clusters.

    Raises:
    - ValueError: if `methods` contains an unsupported name.
    '''
    # louvain & GN & k-means
    clusterers = {
        "louvain": louvain_clustering,
        "GN": gn_clustering,
        "k-means": kmeans_clustering,
    }

    unknown = [name for name in methods if name not in clusterers]
    if unknown:
        raise ValueError(f"Unknown clustering method(s): {unknown}")

    # Only run what was asked for, and keep each result instead of discarding it
    rows = []
    for name in methods:
        clusters = clusterers[name](graph, params.get(name, {}))
        rows.append({
            "method": name,
            "n_clusters": len(clusters) if clusters is not None else 0,
            "clusters": clusters,
        })

    return pd.DataFrame(rows, columns=["method", "n_clusters", "clusters"])