In [None]:
import os

import pandas as pd

# Project root. The original absolute path stays the default; set the
# MOTIF_PROJ_DIR environment variable to run the notebook on another machine.
project_dir = os.environ.get(
    'MOTIF_PROJ_DIR',
    '/Users/demo/Library/CloudStorage/Box-Box/Holmes_lab_kaley/motif_proj',
)
brighten_dir = os.path.join(project_dir, 'BRIGHTEN_data')

# Lagged datasets, read from '<dataset>_lag.csv' inside brighten_dir.
# Each frame is registered under two keys that point at the SAME DataFrame:
#   '<dataset>'      - the short key this cell has always used
#   '<dataset>_lag'  - the key the mmelsm prep cell looks up
# Without the second spelling the prep cell fails with a KeyError.
dfs_lagged = {}

# These four files are required; a missing file raises, as before.
for dataset in ('v1_day', 'v1_week_phq9', 'v2_day', 'v2_week_phq9'):
    lagged = pd.read_csv(os.path.join(brighten_dir, f'{dataset}_lag.csv'))
    dfs_lagged[dataset] = lagged
    dfs_lagged[f'{dataset}_lag'] = lagged

# The prep cell also expects the PHQ-2 datasets, which were never loaded here.
# NOTE(review): the file names are assumed to follow the same '<dataset>_lag.csv'
# pattern - confirm. They are loaded only when present so this cell never fails
# on a file it did not previously require.
for dataset in ('v1_week_phq2', 'v2_week_phq2'):
    csv_path = os.path.join(brighten_dir, f'{dataset}_lag.csv')
    if os.path.exists(csv_path):
        lagged = pd.read_csv(csv_path)
        dfs_lagged[dataset] = lagged
        dfs_lagged[f'{dataset}_lag'] = lagged
    else:
        print(f'Not loaded (file not found): {csv_path}')


In [None]:


def find_largest_continuous_sequence(df, time_period, sequence_length=6):
    """
    Return the first ``sequence_length`` rows of the longest run of consecutive
    values in ``df[time_period]``.

    A "run" is a stretch of rows, after sorting by ``time_period``, in which
    each value is exactly 1 greater than the previous one. When several runs
    share the maximum length, the earliest one is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to search (in this notebook: one participant's rows).
    time_period : str
        Name of the numeric column holding the time index (e.g. 'day', 'week').
    sequence_length : int, default 6
        Minimum run length required, and the number of rows returned.

    Returns
    -------
    pandas.DataFrame or None
        ``sequence_length`` rows taken from the start of the longest run,
        indexed by their position in the sorted frame, or ``None`` when ``df``
        is empty or no run is at least ``sequence_length`` long.
    """
    # An empty frame has no runs; idxmax() below would raise on it.
    if df.empty:
        return None

    # Stable sort so rows sharing a time value keep their incoming order.
    df = df.sort_values(by=time_period, kind='stable').reset_index(drop=True)

    # A new run starts wherever the step from the previous row is not exactly 1
    # (this includes the first row and any repeated time value).
    run_id = df[time_period].diff().ne(1).cumsum()
    run_sizes = df.groupby(run_id)[time_period].size()

    # idxmax returns the first maximum, i.e. the earliest of the longest runs.
    longest_run = run_sizes.idxmax()
    if run_sizes.loc[longest_run] < sequence_length:
        return None  # No run meets the minimum length

    # Select by run membership rather than by value range: a value-range filter
    # would also pick up a duplicate time stamp that belongs to a neighbouring
    # run and return more than sequence_length rows.
    return df[run_id == longest_run].head(sequence_length)




In [None]:

# Prep for mmelsm
#
# For every lagged dataset, keep (per participant) the first `sequence_length`
# rows of that participant's longest run of consecutive `time_period` values,
# stack the kept rows into one DataFrame, and write it to
# '<dataset>_mmelsm.csv' in brighten_dir. Participants without a long enough
# run are dropped. The resulting frames are collected in df_mmelsm.

# (dataset, time_period column, required run length)
MMELSM_SPECS = [
    ('v2_day', 'day', 6),
    ('v1_day', 'day', 9),
    ('v1_week_phq2', 'week', 8),  # original note: 4=170, 8=89 (presumably participants kept per length - confirm)
    ('v2_week_phq2', 'week', 4),
    ('v1_week_phq9', 'week', 4),
    ('v2_week_phq9', 'week', 3),
]


def _get_lagged_frame(dataset):
    """Return the lagged frame for `dataset`, or None if it was not loaded.

    Accepts both key spellings ('<dataset>_lag' and '<dataset>') because the
    loading cell and this cell have not always agreed on the key.
    """
    for key in (f'{dataset}_lag', dataset):
        if key in dfs_lagged:
            return dfs_lagged[key]
    return None


def _continuous_rows_per_participant(frame, time_period, sequence_length):
    """Return the per-participant frames that have a long enough consecutive run."""
    kept = []
    for _, participant_rows in frame.groupby('participant_id'):
        result = find_largest_continuous_sequence(participant_rows, time_period, sequence_length)
        if result is not None:
            kept.append(result)
    return kept


df_mmelsm = {}

for dataset, time_period, sequence_length in MMELSM_SPECS:
    name = f'{dataset}_mmelsm'
    lagged = _get_lagged_frame(dataset)
    if lagged is None:
        # Previously a KeyError that aborted the whole cell.
        print(f'Skipping {name}: no lagged data loaded for {dataset}')
        continue

    kept = _continuous_rows_per_participant(lagged, time_period, sequence_length)

    # Only datasets with at least one qualifying participant are recorded.
    if kept:
        combined = pd.concat(kept, ignore_index=True)
        df_mmelsm[name] = combined
        # Computed outside the f-string: reusing the same quote character
        # inside an f-string is a SyntaxError before Python 3.12.
        n_participants = len(combined['participant_id'].unique())
        print(f'Participants with {sequence_length} continuous {time_period}s in {name}: {n_participants}')


# Write one CSV per prepared dataset (the default integer index is written too,
# as before).
for name, df in df_mmelsm.items():
    df.to_csv(os.path.join(brighten_dir, f'{name}.csv'))