In [1]:
import os
import pandas as pd
import logging
import os
import pandas as pd
import logger 

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def impute_time_series_data(df, patient_id):
    def get_neighbor_values(df, index, column):
        neighbors = []
        index_list = df.index.tolist()
        current_index_position = index_list.index(index)

        if current_index_position > 0:
            prev_index = index_list[current_index_position - 1]
            neighbors.extend(
                [df.loc[prev_index, "Morning"], df.loc[prev_index, "Afternoon"]]
            )

        if current_index_position < len(index_list) - 1:
            next_index = index_list[current_index_position + 1]
            neighbors.extend(
                [df.loc[next_index, "Morning"], df.loc[next_index, "Afternoon"]]
            )

        if column == "Morning" and pd.notna(df.loc[index, "Afternoon"]):
            neighbors.append(df.loc[index, "Afternoon"])
        elif column == "Afternoon" and pd.notna(df.loc[index, "Morning"]):
            neighbors.append(df.loc[index, "Morning"])

        return [x for x in neighbors if pd.notna(x)]

    def find_missing_chunks(df, column):
        """Finds missing chunks and their lengths in the specified column."""
        missing_chunks = []
        current_chunk = []
        for i, val in enumerate(df[column].isna()):
            if val:  # If value is NaN
                current_chunk.append(i)
            else:
                if current_chunk:  # Close chunk when consecutive NaNs end
                    if len(current_chunk) <= 14:
                        missing_chunks.append(current_chunk)
                    current_chunk = []
        if current_chunk and len(current_chunk) <= 14:
            missing_chunks.append(current_chunk)  # Handle case where NaNs extend to the end
        return missing_chunks

    # Iterate over both columns
    for column in ["Morning", "Afternoon"]:
        missing_chunks = find_missing_chunks(df, column)  # Find missing chunks in the column

        for chunk in missing_chunks:
            for index in chunk:
                neighbor_values = get_neighbor_values(df, df.index[index], column)
                if neighbor_values:
                    imputed_value = round(sum(neighbor_values) / len(neighbor_values))
                    df.loc[df.index[index], column] = imputed_value

    # Log the imputed data and save to CSV
    logger.info(f"Imputed data for patient {patient_id} shape: {df.shape}")
    df.to_csv(f"/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data/{patient_id}-recent-Imputed.csv", index=False)
    return df

def process_multiple_csv_files(folder_path):
    # Loop over all CSV files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            patient_id = filename.split('.')[0]  # Extract patient ID from filename
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path, parse_dates=["Date"])  # Read the CSV file
            
            # Impute the data
            df_imputed = impute_time_series_data(df, patient_id)
            
            # Save imputed data to CSV in a new folder
            output_folder = "/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data"
            os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist
            output_file = os.path.join(output_folder, f"{patient_id}-recent-Imputed.csv")
            df_imputed.to_csv(output_file, index=False)
            print(f"Processed and saved: {output_file}")

# Set the path to the folder containing CSV files
folder_path = "/Users/siddhi/research_new/schas-2024-asthama-2.0/2024-Data-Cleaned/Original_Data"

# Run the process on all CSV files in the folder
process_multiple_csv_files(folder_path)

[27/Sep/2024 00:03:07] INFO - Imputed data for patient SB-023-OG shape: (1422, 4)
Processed and saved: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data/SB-023-OG-recent-Imputed.csv
[27/Sep/2024 00:03:07] INFO - Imputed data for patient SB-040-OG shape: (1934, 4)
Processed and saved: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data/SB-040-OG-recent-Imputed.csv
[27/Sep/2024 00:03:07] INFO - Imputed data for patient SB-031-OG shape: (574, 4)
Processed and saved: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data/SB-031-OG-recent-Imputed.csv
[27/Sep/2024 00:03:07] INFO - Imputed data for patient SB-052-OG shape: (386, 4)
Processed and saved: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data/SB-052-OG-recent-Imputed.csv
[27/Sep/2024 00:03:07] INFO - Imputed data for patient SB-007-OG shape: (309, 4)
Processed and saved: /Users/siddhi/research_n

In [2]:
# Folder containing patient CSV files
folder_path = '/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data'

In [3]:
# Load all patient data
all_data = []
for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        patient_id = file.split('.')[0]  # Assuming patient ID is the filename without extension
        df = pd.read_csv(os.path.join(folder_path, file))
        df['patient_id'] = patient_id
        all_data.append(df)

In [4]:
# Combine all data into a single DataFrame
df_all = pd.concat(all_data)

In [5]:
# Convert date column to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])


In [6]:
# Determine the earliest and latest dates
start_date = df_all['Date'].min()
end_date = df_all['Date'].max()


In [7]:
print(f"Earliest Start Date: {start_date}")
print(f"Latest End Date: {end_date}")


Earliest Start Date: 2014-11-21 00:00:00
Latest End Date: 2022-01-19 00:00:00


In [8]:
import pandas as pd
import altair as alt
import numpy as np

# Convert date to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])

# Create a DataFrame to hold the missing chunks for visualization
missing_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])
available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date'])

# Loop through each patient to identify missing and available chunks in the "Afternoon" column
for patient_id in df_all['patient_id'].unique():
    patient_data = df_all[df_all['patient_id'] == patient_id].sort_values(by='Date')
    
    # Create a column to identify missing data
    patient_data['is_missing'] = patient_data['Morning'].isna()
    
    # Calculate the difference between consecutive dates
    patient_data['date_diff'] = patient_data['Date'].diff().dt.days
    
    # Identify missing chunks
    missing_chunk_start = None
    missing_chunk_size = 0
    available_chunk_start = None
    missing_chunks = []
    available_chunks = []

    for index, row in patient_data.iterrows():
        if row['is_missing']:
            if missing_chunk_start is None:
                missing_chunk_start = row['Date']
            missing_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
            
            # If an available chunk just ended, record it
            if available_chunk_start is not None:
                available_chunks.append({
                    'patient_id': patient_id,
                    'start_date': available_chunk_start,
                    'end_date': row['Date']
                })
                available_chunk_start = None
                
        else:
            if missing_chunk_size > 14:
                missing_chunks.append({
                    'patient_id': patient_id,
                    'start_date': missing_chunk_start,
                    'end_date': row['Date'],
                    'chunk_size': missing_chunk_size
                })
            missing_chunk_start = None
            missing_chunk_size = 0
            
            # Start a new available chunk
            if available_chunk_start is None:
                available_chunk_start = row['Date']
    
    # Check if the last chunk is missing and longer than 14 days
    if missing_chunk_size > 14:
        missing_chunks.append({
            'patient_id': patient_id,
            'start_date': missing_chunk_start,
            'end_date': patient_data.iloc[-1]['Date'],
            'chunk_size': missing_chunk_size
        })
    
    # Check if the last chunk is available
    if available_chunk_start is not None:
        available_chunks.append({
            'patient_id': patient_id,
            'start_date': available_chunk_start,
            'end_date': patient_data.iloc[-1]['Date']
        })

    # Append missing chunks to the DataFrame
    if missing_chunks:
        missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
    
    # Append available chunks to the DataFrame
    if available_chunks:
        available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)

# Convert dates to datetime
missing_chunks_df['start_date'] = pd.to_datetime(missing_chunks_df['start_date'])
missing_chunks_df['end_date'] = pd.to_datetime(missing_chunks_df['end_date'])
available_chunks_df['start_date'] = pd.to_datetime(available_chunks_df['start_date'])
available_chunks_df['end_date'] = pd.to_datetime(available_chunks_df['end_date'])

# Create a base Altair chart
base = alt.Chart(missing_chunks_df).encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Date')
)

# Create bars representing missing data chunks
missing_bars = base.mark_bar().encode(
    x2='end_date:T',
    color=alt.value('#E3B178'),  # Fixed color for missing data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
)

# Create bars representing available data chunks
available_bars = alt.Chart(available_chunks_df).mark_bar().encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Date'),
    x2='end_date:T',
    color=alt.value('#799AE5'),  # Fixed color for available data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T']
)

# Layer the two sets of bars
layered_chart = alt.layer(missing_bars, available_bars).properties(
    title='Missing and Available Data Chunks in "Morning" Column after Imputing',
    width=1400,
    height=1200
)

# Show the chart
layered_chart.show()

  missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
  available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)


In [9]:
import pandas as pd
import altair as alt
import numpy as np

# Convert date to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])

# Create a DataFrame to hold the missing chunks for visualization
missing_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])
available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date'])

# Loop through each patient to identify missing and available chunks in the "Afternoon" column
for patient_id in df_all['patient_id'].unique():
    patient_data = df_all[df_all['patient_id'] == patient_id].sort_values(by='Date')
    
    # Create a column to identify missing data
    patient_data['is_missing'] = patient_data['Afternoon'].isna()
    
    # Calculate the difference between consecutive dates
    patient_data['date_diff'] = patient_data['Date'].diff().dt.days
    
    # Identify missing chunks
    missing_chunk_start = None
    missing_chunk_size = 0
    available_chunk_start = None
    missing_chunks = []
    available_chunks = []

    for index, row in patient_data.iterrows():
        if row['is_missing']:
            if missing_chunk_start is None:
                missing_chunk_start = row['Date']
            missing_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
            
            # If an available chunk just ended, record it
            if available_chunk_start is not None:
                available_chunks.append({
                    'patient_id': patient_id,
                    'start_date': available_chunk_start,
                    'end_date': row['Date']
                })
                available_chunk_start = None
                
        else:
            if missing_chunk_size > 14:
                missing_chunks.append({
                    'patient_id': patient_id,
                    'start_date': missing_chunk_start,
                    'end_date': row['Date'],
                    'chunk_size': missing_chunk_size
                })
            missing_chunk_start = None
            missing_chunk_size = 0
            
            # Start a new available chunk
            if available_chunk_start is None:
                available_chunk_start = row['Date']
    
    # Check if the last chunk is missing and longer than 14 days
    if missing_chunk_size > 14:
        missing_chunks.append({
            'patient_id': patient_id,
            'start_date': missing_chunk_start,
            'end_date': patient_data.iloc[-1]['Date'],
            'chunk_size': missing_chunk_size
        })
    
    # Check if the last chunk is available
    if available_chunk_start is not None:
        available_chunks.append({
            'patient_id': patient_id,
            'start_date': available_chunk_start,
            'end_date': patient_data.iloc[-1]['Date']
        })

    # Append missing chunks to the DataFrame
    if missing_chunks:
        missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
    
    # Append available chunks to the DataFrame
    if available_chunks:
        available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)

# Convert dates to datetime
missing_chunks_df['start_date'] = pd.to_datetime(missing_chunks_df['start_date'])
missing_chunks_df['end_date'] = pd.to_datetime(missing_chunks_df['end_date'])
available_chunks_df['start_date'] = pd.to_datetime(available_chunks_df['start_date'])
available_chunks_df['end_date'] = pd.to_datetime(available_chunks_df['end_date'])

# Create a base Altair chart
base = alt.Chart(missing_chunks_df).encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Date')
)

# Create bars representing missing data chunks
missing_bars = base.mark_bar().encode(
    x2='end_date:T',
    color=alt.value('#E3B178'),  # Fixed color for missing data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
)

# Create bars representing available data chunks
available_bars = alt.Chart(available_chunks_df).mark_bar().encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Date'),
    x2='end_date:T',
    color=alt.value('#799AE5'),  # Fixed color for available data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T']
)

# Layer the two sets of bars
layered_chart = alt.layer(missing_bars, available_bars).properties(
    title='Missing and Available Data Chunks in "Afternoon" Column After Imputing',
    width=1400,
    height=1200
)

# Show the chart
layered_chart.show()

  missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
  available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)


## Display Largest available chunk in "Morning" Column

In [10]:
import pandas as pd
import altair as alt

# Convert date to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])

# Create a DataFrame to hold the biggest available chunks for visualization
biggest_available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])

# Function to find the largest available chunk
def find_largest_available_chunk(patient_data, column):
    patient_data = patient_data.sort_values(by='Date')
    patient_data['is_missing'] = patient_data[column].isna()
    
    # Variables to track available chunks
    largest_chunk = {
        'patient_id': None,
        'start_date': None,
        'end_date': None,
        'chunk_size': 0
    }
    current_chunk_start = None
    current_chunk_size = 0

    # Loop through the patient data to find the largest available chunk
    for index, row in patient_data.iterrows():
        if not row['is_missing']:  # If value is not NaN (i.e., available)
            if current_chunk_start is None:
                current_chunk_start = row['Date']  # Start of a new available chunk
            current_chunk_size += 1
        else:
            # Check if the current available chunk is the largest so far
            if current_chunk_size > largest_chunk['chunk_size']:
                largest_chunk = {
                    'patient_id': row['patient_id'],
                    'start_date': current_chunk_start,
                    'end_date': row['Date'],
                    'chunk_size': current_chunk_size
                }
            current_chunk_start = None
            current_chunk_size = 0

    # Handle the case where the last chunk is the largest
    if current_chunk_size > largest_chunk['chunk_size']:
        largest_chunk = {
            'patient_id': patient_data.iloc[-1]['patient_id'],
            'start_date': current_chunk_start,
            'end_date': patient_data.iloc[-1]['Date'],
            'chunk_size': current_chunk_size
        }
    
    return largest_chunk

# Loop through each patient to find the largest available chunk for the "Morning" column
for patient_id in df_all['patient_id'].unique():
    patient_data = df_all[df_all['patient_id'] == patient_id]
    largest_chunk = find_largest_available_chunk(patient_data, "Morning")

    # Append the largest chunk to the DataFrame
    if largest_chunk['chunk_size'] > 0:
        biggest_available_chunks_df = pd.concat([biggest_available_chunks_df, pd.DataFrame([largest_chunk])], ignore_index=True)

# Convert dates to datetime
biggest_available_chunks_df['start_date'] = pd.to_datetime(biggest_available_chunks_df['start_date'])
biggest_available_chunks_df['end_date'] = pd.to_datetime(biggest_available_chunks_df['end_date'])

# Create a base Altair chart for the largest available chunk
base = alt.Chart(biggest_available_chunks_df).encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Start Date')
)

# Create bars representing the largest available chunk for each patient
chunk_bars = base.mark_bar().encode(
    x2='end_date:T',
    color=alt.value('#799AE5'),  # Color for available data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
)

# Display the chart
chunk_bars.properties(
    title='Largest Available Data Chunks in the "Morning" Column for Each Patient',
    width=1400,
    height=1200
).interactive()

  biggest_available_chunks_df = pd.concat([biggest_available_chunks_df, pd.DataFrame([largest_chunk])], ignore_index=True)


## Display Largest available chunk in "Afternoon" Column

In [11]:
import pandas as pd
import altair as alt

# Convert date to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])

# Create a DataFrame to hold the biggest available chunks for visualization
biggest_available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])

# Function to find the largest available chunk
def find_largest_available_chunk(patient_data, column):
    patient_data = patient_data.sort_values(by='Date')
    patient_data['is_missing'] = patient_data[column].isna()
    
    # Variables to track available chunks
    largest_chunk = {
        'patient_id': None,
        'start_date': None,
        'end_date': None,
        'chunk_size': 0
    }
    current_chunk_start = None
    current_chunk_size = 0

    # Loop through the patient data to find the largest available chunk
    for index, row in patient_data.iterrows():
        if not row['is_missing']:  # If value is not NaN (i.e., available)
            if current_chunk_start is None:
                current_chunk_start = row['Date']  # Start of a new available chunk
            current_chunk_size += 1
        else:
            # Check if the current available chunk is the largest so far
            if current_chunk_size > largest_chunk['chunk_size']:
                largest_chunk = {
                    'patient_id': row['patient_id'],
                    'start_date': current_chunk_start,
                    'end_date': row['Date'],
                    'chunk_size': current_chunk_size
                }
            current_chunk_start = None
            current_chunk_size = 0

    # Handle the case where the last chunk is the largest
    if current_chunk_size > largest_chunk['chunk_size']:
        largest_chunk = {
            'patient_id': patient_data.iloc[-1]['patient_id'],
            'start_date': current_chunk_start,
            'end_date': patient_data.iloc[-1]['Date'],
            'chunk_size': current_chunk_size
        }
    
    return largest_chunk

# Loop through each patient to find the largest available chunk for the "Morning" column
for patient_id in df_all['patient_id'].unique():
    patient_data = df_all[df_all['patient_id'] == patient_id]
    largest_chunk = find_largest_available_chunk(patient_data, "Afternoon")

    # Append the largest chunk to the DataFrame
    if largest_chunk['chunk_size'] > 0:
        biggest_available_chunks_df = pd.concat([biggest_available_chunks_df, pd.DataFrame([largest_chunk])], ignore_index=True)

# Convert dates to datetime
biggest_available_chunks_df['start_date'] = pd.to_datetime(biggest_available_chunks_df['start_date'])
biggest_available_chunks_df['end_date'] = pd.to_datetime(biggest_available_chunks_df['end_date'])

# Create a base Altair chart for the largest available chunk
base = alt.Chart(biggest_available_chunks_df).encode(
    y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
    x=alt.X('start_date:T', title='Start Date')
)

# Create bars representing the largest available chunk for each patient
chunk_bars = base.mark_bar().encode(
    x2='end_date:T',
    color=alt.value('#799AE5'),  # Color for available data periods
    tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
)

# Display the chart
chunk_bars.properties(
    title='Largest Available Data Chunks in the "Afternoon" Column for Each Patient',
    width=1400,
    height=1200
).interactive()

  biggest_available_chunks_df = pd.concat([biggest_available_chunks_df, pd.DataFrame([largest_chunk])], ignore_index=True)


In [12]:
# import pandas as pd
# import altair as alt

# # Convert date to datetime
# df_all['Date'] = pd.to_datetime(df_all['Date'])

# # Create DataFrames to hold the missing and available chunks for visualization
# missing_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])
# available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])

# # Loop through each patient to identify missing and available chunks in the "Afternoon" column
# for patient_id in df_all['patient_id'].unique():
#     patient_data = df_all[df_all['patient_id'] == patient_id].sort_values(by='Date')
    
#     # Create a column to identify missing data
#     patient_data['is_missing'] = patient_data['Morning'].isna()
    
#     # Calculate the difference between consecutive dates
#     patient_data['date_diff'] = patient_data['Date'].diff().dt.days
    
#     # Identify missing and available chunks
#     missing_chunk_start = None
#     missing_chunk_size = 0
#     available_chunk_start = None
#     available_chunk_size = 0
#     missing_chunks = []
#     available_chunks = []

#     for index, row in patient_data.iterrows():
#         if row['is_missing']:
#             if missing_chunk_start is None:
#                 missing_chunk_start = row['Date']
#             missing_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
            
#             # If an available chunk just ended, record it
#             if available_chunk_start is not None:
#                 available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
#                 available_chunks.append({
#                     'patient_id': patient_id,
#                     'start_date': available_chunk_start,
#                     'end_date': row['Date'],
#                     'chunk_size': available_chunk_size
#                 })
#                 available_chunk_start = None
#                 available_chunk_size = 0
                
#         else:
#             if missing_chunk_size > 14:
#                 missing_chunks.append({
#                     'patient_id': patient_id,
#                     'start_date': missing_chunk_start,
#                     'end_date': row['Date'],
#                     'chunk_size': missing_chunk_size
#                 })
#             missing_chunk_start = None
#             missing_chunk_size = 0
            
#             # Start a new available chunk
#             if available_chunk_start is None:
#                 available_chunk_start = row['Date']
#             available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
    
#     # Check if the last chunk is missing and longer than 14 days
#     if missing_chunk_size > 14:
#         missing_chunks.append({
#             'patient_id': patient_id,
#             'start_date': missing_chunk_start,
#             'end_date': patient_data.iloc[-1]['Date'],
#             'chunk_size': missing_chunk_size
#         })
    
#     # Check if the last chunk is available
#     if available_chunk_start is not None:
#         available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
#         available_chunks.append({
#             'patient_id': patient_id,
#             'start_date': available_chunk_start,
#             'end_date': patient_data.iloc[-1]['Date'],
#             'chunk_size': available_chunk_size
#         })

#     # Append missing chunks to the DataFrame
#     if missing_chunks:
#         missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
    
#     # Append available chunks to the DataFrame
#     if available_chunks:
#         available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)

# # Convert dates to datetime
# missing_chunks_df['start_date'] = pd.to_datetime(missing_chunks_df['start_date'])
# missing_chunks_df['end_date'] = pd.to_datetime(missing_chunks_df['end_date'])
# available_chunks_df['start_date'] = pd.to_datetime(available_chunks_df['start_date'])
# available_chunks_df['end_date'] = pd.to_datetime(available_chunks_df['end_date'])

# # Create a base Altair chart
# base = alt.Chart(missing_chunks_df).encode(
#     y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
#     x=alt.X('start_date:T', title='Date')
# )

# # Create bars representing missing data chunks
# missing_bars = base.mark_bar().encode(
#     x2='end_date:T',
#     color=alt.value('#E3B178'),  # Fixed color for missing data periods
#     tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
# )

# # Create bars representing available data chunks
# available_bars = alt.Chart(available_chunks_df).mark_bar().encode(
#     y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
#     x=alt.X('start_date:T', title='Date'),
#     x2='end_date:T',
#     color=alt.value('#799AE5'),  # Fixed color for available data periods
#     tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
# )

# # Layer the two sets of bars
# layered_chart = alt.layer(missing_bars, available_bars).properties(
#     title='Missing and Available Data Chunks in "Morning" Column After Imputing',
#     width=1400,
#     height=1200
# )

# # Show the chart
# layered_chart.show()

In [13]:
# import pandas as pd
# import altair as alt

# # Convert date to datetime
# df_all['Date'] = pd.to_datetime(df_all['Date'])

# # Create DataFrames to hold the missing and available chunks for visualization
# missing_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])
# available_chunks_df = pd.DataFrame(columns=['patient_id', 'start_date', 'end_date', 'chunk_size'])

# # Loop through each patient to identify missing and available chunks in the "Afternoon" column
# for patient_id in df_all['patient_id'].unique():
#     patient_data = df_all[df_all['patient_id'] == patient_id].sort_values(by='Date')
    
#     # Create a column to identify missing data
#     patient_data['is_missing'] = patient_data['Afternoon'].isna()
    
#     # Calculate the difference between consecutive dates
#     patient_data['date_diff'] = patient_data['Date'].diff().dt.days
    
#     # Identify missing and available chunks
#     missing_chunk_start = None
#     missing_chunk_size = 0
#     available_chunk_start = None
#     available_chunk_size = 0
#     missing_chunks = []
#     available_chunks = []

#     for index, row in patient_data.iterrows():
#         if row['is_missing']:
#             if missing_chunk_start is None:
#                 missing_chunk_start = row['Date']
#             missing_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
            
#             # If an available chunk just ended, record it
#             if available_chunk_start is not None:
#                 available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
#                 available_chunks.append({
#                     'patient_id': patient_id,
#                     'start_date': available_chunk_start,
#                     'end_date': row['Date'],
#                     'chunk_size': available_chunk_size
#                 })
#                 available_chunk_start = None
#                 available_chunk_size = 0
                
#         else:
#             if missing_chunk_size > 14:
#                 missing_chunks.append({
#                     'patient_id': patient_id,
#                     'start_date': missing_chunk_start,
#                     'end_date': row['Date'],
#                     'chunk_size': missing_chunk_size
#                 })
#             missing_chunk_start = None
#             missing_chunk_size = 0
            
#             # Start a new available chunk
#             if available_chunk_start is None:
#                 available_chunk_start = row['Date']
#             available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
    
#     # Check if the last chunk is missing and longer than 14 days
#     if missing_chunk_size > 14:
#         missing_chunks.append({
#             'patient_id': patient_id,
#             'start_date': missing_chunk_start,
#             'end_date': patient_data.iloc[-1]['Date'],
#             'chunk_size': missing_chunk_size
#         })
    
#     # Check if the last chunk is available
#     if available_chunk_start is not None:
#         available_chunk_size += row['date_diff'] if pd.notna(row['date_diff']) else 1
#         available_chunks.append({
#             'patient_id': patient_id,
#             'start_date': available_chunk_start,
#             'end_date': patient_data.iloc[-1]['Date'],
#             'chunk_size': available_chunk_size
#         })

#     # Append missing chunks to the DataFrame
#     if missing_chunks:
#         missing_chunks_df = pd.concat([missing_chunks_df, pd.DataFrame(missing_chunks)], ignore_index=True)
    
#     # Append available chunks to the DataFrame
#     if available_chunks:
#         available_chunks_df = pd.concat([available_chunks_df, pd.DataFrame(available_chunks)], ignore_index=True)

# # Convert dates to datetime
# missing_chunks_df['start_date'] = pd.to_datetime(missing_chunks_df['start_date'])
# missing_chunks_df['end_date'] = pd.to_datetime(missing_chunks_df['end_date'])
# available_chunks_df['start_date'] = pd.to_datetime(available_chunks_df['start_date'])
# available_chunks_df['end_date'] = pd.to_datetime(available_chunks_df['end_date'])

# # Create a base Altair chart
# base = alt.Chart(missing_chunks_df).encode(
#     y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
#     x=alt.X('start_date:T', title='Date')
# )

# # Create bars representing missing data chunks
# missing_bars = base.mark_bar().encode(
#     x2='end_date:T',
#     color=alt.value('#E3B178'),  # Fixed color for missing data periods
#     tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
# )

# # Create bars representing available data chunks
# available_bars = alt.Chart(available_chunks_df).mark_bar().encode(
#     y=alt.Y('patient_id:N', title='Patient ID', sort=alt.EncodingSortField(field='patient_id', order='ascending')),
#     x=alt.X('start_date:T', title='Date'),
#     x2='end_date:T',
#     color=alt.value('#799AE5'),  # Fixed color for available data periods
#     tooltip=['patient_id:N', 'start_date:T', 'end_date:T', 'chunk_size:Q']
# )

# # Layer the two sets of bars
# layered_chart = alt.layer(missing_bars, available_bars).properties(
#     title='Missing and Available Data Chunks in "Afternoon" Column After Imputing',
#     width=1400,
#     height=1200
# )

# # Show the chart
# layered_chart.show()

## Dictionary for available start date and end date for each chunk for the Morning column

In [14]:
import os
import pandas as pd

# Folder containing patient CSV files
folder_path = '/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/New_Imputed_Data'

# Load all patient data
all_data = []
for file in os.listdir(folder_path):
    if file.endswith('.csv'):
        patient_id = file.split('.')[0]  # Assuming patient ID is the filename without extension
        df = pd.read_csv(os.path.join(folder_path, file))
        df['patient_id'] = patient_id
        all_data.append(df)

# Combine all data into a single DataFrame
df_all = pd.concat(all_data)

# Convert date column to datetime
df_all['Date'] = pd.to_datetime(df_all['Date'])

# Determine the earliest and latest dates
start_date = df_all['Date'].min().strftime('%Y-%m-%d')  # Format to remove time
end_date = df_all['Date'].max().strftime('%Y-%m-%d')    # Format to remove time

print(f"Earliest Start Date: {start_date}")
print(f"Latest End Date: {end_date}")

# Initialize the dictionary to store available data chunks for each patient
morning_available_data = {}

# Loop through each patient ID
for patient_id, df in df_all.groupby('patient_id'):
    # Create a column to identify available data in the "Morning" column
    df['is_available'] = df['Morning'].notna()

    # Initialize variables to store the start date, size, and chunks
    available_chunk_start = None
    available_chunk_size = 0
    chunks = []

    # Loop through the data to identify chunks of available data
    for index, row in df.iterrows():
        if row['is_available']:
            if available_chunk_start is None:
                available_chunk_start = row['Date'].strftime('%Y-%m-%d')  # Format to remove time
            available_chunk_size += 1
        else:
            if available_chunk_size > 0:  # End of an available chunk
                chunks.append({
                    'chunk_size': available_chunk_size,
                    'chunk_start_date': available_chunk_start,
                    'chunk_end_date': (row['Date'] - pd.Timedelta(days=1)).strftime('%Y-%m-%d')  # Format to remove time
                })
            available_chunk_start = None
            available_chunk_size = 0

    # Check if the last chunk was available
    if available_chunk_size > 0:
        chunks.append({
            'chunk_size': available_chunk_size,
            'chunk_start_date': available_chunk_start,
            'chunk_end_date': df.iloc[-1]['Date'].strftime('%Y-%m-%d')  # Format to remove time
        })

    # Add the patient's available data chunks to the dictionary
    if chunks:
        morning_available_data[patient_id] = chunks

print(morning_available_data)


Earliest Start Date: 2014-11-21
Latest End Date: 2022-01-19
{'SB-001-OG-recent-Imputed': [{'chunk_size': 1336, 'chunk_start_date': '2016-06-10', 'chunk_end_date': '2020-02-05'}], 'SB-002-OG-recent-Imputed': [{'chunk_size': 572, 'chunk_start_date': '2014-11-21', 'chunk_end_date': '2017-01-29'}], 'SB-003-OG-recent-Imputed': [{'chunk_size': 1064, 'chunk_start_date': '2017-05-01', 'chunk_end_date': '2020-03-29'}], 'SB-004-OG-recent-Imputed': [{'chunk_size': 498, 'chunk_start_date': '2016-09-21', 'chunk_end_date': '2018-01-31'}, {'chunk_size': 605, 'chunk_start_date': '2018-09-20', 'chunk_end_date': '2020-05-16'}, {'chunk_size': 128, 'chunk_start_date': '2020-06-01', 'chunk_end_date': '2020-10-06'}, {'chunk_size': 1, 'chunk_start_date': '2020-10-23', 'chunk_end_date': '2020-10-23'}, {'chunk_size': 49, 'chunk_start_date': '2020-11-17', 'chunk_end_date': '2021-01-04'}, {'chunk_size': 223, 'chunk_start_date': '2021-01-21', 'chunk_end_date': '2021-08-31'}], 'SB-006-OG-recent-Imputed': [{'chunk_

In [15]:
import pandas as pd
import altair as alt
from datetime import timedelta

# Prepare the data for visualization
data_for_viz = []

for patient_id, chunks in morning_available_data.items():
    for chunk in chunks:
        data_for_viz.append({
            'patient_id': patient_id,
            'chunk_start_date': chunk['chunk_start_date'],
            'chunk_end_date': chunk['chunk_end_date'],
            'chunk_size': chunk['chunk_size'],
            'status': 'available'
        })

# Convert to DataFrame
df_viz = pd.DataFrame(data_for_viz)

# Convert chunk_start_date and chunk_end_date back to datetime
df_viz['chunk_start_date'] = pd.to_datetime(df_viz['chunk_start_date'])
df_viz['chunk_end_date'] = pd.to_datetime(df_viz['chunk_end_date'])

# Create the Altair chart
chart = alt.Chart(df_viz).mark_bar().encode(
    x=alt.X('chunk_start_date:T', title='Start Date'),
    x2='chunk_end_date:T',
    y=alt.Y('patient_id:N', title='Patient ID'),
    color=alt.Color('status:N', scale=alt.Scale(domain=['available'], range=['#72bcd4'])),
    tooltip=[
        alt.Tooltip('patient_id:N', title='Patient ID'),
        alt.Tooltip('chunk_start_date:T', title='Start Date'),
        alt.Tooltip('chunk_end_date:T', title='End Date'),
        alt.Tooltip('chunk_size:Q', title='Chunk Size')
    ]
).properties(
    title="Morning Patient Data Availability",
    width=1400,
    height=1200
)

# Display the chart
chart


In [16]:
# Initialize the dictionary to store available data chunks for each patient
afternoon_available_data = {}

# Loop through each patient ID
for patient_id, df in df_all.groupby('patient_id'):
    # Create a column to identify available data in the "Morning" column
    df['is_available'] = df['Afternoon'].notna()

    # Initialize variables to store the start date, size, and chunks
    available_chunk_start = None
    available_chunk_size = 0
    chunks = []

    # Loop through the data to identify chunks of available data
    for index, row in df.iterrows():
        if row['is_available']:
            if available_chunk_start is None:
                available_chunk_start = row['Date'].strftime('%Y-%m-%d')  # Format to remove time
            available_chunk_size += 1
        else:
            if available_chunk_size > 0:  # End of an available chunk
                chunks.append({
                    'chunk_size': available_chunk_size,
                    'chunk_start_date': available_chunk_start,
                    'chunk_end_date': (row['Date'] - pd.Timedelta(days=1)).strftime('%Y-%m-%d')  # Format to remove time
                })
            available_chunk_start = None
            available_chunk_size = 0

    # Check if the last chunk was available
    if available_chunk_size > 0:
        chunks.append({
            'chunk_size': available_chunk_size,
            'chunk_start_date': available_chunk_start,
            'chunk_end_date': df.iloc[-1]['Date'].strftime('%Y-%m-%d')  # Format to remove time
        })

    # Add the patient's available data chunks to the dictionary
    if chunks:
        afternoon_available_data[patient_id] = chunks

print(afternoon_available_data)


{'SB-001-OG-recent-Imputed': [{'chunk_size': 1336, 'chunk_start_date': '2016-06-10', 'chunk_end_date': '2020-02-05'}], 'SB-003-OG-recent-Imputed': [{'chunk_size': 1064, 'chunk_start_date': '2017-05-01', 'chunk_end_date': '2020-03-29'}], 'SB-004-OG-recent-Imputed': [{'chunk_size': 498, 'chunk_start_date': '2016-09-21', 'chunk_end_date': '2018-01-31'}, {'chunk_size': 453, 'chunk_start_date': '2018-09-20', 'chunk_end_date': '2019-12-16'}, {'chunk_size': 30, 'chunk_start_date': '2020-01-02', 'chunk_end_date': '2020-01-31'}, {'chunk_size': 162, 'chunk_start_date': '2020-02-17', 'chunk_end_date': '2020-07-27'}, {'chunk_size': 294, 'chunk_start_date': '2020-08-12', 'chunk_end_date': '2021-06-01'}, {'chunk_size': 27, 'chunk_start_date': '2021-06-22', 'chunk_end_date': '2021-07-18'}, {'chunk_size': 33, 'chunk_start_date': '2021-08-16', 'chunk_end_date': '2021-09-17'}], 'SB-006-OG-recent-Imputed': [{'chunk_size': 129, 'chunk_start_date': '2016-05-13', 'chunk_end_date': '2016-09-27'}, {'chunk_siz

In [17]:
import pandas as pd
import altair as alt
from datetime import timedelta

# Prepare the data for visualization
data_for_viz = []

for patient_id, chunks in afternoon_available_data.items():
    for chunk in chunks:
        data_for_viz.append({
            'patient_id': patient_id,
            'chunk_start_date': chunk['chunk_start_date'],
            'chunk_end_date': chunk['chunk_end_date'],
            'chunk_size': chunk['chunk_size'],
            'status': 'available'
        })

# Convert to DataFrame
df_viz = pd.DataFrame(data_for_viz)

# Convert chunk_start_date and chunk_end_date back to datetime
df_viz['chunk_start_date'] = pd.to_datetime(df_viz['chunk_start_date'])
df_viz['chunk_end_date'] = pd.to_datetime(df_viz['chunk_end_date'])

# Create the Altair chart
chart = alt.Chart(df_viz).mark_bar().encode(
    x=alt.X('chunk_start_date:T', title='Start Date'),
    x2='chunk_end_date:T',
    y=alt.Y('patient_id:N', title='Patient ID'),
    color=alt.Color('status:N', scale=alt.Scale(domain=['available'], range=['#72bcd4'])),
    tooltip=[
        alt.Tooltip('patient_id:N', title='Patient ID'),
        alt.Tooltip('chunk_start_date:T', title='Start Date'),
        alt.Tooltip('chunk_end_date:T', title='End Date'),
        alt.Tooltip('chunk_size:Q', title='Chunk Size')
    ]
).properties(
    title="Afternoon Patient Data Availability",
    width=1400,
    height=1200
)

# Display the chart
chart


In [18]:
# Function to find the largest available chunk for a specific column
def find_largest_chunk(df, column_name):
    df['is_available'] = df[column_name].notna()
    
    largest_chunk = {
        'patient_id': None,
        'chunk_size': 0,
        'chunk_start_date': None,
        'chunk_end_date': None
    }
    
    available_chunk_start = None
    available_chunk_size = 0
    
    for index, row in df.iterrows():
        if row['is_available']:
            if available_chunk_start is None:
                available_chunk_start = row['Date']
            available_chunk_size += 1
        else:
            if available_chunk_size > largest_chunk['chunk_size']:
                largest_chunk = {
                    'patient_id': row['patient_id'],
                    'chunk_size': available_chunk_size,
                    'chunk_start_date': available_chunk_start,
                    'chunk_end_date': row['Date'] - pd.Timedelta(days=1)
                }
            available_chunk_start = None
            available_chunk_size = 0

    # Handle case where the largest chunk ends at the last row
    if available_chunk_size > largest_chunk['chunk_size']:
        largest_chunk = {
            'patient_id': row['patient_id'],
            'chunk_size': available_chunk_size,
            'chunk_start_date': available_chunk_start,
            'chunk_end_date': df.iloc[-1]['Date']
        }
    
    return largest_chunk

# Group by patient_id and find the largest chunk for both Morning and Afternoon columns
morning_chunks = []
afternoon_chunks = []

for patient_id, patient_df in df_all.groupby('patient_id'):
    # Largest chunk for "Morning"
    largest_morning_chunk = find_largest_chunk(patient_df, 'Morning')
    morning_chunks.append(largest_morning_chunk)

    # Largest chunk for "Afternoon"
    largest_afternoon_chunk = find_largest_chunk(patient_df, 'Afternoon')
    afternoon_chunks.append(largest_afternoon_chunk)

# Convert the results to DataFrames
df_morning = pd.DataFrame(morning_chunks)
df_afternoon = pd.DataFrame(afternoon_chunks)

# Define the save locations for the CSV files
save_path_morning = '/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/largest_chunk_csv/largest_morning_chunks.csv'
save_path_afternoon = '/Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/largest_chunk_csv/largest_afternoon_chunks.csv'

# Save to CSV files at the desired location
df_morning.to_csv(save_path_morning, index=False)
df_afternoon.to_csv(save_path_afternoon, index=False)

print(f"CSV files saved at:\nMorning: {save_path_morning}\nAfternoon: {save_path_afternoon}")

CSV files saved at:
Morning: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/largest_chunk_csv/largest_morning_chunks.csv
Afternoon: /Users/siddhi/research_new/schas-2024-asthama-2.0/EDA-on-missing-data/largest_chunk_csv/largest_afternoon_chunks.csv
