# Upload data

In [2]:
#import sys
#!{sys.executable} -m pip install dash

In [70]:
import pandas as pd
import os #for viewing HTML in web browser
import matplotlib.pyplot as plt
from typing import Tuple, List # If you have functions that return more than one value, 
# they will be returned in a tuple and you need this  to write that out in typehints

In [72]:
# Display Pandas dataframes such that they're easy to scroll through
from IPython.display import display, HTML
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

In [74]:
#Open df in the browser - easiest way to view all the data in this large dataset
def to_html(df: pd.DataFrame) -> str:
    '''
    Render the df in HTML. Return the full HTML address. You can paste the HTML address in the 
    address bar to see the df at any time.
    '''
    df.to_html('df_view.html')
    full_path = os.path.abspath('df_view.html')
    print(full_path)

In [76]:
#Load file
# work_filepath = "C:\\Users\\kylimcqueen\\Downloads\\all_nts_animals_baseline_tall_00.csv"
#mac_filepath = '/Users/kyli/Desktop/Neurotrauma/all_nts_animals_baseline_tall_00.csv'
filepath=("C:\\Users\\kylimcqueen\\Downloads\\all_nts_animals_postinjury_tall_01.csv")

In [78]:
#Check that we grabbed the filepath
print(filepath)

C:\Users\kylimcqueen\Downloads\all_nts_animals_postinjury_tall_01.csv


In [80]:
#Create df
original_df = pd.read_csv(filepath)

In [82]:
#Look at the df
to_html(original_df)

C:\Git\Sleep-Analysis\df_view.html


In [84]:
# Get the shape of the datast
original_df.shape

(41634, 12)

# Data Cleaning

## Remove excluded animals

We are only using data from animals in the sham vehicle and injured vehicle groups.

In [88]:
#Check for excluded animals
def whats_in_the_col(df: pd.DataFrame, data_name: str, col_name: str) -> Tuple[int, List[str]]:
    '''
    Counts the number of cells in the specified column of the dataframe that don't 
    exactly match the string "Included".
    '''
    assert len(df[col_name]) == len(df) #Assert we're accessing the whole column

    #Assert all values in the column are of the same type (string)
    #.map(type) gets the type of each value
    #.nunique() counts number of unique types
    assert df[col_name].map(type).nunique() == 1, "Not all values are the same type"
                            
    # Get values that don't match the data_name
    non_matching_mask: list = df[col_name] != data_name
    
    # Count how many cells don't match
    count: int = non_matching_mask.sum()
    
    # Get the actual values that don't match
    non_matching_values: list = df.loc[non_matching_mask, col_name].unique().tolist()
    
    return count, non_matching_values

In [90]:
count, no_match_list = whats_in_the_col(original_df, "Included", "Included")

In [92]:
# Check that this Excel file only has animals that are included in the study (animals that lived past injury)

print(f' Count: {count}.\nIf count = 0 then every item in the column matches the data_name, which, for the postinjury file, is "Included".\n\n') 
print(f' List of values that don\'t match the target value: {no_match_list}\nIf no_match_list is empty, then every item in the column matches the data name, which, for the postinjury file, is "Included".') 

 Count: 0.
If count = 0 then every item in the column matches the data_name, which, for the postinjury file, is "Included".


 List of values that don't match the target value: []
If no_match_list is empty, then every item in the column matches the data name, which, for the postinjury file, is "Included".


## Remove animals given drug

In [95]:
# Check how many animals are not vehicle
count, no_match_list = whats_in_the_col(original_df, "vehicle", "Treatment")
print(f" There are {count} cells in this column that don't match the string that you searched for")

 There are 24158 cells in this column that don't match the string that you searched for


In [97]:
# Filter the dataframe to remove animals that were treated with drug
vehicle_df = original_df.loc[original_df['Treatment'] == 'vehicle']

In [99]:
print(f"Original dataframe shape: {original_df.shape}")
print(f"Filtered dataframe shape: {vehicle_df.shape}")
print(f"Unique treatment values in filtered dataframe: {vehicle_df['Treatment'].unique()}")

Original dataframe shape: (41634, 12)
Filtered dataframe shape: (17476, 12)
Unique treatment values in filtered dataframe: ['vehicle']


In [101]:
# If you want to see the list of unique animals
unique_animals = vehicle_df['UniqueMouse'].unique()
print(f"There are {len(unique_animals)} unique animals in the dataset.\nList of unique animals: {unique_animals}")


There are 34 unique animals in the dataset.
List of unique animals: ['C1-01' 'C1-07' 'C1-14' 'C2-04' 'C2-05' 'C2-09' 'C2-13' 'C3-04' 'C3-07'
 'C3-10' 'C3-14' 'C4-01' 'C4-04' 'C4-06' 'C4-10' 'C4-15' 'C5-03' 'C5-11'
 'C5-15' 'C6-02' 'C6-04' 'C6-06' 'C6-09' 'C6-11' 'C6-15' 'C7-02' 'C7-07'
 'C7-14' 'C7-15' 'C8-05' 'C8-06' 'C8-12' 'C8-13' 'C8-16']


## Handle leading and lagging NaN values

Data is in [TIDY](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) format. This means each row is an observation and each column is a variable. For each different cohort, there is a certain number of NaNs at the beginning and end of sleep percent column. This is because of the way the data was originally formatted in the Excel. To address missing values, we need to get rid of all the leading and lagging NaNs and keep in mind that the number of leading and lagging Nans is the same within each cohort but different between each cohort.

Note: There are 71 animals in the postinjury dataset (excluding cohort 8).

Note: All UniqueMice should have 372 rows of data before trimming the leading and lagging NaNs. 372 rows per UniqueMouse * 71 UniqueMice = 26412 total rows of data, which is in agreement with the shape of the original postinjury df. Every UniqueMouse has between 300 and 400 data points and the number of datapoints is consistent within each Cohort.

In [168]:
def trim_nan_edges(df: pd.DataFrame, cohort_col: pd.Series, mouse_col: pd.Series, data_col:pd.Series):
    '''
    Get rid of the leading and lagging NaNs at the beginning and end of the set of rows that represent 
    all observations for each mouse. Do not get rid of NaNs in the middle of observations. 
    All mice within the same cohort have the same number of leading and lagging NaNs.
    '''
    trimmed_groups: list = []

    # Nested for loop to loop over each UniqueMouse within each Cohort
    df_grouped = df.groupby(cohort_col) # Group by Cohorts
    for cohort_name, cohort_group in df_grouped: # Loop over each cohort    
        for mouse_name, mouse_group in cohort_group.groupby(mouse_col): # Loop over UniqueMouse
    
            mouse_group = mouse_group.reset_index(drop=True) # Reset the index for the rows of a UniqueMouse
            #print(f' The length of the mouse group is {len(mouse_group)}. The length of the mouse group should be 514 for postinjury.') testing
            # Create a new list called valid_mask where each entry is a boolean value where T = 1 = a number and
            # F = 0 = a NaN for that Uniquemouse's SleepPercent data
            valid_mask = mouse_group[data_col].notna() 

            #If there are any NaNs
            if valid_mask.any():
                # first_valid_idx is the first index that returns true - the first number after leading NaNs
                first_valid_idx = valid_mask.idxmax()
                # last_valid_idx is the first index going backwards that returns true - the last number before lagging NaNs
                last_valid_idx = valid_mask[valid_mask].index[-1]
                #print(f' The number of rows in this list is {len(valid_mask)}') # Testing statement
    

                # Take the first_valid_idx and last_valid_idx, match them to corresponding actual values in the UniqueMouse
                # SleepPercent data, and put the actual SleepPercent values in a new list called trimmed
                trimmed = mouse_group.iloc[first_valid_idx:last_valid_idx + 1].copy() 
                #print(len(trimmed)) #Testing statement
                # Add the trimmed SleepPercent column from one UniqueMouse to the trimmed_groups list, which is a list
                # of trimmed values for all UniqueMice across all Cohorts
                trimmed_groups.append(trimmed) # Add the trimmed SleepPercent column to the 
    
    trimmed_df = pd.concat(trimmed_groups, ignore_index=True)
    return trimmed_df

In [None]:
# Run trim_nan_edges function
trimmed_df = trim_nan_edges(vehicle_df, "Cohort", "UniqueMouse", "PercentSleep")

In [108]:
to_html(trimmed_df)

C:\Git\Sleep-Analysis\df_view.html


## Remove unnecessary columns

There are multiple ways to do this. You can mask, like df[['Column name', 'Next column name', 'etc']]. 
You can use .loc to select specific parts of the df.

Using .loc is likely the more stable way because when you mask it creates a something like a temporary DataFrame whose relationship to the original dataframe is unclear using a chained indexing operation. This means its not always clear whether the dataframe returned by masking is a view of the original dataframe or a copy of it. Therefore, when you try to further modify a mask of the dataframe, Pandas isn't always sure whether to modify the original or the view/copy. So you sometimes get SettingWithCopyWarnings.

Using df.loc[:,columns] grabs all the rows from whatever columns you're interested in preserving. It's a single indexing operation that returns a view of the original dataframe. When you make changes with .loc, they will affect the original DataFrame.

In [110]:
trimmed_df.columns

Index(['StudyPart', 'UniqueMouse', 'Cohort', 'InjuredGroup', 'Treatment',
       'Included', 'Lights', 'HourOfDay', 'CumulativeHour', 'DoseMarker',
       'PercentSleep', 'SleepBout'],
      dtype='object')

In [112]:
# Identify the relevant columns for later analysis
relevant_cols = ['UniqueMouse', 'Cohort', 'InjuredGroup', 'Treatment','HourOfDay', 'CumulativeHour', 'PercentSleep']

In [None]:
# Remove unnecessary columns
clean_df = trimmed_df.loc[:,relevant_cols]
clean_df

## Remove duplicates

In [117]:
duplicate_count = clean_df.duplicated().sum()
print(f' Number of duplicate rows: {duplicate_count}')

 Number of duplicate rows: 0


## Check and convert datatypes

### Change object datatypes to categories

In [121]:
print(f'DataFrame information: \n{clean_df.info()}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12216 entries, 0 to 12215
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   UniqueMouse     12216 non-null  object 
 1   Cohort          12216 non-null  object 
 2   InjuredGroup    12216 non-null  object 
 3   Treatment       12216 non-null  object 
 4   HourOfDay       12216 non-null  int64  
 5   CumulativeHour  12216 non-null  int64  
 6   PercentSleep    12129 non-null  float64
dtypes: float64(1), int64(2), object(4)
memory usage: 668.2+ KB
DataFrame information: 
None


In [123]:
'''Convert columns with type 'object' to categories because all data in these columns are limited unique values that repeat (like group identifiers).
Category designation stores data as categorical/enumerated values, is more memory-efficient, and is faster for operations on columns.'''
def turn_obj_cols_to_categories(df: pd.DataFrame) -> pd.DataFrame:
    obj_cols = df.select_dtypes(include='object').columns
    for col in obj_cols:
        df[col] = df[col].astype('category')
    return df

In [None]:
turn_obj_cols_to_categories(clean_df)

In [127]:
clean_df.dtypes

UniqueMouse       category
Cohort            category
InjuredGroup      category
Treatment         category
HourOfDay            int64
CumulativeHour       int64
PercentSleep       float64
dtype: object

# Handle all other NaNs

In [None]:
# See the % of NaNs per total data for each UniqueMouse
na_summary = clean_df["PercentSleep"].isna().groupby(clean_df["UniqueMouse"]).mean()
# Print the % NaNs per total data for each UniqueMouse from largest to smallest
print(f' There are a total of {len(na_summary)} mice.')
print(type(na_summary.sort_values(ascending=False)))
print(na_summary.sort_values(ascending=False))

'''
The vast majority of animals with NaN values are from Cohort 5. 
'''

### How do we fill NaNs?

Options: 

a. Drop them. This is likely to break analysis.
df = df.dropna(subset=['PercentSleep'])

b. Forward fill or backward fill because sleep is continuous and doesn't change abruptly.
df['PercentSleep'] = df.groupby('UniqueMouse')['PercentSleep'].ffill().bfill() (I think choose either ffill or bfill)

c. Interpolate. This is a good process to use for evenly spaced time series (ex: hourly data) because it creates a straight line between
two adjacent data points and estimates the value between them. It considers the lienar relationshp between the known data points (Medium article)[https://medium.com/@datasciencewizards/preprocessing-and-data-exploration-for-time-series-handling-missing-values-e5c507f6c71c]
df["PercentSleep"] = df.groupby("UniqueMouse")["PercentSleep"].transform(lambda x: x.interpolate())

d. Flag and leave. Can make a new column to flag for missingness and ealuate that later?

In [132]:
# Interpolation
clean_df["PercentSleep"] = clean_df.groupby("UniqueMouse", observed=False)["PercentSleep"].transform(lambda x: x.interpolate())

In [134]:
# Check that there's no NaNs now
na_summary = clean_df["PercentSleep"].isna().groupby(clean_df["UniqueMouse"]).mean()
print(f' There are a total of {len(na_summary)} mice.')
print(type(na_summary.sort_values(ascending=False)))
#print(na_summary.sort_values(ascending=False))

#Maybe replace this with an assert statement


 There are a total of 34 mice.
<class 'pandas.core.series.Series'>


  na_summary = clean_df["PercentSleep"].isna().groupby(clean_df["UniqueMouse"]).mean()


In [136]:
# Make a dictionary of dataframes so that you can view each cohort independently
# Each key is the name of the cohort (ex: C2, C3, etc) and the value is the datafarme of that cohort only
cohorts_dict = {cohort: data for cohort, data in clean_df.groupby('Cohort')}

  cohorts_dict = {cohort: data for cohort, data in clean_df.groupby('Cohort')}


In [None]:
# Check a few cohorts to make sure they look normal
display(cohorts_dict['C7'])

In [148]:
to_html(cohorts_dict['C1'])

C:\Git\Sleep-Analysis\df_view.html


# Data size

In [172]:
# Number of unique animals after data cleaning
unique_animals = clean_df['UniqueMouse'].nunique()
print(f'There are {unique_animals} unique animals in the dataset after initial data cleaning')

There are 34 unique animals in the dataset after initial data cleaning


After initial data cleaning and visualization, I found that animals in cohort 5 were missing 25 hours of data.  I will drop the animals in cohort 5 becuase they are missing more than 24 hours of data.

# Visualization

In [185]:
# Remove cohort 5 animals
# The new df clean_df is equal to the old clean_df where we grouped by Cohort and took all the data where 'Cohort' is not equal to 5
clean_df = clean_df.loc[clean_df['Cohort'] != 5]

In [201]:
print(clean_df['C5'])
# When I return from lunch, drop cohort 5 before ceating cohorts_dict

KeyError: 'C5'

In [175]:
#Produce interactive graphs of sleep percent per hour for each cohort using Plotly instead of Seaborn - more interactive

from dash import Dash, html, dcc, Input, Output
import plotly.express as px
import pandas as pd

df = clean_df  

app = Dash(__name__)

app.layout = html.Div([
    html.H4('Hourly Percent Sleep per Cohort'),
    dcc.Graph(id="graph"),
    dcc.Checklist(
        id="checklist",
        options=[{"label": cohort, "value": cohort} for cohort in df['Cohort'].unique()],
        value=[df['Cohort'].unique()[0]],  # Default selected cohort(s)
        inline=True  # Display checklist items horizontally
    ),
])


@app.callback(
    Output("graph", "figure"), 
    Input("checklist", "value"))
def update_line_chart(selected_cohorts):
    mask = df['Cohort'].isin(selected_cohorts)
    fig = px.line(
        df[mask], 
        x="CumulativeHour", 
        y="PercentSleep", 
        color='UniqueMouse', 
        line_group='UniqueMouse',  # Group lines by each mouse
        title="Hourly Percent Sleep for Selected Cohorts"
    )
    fig.update_layout(
        xaxis_title="Hour of Day",
        yaxis_title="Percent Sleep",
        legend_title="Mouse ID",
        template="plotly_white",
        hovermode="x unified"  # Unified hover for better interactivity
    )
    return fig


if __name__ == "__main__":
    app.run(debug=True)


In [179]:
# interactive_plot.py

from dash import Dash, html, dcc, Input, Output
import plotly.graph_objects as go
import pandas as pd

# Assumed: clean_df is already available and cleaned
# Required columns: 'Cohort', 'UniqueMouse', 'CumulativeHour', 'PercentSleep'
df = clean_df

# User-defined color map for each cohort
cohort_colors = {
    'Cohort A': '#1f77b4',  # Blue
    'Cohort B': '#ff7f0e',  # Orange
    'Cohort C': '#2ca02c',  # Green
    # Add more cohorts and their desired color codes here
}

# Dash styles to visually distinguish individual animals in a cohort
dash_styles = ['solid', 'dot', 'dash', 'dashdot']

app = Dash(__name__)

app.layout = html.Div([
    html.H4('Hourly Percent Sleep per Cohort'),
    dcc.Graph(id="graph"),
    dcc.Checklist(
        id="checklist",
        options=[{"label": cohort, "value": cohort} for cohort in df['Cohort'].unique()],
        value=[df['Cohort'].unique()[0]],
        inline=True
    ),
])

@app.callback(
    Output("graph", "figure"), 
    Input("checklist", "value"))
def update_line_chart(selected_cohorts):
    fig = go.Figure()

    for cohort in selected_cohorts:
        cohort_df = df[df['Cohort'] == cohort]
        mice = cohort_df['UniqueMouse'].unique()
        base_color = cohort_colors.get(cohort, '#000000')  # fallback to black

        for i, mouse in enumerate(mice):
            mouse_df = cohort_df[cohort_df['UniqueMouse'] == mouse]
            dash_style = dash_styles[i % len(dash_styles)]

            fig.add_trace(go.Scatter(
                x=mouse_df['CumulativeHour'],
                y=mouse_df['PercentSleep'],
                mode='lines',
                name=f"{cohort} - {mouse}",
                line=dict(color=base_color, dash=dash_style)
            ))

    fig.update_layout(
        title="Hourly Percent Sleep for Selected Cohorts",
        xaxis_title="Hour of Day",
        yaxis_title="Percent Sleep",
        legend_title="Cohort - Mouse",
        template="plotly_white",
        hovermode="x unified"
    )

    return fig

if __name__ == "__main__":
    app.run(debug=True)
