# Upload data

In [1]:
!pip install pytest



In [23]:
import pandas as pd
import os #for viewing HTML in web browser
import matplotlib.pyplot as plt
from typing import Tuple, List # If you have functions that return more than one value, 
# they will be returned in a tuple and you need this  to write that out in typehints

In [5]:
#Open df in the browser - easiest way to view all the data in this large dataset
def to_html(df: pd.DataFrame) -> str:
    '''
    Render the df in HTML. Return the full HTML address. You can paste the HTML address in the 
    address bar to see the df at any time.
    '''
    df.to_html('df_view.html')
    full_path = os.path.abspath('df_view.html')
    print(full_path)

In [7]:
#Load file
# work_filepath = "C:\Users\kylimcqueen\Downloads\all_nts_animals_baseline_tall_00.csv"
#mac_filepath = '/Users/kyli/Desktop/Neurotrauma/all_nts_animals_baseline_tall_00.csv'
filepath=("\\Users\\kylimcqueen\\Downloads\\all_nts_animals_postinjury_tall_00.csv")

In [9]:
#Check that we grabbed the filepath
print(filepath)

\Users\kylimcqueen\Downloads\all_nts_animals_postinjury_tall_00.csv


In [11]:
#Create df
df_original = pd.read_csv(filepath)

In [13]:
#Look at the df
to_html(df_original)

C:\Git\Sleep-Analysis\df_view.html


# Data Cleaning

## Remove excluded animals

We are only using data from animals in the sham vehicle and injured vehicle groups.

In [25]:
#Check for excluded animals
def whats_in_the_col(df: pd.DataFrame, data_name: str, col_name: str) -> Tuple[int, List[str]]:
    '''
    Counts the number of cells in the specified column of the dataframe that don't 
    exactly match the string "Included".
    '''
    assert len(df[col_name]) == len(df) #Assert we're accessing the whole column

    #Assert all values in the column are of the same type (string)
    #.map(type) gets the type of each value
    #.nunique() counts number of unique types
    assert df[col_name].map(type).nunique() == 1, "Not all values are the same type"
                            
    # Get values that don't match the data_name
    non_matching_mask: list = df[col_name] != data_name
    
    # Count how many cells don't match
    count: int = non_matching_mask.sum()
    
    # Get the actual values that don't match
    non_matching_values: list = df.loc[non_matching_mask, col_name].unique().tolist()
    
    return count, non_matching_values

In [27]:
count, no_match_list = whats_in_the_col(df_original, "Included", "Included")

In [29]:
print(count) # If count = 0 then every item in the column matches the data_name, "Included"
print(no_match_list) # If no_match_list is empty, then every item in the column matches the data name, "Included"

0
[]


## Handle missing values

Data is in [TIDY](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) format. This means each row is an observation and each column is a variable. For each different cohort, there is a certain number of NaNs at the beginning and end of sleep percent column. This is because of the way the data was originally formatted in the Excel. To address missing values, we need to get rid of all the leading and lagging NaNs and keep in mind that the number of leading and lagging Nans is the same within each cohort but different between each cohort.

Note: There are 71 animals in the postinjury dataset (excluding cohort 8).

In [67]:
def trim_nan_edges(df: pd.DataFrame, cohort_col: pd.Series, mouse_col: pd.Series, data_col:pd.Series):
    '''
    Get rid of the leading and lagging NaNs at the beginning and end of the set of rows that represent 
    all observations for each mouse. Do not get rid of NaNs in the middle of observations. 
    All mice within the same cohort have the same number of leading and lagging NaNs.
    '''
    trimmed_groups: list = []

    # Loop over each cohort
    for _, cohort_group in df.groupby(cohort_col): #For the number of cohorts in Cohort
        # Loop over each UniqueMouse within the cohort
        for _, mouse_group in cohort_group.groupby(mouse_col): # For the number of unique mice in the cohort
            mouse_group: list = mouse_group.reset_index(drop=True) # Remove the index column
            valid_mask: list = mouse_group[data_col].notna() #Make a list of PercentSleep values that are not NA
            
            if valid_mask.any(): # If there are any values that are not NaNs
                # Identify this as the first row that we want to save for each mouse
                first_valid_idx = valid_mask.idxmax() 
                # Identify this as the last row we want to save for each mouse
                last_valid_idx = len(valid_mask) - valid_mask[::-1].idxmax() 
                # Take all rows from the first to last index and put them in a new list called trimmed for each mouse
                trimmed = mouse_group.iloc[first_valid_idx:last_valid_idx].copy()
                # Add that group of values to the list of values for all mice
                trimmed_groups.append(trimmed)
            
    # Combine all cleaned groups
    trimmed_df = pd.concat(trimmed_groups, ignore_index=True)
    return trimmed_df


In [None]:
trimmed_df = trim_nan_edges(df_original, "Cohort", "UniqueMouse", "PercentSleep")

In [87]:
from IPython.display import display, HTML
import pandas as pd

# For pandas DataFrames
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

In [134]:
#Testing cell for trim_nan_edges function

df = df_original
cohort_col = "Cohort"
mouse_col = "UniqueMouse"
data_col = "PercentSleep"

trimmed_groups: list = []
# Loop over each cohort
for _, cohort_group in df.groupby(cohort_col): #For the number of cohorts in Cohort
    # Loop over each UniqueMouse within the cohort
    print(f'Cohort: {_}')
    for _, mouse_group in cohort_group.groupby(mouse_col): # For the number of unique mice in the cohort
        print(f'    Unique Mouse: {_}')
        mouse_group: list = mouse_group.reset_index(drop=True) # Reset the index column so it's easier to iterate
        # .notna() returns True when there is a number there
        # So it will create a list for each UniqueMouse where False means there is a NaN and True means there's a number
        valid_mask: list = mouse_group[data_col].notna() 
        
        if valid_mask.any(): # If there are any values that are not NaNs
            # Identify this as the first row with a numerical value that we want to save for each mouse
            # Misleading because .idxmax() finds the index with the maximum value, but since we have a list
            # of bools, only T/F which is equivalent to 1/0, it will simply find the first 1 value, where 1 = True and 
            # True = not NaN
            first_valid_idx = valid_mask.idxmax() 
            print(f'        First valid index: {first_valid_idx}')
            # Identify this as the last row we want to save for each mouse
            # [start: stop: step] so start at beginning, stop at end, but go in reverse (-1)
            # Take the number of rows for that animal, subtract the index of the first number going backwards
            last_valid_idx = len(valid_mask) - valid_mask[::-1].idxmax() 
            print(f'        Last valid index: {last_valid_idx} = length of valid mask ({len(valid_mask)}) - the first number going backwards ({valid_mask[::-1].idxmax()})')
            # Take all rows from the first to last index and put them in a new list called trimmed for each mouse
            trimmed = mouse_group.iloc[first_valid_idx:last_valid_idx].copy()
            #print(f'        Length of trimmed list: {len(trimmed)}')
            print(f'        Trimmed list: {trimmed['PercentSleep']}')
            # Add that group of values to the list of values for all mice
            trimmed_groups.append(trimmed)

# Combine all cleaned groups
trimmed_df = pd.concat(trimmed_groups, ignore_index=True)
#print(trimmed_df)

Cohort: C1
    Unique Mouse: C1-01
        First valid index: 9
        Last valid index: 27 = length of valid mask (372) - the first number going backwards (345)
        Trimmed list: 9     36.50
10    68.06
11    60.17
12      NaN
13    47.22
14    50.17
15    83.00
16    56.11
17    71.78
18      NaN
19    27.94
20    38.33
21    63.67
22    14.50
23    57.39
24    64.44
25    26.56
26    69.17
Name: PercentSleep, dtype: float64
    Unique Mouse: C1-07
        First valid index: 9
        Last valid index: 27 = length of valid mask (372) - the first number going backwards (345)
        Trimmed list: 9      0.00
10     0.00
11     0.00
12      NaN
13    37.72
14    59.17
15    80.44
16    79.22
17    36.67
18      NaN
19    19.33
20    29.33
21    30.56
22    38.17
23    26.44
24    53.83
25    55.94
26    53.11
Name: PercentSleep, dtype: float64
    Unique Mouse: C1-10
        First valid index: 9
        Last valid index: 27 = length of valid mask (372) - the first number going bac

In [140]:
# Testing cell for trim_nan_edges function

import pandas as pd

# Assume df_original is already defined
df = df_original
cohort_col = "Cohort"
mouse_col = "UniqueMouse"
data_col = "PercentSleep"

trimmed_groups: list = []

# Loop over each cohort
df_grouped = df.groupby(cohort_col)
for cohort_name, cohort_group in df_grouped:
    print(f'Cohort: {cohort_name}')

    for mouse_name, mouse_group in cohort_group.groupby(mouse_col):
        print(f'    Unique Mouse: {mouse_name}')

        mouse_group = mouse_group.reset_index(drop=True)
        valid_mask = mouse_group[data_col].notna()

        if valid_mask.any():
            first_valid_idx = valid_mask.idxmax()
            last_valid_idx = valid_mask[valid_mask].index[-1]

            print(f'        First valid index: {first_valid_idx}')
            print(f'        Last valid index: {last_valid_idx}')

            trimmed = mouse_group.iloc[first_valid_idx:last_valid_idx + 1].copy()
            print(f'        Length of list: {len(trimmed)}')
            print(f'        Trimmed list: {trimmed["PercentSleep"]}')

            trimmed_groups.append(trimmed)

trimmed_df = pd.concat(trimmed_groups, ignore_index=True)


Cohort: C1
    Unique Mouse: C1-01
        First valid index: 9
        Last valid index: 345
        Length of list: 337
        Trimmed list: 9      36.50
10     68.06
11     60.17
12       NaN
13     47.22
14     50.17
15     83.00
16     56.11
17     71.78
18       NaN
19     27.94
20     38.33
21     63.67
22     14.50
23     57.39
24     64.44
25     26.56
26     69.17
27     68.33
28     32.50
29     49.61
30     24.44
31      5.78
32       NaN
33     83.11
34     72.94
35     88.89
36     71.72
37     62.06
38     69.11
39     55.17
40     59.06
41     21.17
42     30.06
43      2.50
44     16.89
45     78.22
46     41.44
47      3.61
48     73.67
49     83.56
50     51.39
51     46.44
52      2.89
53     44.72
54     12.44
55     33.44
56       NaN
57     65.78
58     85.17
59     74.33
60     56.94
61     84.44
62     15.67
63     56.89
64     64.22
65     53.17
66      9.72
67      2.11
68      2.89
69      2.22
70     14.00
71     62.72
72     22.06
73     49.56
74     41.5

I'm going to manually calculate the number of lines we should be putting in the trimmed list for one example animal and see if it matches in Python.
The example I'll use will be from the cohort with the least number of hours.

Notes:
All unique mice in each cohort have the same trimmed list length. But the number of hours in the list is wrong.
Two cohorts have list lengths of 0 - cohorts 3 and 7

In [None]:
# Remove duplicates

In [None]:
# Check and convert datatypes

In [None]:
# Clean text data

In [None]:
# Filter and select data