In [1]:
import os
import json
import glob
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
def extract_json_from_response(text):
    # Find the start of the JSON part (first occurrence of '{')
    start_index = text.find('{')
    if start_index == -1:
        return None  # JSON start character not found
    # Extract the JSON string
    json_str = text[start_index:]
    return json_str

def clean_and_load_json_from_file(filepath):
    with open(filepath, 'r') as file:
        text = file.read()
    # Now `text` contains the content of your file, and we can process it as before
    cleaned_json_str = extract_json_from_response(text)
    if cleaned_json_str is None:
        print("No JSON content found.")
        return None
    try:
        data = json.loads(cleaned_json_str)
        df = pd.json_normalize(data, record_path=['items'])
        return df
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON: {e}")
        return None
    
def validate_and_load(filepath, max_shape=1000, verbose=True):
    df = clean_and_load_json_from_file(filepath)
    if df is None or df.shape[0] < max_shape:
        if verbose:
            print(f"File {filepath} exceeds shape limit. Skipping.")
        return None
    return df

In [3]:
def transform_data(json_file, bw_method=None, percentile=33):
    df = clean_and_load_json_from_file(json_file)

    # Convert 'date' to datetime
    df['date'] = pd.to_datetime(df['date'])
    df['hour'] = df['minute'] // 60

    # Create a datetime column by adding the hours and minutes to the 'date'
    df['datetime'] = df['date'] + pd.to_timedelta(df['hour'] , unit='h') + pd.to_timedelta(df['minute'], unit='m')

    # Normalize the activeness and steps
    scaler = MinMaxScaler()
    df[['activeness', 'steps']] = scaler.fit_transform(df[['activeness', 'steps']])

    # Filter to only rows with 0 activeness
    sleep_data = df[df['activeness'] == 0]

    # KDE requires numerical input, so convert datetime to a numerical format (e.g., Unix timestamp)
    timestamps = sleep_data['datetime'].view(np.int64)

    # Apply KDE
    kde = stats.gaussian_kde(timestamps)
    if bw_method:
        kde.set_bandwidth(bw_method=bw_method)
    else:
        kde.set_bandwidth(bw_method=kde.factor/50.)

    # Evaluate the density on a grid of the same timestamps to identify hot zones
    densities = kde(timestamps)

    # Thresholding to determine hot zones might be based on density percentiles, e.g., top 25%
    threshold = np.percentile(densities, percentile)

    # Label the original data based on whether the timestamp falls within a hot zone
    df['timestamp'] = df['datetime'].astype('int64') // 10**9  # Convert nanoseconds to seconds

    # Create a boolean mask for sleep periods based on the density threshold
    is_sleep = densities >= threshold

    # Now let's adjust the final labeling step
    df['sleep_wake_label'] = df.apply(lambda row: 'sleep' if row.name in df[df['activeness'] == 0][is_sleep].index.tolist() else 'wake', axis=1)
    return df

In [4]:
all_suj = glob.glob('../data/**/*.json', recursive=True)
filtered_suj = []
for filepath in all_suj:
    df = validate_and_load(filepath, verbose=False)
    if df is not None:
        filtered_suj.append(filepath)
print(len(all_suj))

No JSON content found.
1964


In [32]:
filtered_suj[0].split('\\')[1]

'BOGN00001'

In [5]:
data_frames = []
for filepath in filtered_suj[:100]:
    filename = filepath.split('\\')[1]
    df = transform_data(filepath)
    single_suj_df = pd.DataFrame(df['sleep_wake_label'].value_counts())
    single_suj_df['Subject Id'] = filename
    data_frames.append(single_suj_df)

  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.int64)
  timestamps = sleep_data['datetime'].view(np.

In [6]:
len(data_frames)

100

In [13]:
data_frames[0]

Unnamed: 0_level_0,count,Subject Id
sleep_wake_label,Unnamed: 1_level_1,Unnamed: 2_level_1
wake,16671,BOGN00001
sleep,5243,BOGN00001


In [12]:
pd.concat(data_frames, axis=0)

Unnamed: 0_level_0,count,Subject Id
sleep_wake_label,Unnamed: 1_level_1,Unnamed: 2_level_1
wake,16671,BOGN00001
sleep,5243,BOGN00001
wake,17260,BOGN00004
sleep,7843,BOGN00004
wake,27065,BOGN00006
...,...,...
sleep,18787,GSDV00006
wake,31080,GSDV00007
sleep,9277,GSDV00007
wake,32816,GSDV00010


In [None]:
# Split the dataset into features (X) and target (y)
X = 
y = 

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create a dictionary to store models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Train each model
for name, model in models.items():
    model.fit(X_train, y_train)

# Make predictions on the test set
for name, model in models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy}')