# AIM
- To run a cleaned-up preprocessing pipeline

Created on: 22 Mar 2022

# Workspace

In [None]:
##### MODULES
import os
import numpy as np
import pandas as pd
from pathlib import Path
from matplotlib import pyplot as plt
import matplotlib.dates
import matplotlib.patches
import datetime as dt
import feather

## custom modules
# import sys  
# sys.path.append('../../scripts')

import import_data
import clean_data
import helper
import report
import plotting

##### OPTIONS
# Switch off pandas' chained-assignment check for the whole session
# (None = neither warn nor raise).
pd.set_option("mode.chained_assignment", None)

# Auto-reload imported modules before each cell runs, so edits saved to disk
# take effect without restarting the kernel.
# `%reload_ext` (rather than `%load_ext`) keeps this cell safe to re-run.
%reload_ext autoreload
%autoreload 2

##### DIRECTORIES
# All paths are relative: the project root is taken to be two levels above
# the current working directory.
proj_dir = Path('..', '..')
source_data_dir = proj_dir.joinpath('data', 'source')
clean_data_dir = proj_dir.joinpath('data', 'clean')
viz_dir = proj_dir.joinpath('viz')

# PARAMETERS

In [None]:
# Output settings. OUTPUT_DIR is created by a later cell.
# NOTE(review): WRITE_TO_DISK is not read anywhere in the visible cells — confirm where it is used.
WRITE_TO_DISK = True
OUTPUT_DIR = clean_data_dir.joinpath("preprocessMar21")

# Clock-time window ('HH:MM') handed to the time-of-day subsetting step.
SLEEP_TIME_START = '20:00'
SLEEP_TIME_END = '10:00'

# End date used when dropping intervals recorded after the pandemic began.
PANDEMIC_CUTOFF = dt.date(year=2020, month=1, day=30)

## SUBSETTING
# Optional filters for drawing a small test sample.
# Presumably None means "no restriction" — verify against plotting.subset_intervals.
SUBSET_START_DATE = None
SUBSET_END_DATE = None
SUBSET_PID = None
#  ['0104dfff-4dcd-48ff-b912-51362f098ed0']

# Import

In [None]:
# Load the three raw objects returned by the project loader from the source-data directory.
(
    phqs_raw,
    slps_raw,
    metadata,
) = import_data.import_data(source_data_dir)

# Preprocessing using method 2 (expanding intervals to timestamps)

create output directory

In [None]:
# Create the output directory together with any missing parent folders
# (e.g. data/clean on a fresh checkout); do nothing if it already exists.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

Clean the raw `slps_raw` records into intervals (`clean_data.clean_slps`)

In [None]:
# Run the cleaning step under a timer; `intervals_all` is the input to every later subsetting cell.
with helper.Timer("clean_slps"):
    intervals_all = clean_data.clean_slps(
        slps_raw,
    )

# [clean_slps]
Elapsed: 3.2 seconds


Drop intervals after the pandemic cutoff date (`PANDEMIC_CUTOFF`)

In [None]:
# Subset the cleaned intervals using PANDEMIC_CUTOFF as the end date.
# NOTE(review): `intervals` is reassigned from `intervals_all` in the explode cell
# further down, so this filtered frame is not what gets exploded — confirm intent.
intervals = plotting.subset_intervals(
    intervals_all,
    end_date=PANDEMIC_CUTOFF,
    msg="Drop intervals after pandemic",
)

# Drop intervals after pandemic
1855957->1844095 rows (Change = -11862) (nrow after = 99.36% of before)


**!!! EXAMPLE ONLY:** subsetting the intervals to a small test sample

In [None]:
#### Subset by criteria
# intervals_example = plotting.subset_intervals(intervals_all, start_date=SUBSET_START_DATE, end_date=SUBSET_END_DATE, id=SUBSET_PID, msg="Sample a small testing set")

### Slicing
# The demo gets its own variable name so that it does not overwrite `intervals`,
# which at this point holds the pandemic-filtered result of the previous step.
intervals_example = intervals_all.head(2)

print(f"Intervals sampled: {intervals_example.shape[0]}")

Intervals sampled: 2


Explode intervals into timestamps (`clean_data.explode2ts`)

In [None]:
# Explode only the leading FRAC share of the cleaned intervals.
# NOTE(review): the sample is drawn from `intervals_all`, so the pandemic cutoff
# applied in an earlier cell is not reflected here — confirm this is intended.
FRAC = 0.1
keep_flag = int(np.floor(len(intervals_all) * FRAC))  # number of leading rows to keep
intervals = intervals_all.head(keep_flag)

# Time the expansion and report the row-count change inside the same timed block.
with helper.Timer("explode2ts"):
    timeseries = clean_data.explode2ts(intervals)
    report.report_change_in_nrow(intervals, timeseries)

185595->7682292 rows (Change = 7496697) (nrow after = 4139.28% of before)
# [explode2ts]
Elapsed: 43.2 seconds


Subset timestamps within a defined interval

In [None]:
# Restrict the exploded timestamps to the configured SLEEP_TIME_START–SLEEP_TIME_END window, under a timer.
with helper.Timer("Subset timestamps within defined interval"):
    timeseries_subset = clean_data.subset_timeseries_within_interval(
        timeseries,
        SLEEP_TIME_START,
        SLEEP_TIME_END,
    )

7682292->7054492 rows (Change = -627800) (nrow after = 91.83% of before)
# [Subset timestamps within defined interval]
Elapsed: 0.8 seconds
