In [127]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [128]:
cd ~/demres

/Users/zurfarosa/demres


In [155]:
import os
import sys

# Put the parent of the current working directory on sys.path (once only),
# so packages living next to / above the working directory can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from datetime import date, timedelta

import demres
from demres.common.constants import entry_type
from demres.common import codelists
from demres.common.helper_functions import *
from demres.common.process_raw_data import *
from demres.demins.constants import Study_Design
from demres.demins.functions import get_insomnia_event_count

In [139]:
# Load the per-patient feature table. The three date columns are parsed into
# datetimes on load so they can be compared with event dates later on.
# NOTE(review): `infer_datetime_format` is deprecated in pandas >= 2.0 and can
# be dropped once the project no longer targets older pandas releases.
pt_features = pd.read_csv(
    'data/pt_data/processed_data/pt_features_demins.csv',
    delimiter=',',
    parse_dates=['index_date', 'data_end', 'data_start'],
    infer_datetime_format=True,
)

In [140]:
# Peek at the first three rows to confirm the feature table loaded as expected.
pt_features.head(3)

Unnamed: 0,patid,gender,yob,pracid,index_date,isCase,final dementia medcode,data_start,data_end,matchid
0,12440330,2,1925,330,2010-03-30,True,4693.0,2000-03-22,2010-04-09,50163.0
1,16055443,2,1927,443,2010-12-20,True,1350.0,2000-12-15,2011-01-13,65618.0
2,2994148,2,1919,148,2007-02-15,True,1916.0,1997-01-20,2007-02-21,21685.0


In [141]:
# Sanity check: data_end should be a datetime64 column, not plain strings.
pt_features.data_end.dtype

dtype('<M8[ns]')

In [132]:
# Load the medcoded entries table from the HDF file
# (path is relative to the current working directory).
medcoded_entries = pd.read_hdf('hdf/medcoded_entries.hdf')

In [142]:
# Peek at the first three entries to confirm the expected columns are present.
medcoded_entries.head(3)

Unnamed: 0,patid,eventdate,sysdate,medcode,type
0,57001,1994-10-31,1997-03-09,3550,2
1,57001,1994-10-31,1997-03-09,9897,2
2,57001,1993-01-28,1997-03-09,4447,2


In [97]:
# Sanity check: sysdate should already be stored as datetime64.
medcoded_entries.sysdate.dtype

dtype('<M8[ns]')

In [149]:
# Shorter alias used by the cells below. This is a plain name binding, not a
# copy: `entries` and `medcoded_entries` refer to the same DataFrame object.
entries = medcoded_entries

In [223]:
# Re-inspect pt_features before the insomnia count columns are merged into it.
pt_features.head(3)

Unnamed: 0,patid,gender,yob,pracid,index_date,isCase,final dementia medcode,data_start,data_end,matchid
0,12440330,2,1925,330,2010-03-30,True,4693.0,2000-03-22,2010-04-09,50163.0
1,16055443,2,1927,443,2010-12-20,True,1350.0,2000-12-15,2011-01-13,65618.0
2,2994148,2,1919,148,2007-02-15,True,1916.0,1997-01-20,2007-02-21,21685.0


In [224]:
# Build `insom_events`: one row per patient per calendar month in which the
# patient has at least one insomnia-coded entry, with the number of such
# entries in that month (`insom_count`) and the patient's `index_date`.
insomnia_medcodes = get_medcodes_from_readcodes(codelists.insomnia_readcodes)
insom_events = entries[entries['medcode'].isin(insomnia_medcodes)]
# Entries without an eventdate cannot be placed in a month, so drop them
# (a small number of rows - only about 64).
insom_events = insom_events[insom_events['eventdate'].notnull()]
# Count entries per (patient, month). Grouping on the two keys directly avoids
# relying on groupby().resample() keeping the grouping column `patid` around as
# a data column to count, which newer pandas versions deprecate.
# The monthly bins are labelled with the month-end date, so `eventdate` below
# is the last day of the month, not the original entry date.
insom_events = (
    insom_events
    .groupby(['patid', pd.Grouper(key='eventdate', freq='M')])
    .size()
    .reset_index(name='insom_count')
)
# Keep only months that actually contain an insomnia entry.
insom_events = insom_events[insom_events['insom_count'] > 0]
# Attach each patient's index_date. The join key is named explicitly so that
# extra columns in pt_features can never silently become part of the join.
insom_events = pd.merge(insom_events, pt_features, how='inner', on='patid')[['patid','eventdate','insom_count','index_date']]

In [225]:
# Peek at the monthly insomnia table: one row per patient per month-end label.
insom_events.head(3)

Unnamed: 0,patid,eventdate,insom_count,index_date
0,4657,1997-01-31,1,2010-07-13
1,4657,1998-12-31,1,2010-07-13
2,8667,1996-10-31,1,2009-10-26


In [226]:
# For every exposure window returned by get_windows(), add a column
# `insom_count_window<N>` to pt_features holding, per patient, the number of
# insom_events rows whose eventdate falls inside that window (0 when none).
# NOTE(review): insom_events holds one row per patient-month, so `.count()`
# below yields the number of months with an insomnia entry, not the total
# number of entries. Use `.sum()` if the total is what is intended - confirm.
for window_count, window in enumerate(get_windows()):
    window_count = str(window_count)
    window_col = 'insom_count_window' + window_count
    print(window_count)
    # Restrict insomnia event counts to those that occur during exposure window:
    # between (index_date - end_latency) and (index_date - start_latency), inclusive.
    latest_allowed = insom_events['index_date'] - window['start_latency']
    earliest_allowed = insom_events['index_date'] - window['end_latency']
    relevant_event_mask = (insom_events['eventdate'] <= latest_allowed) & (insom_events['eventdate'] >= earliest_allowed)
    window_insom_events = (
        insom_events.loc[relevant_event_mask]
        .groupby('patid')['insom_count']
        .count()
        .reset_index()
    )
    window_insom_events.columns = ['patid', window_col]
    # Drop any column of the same name left over from an earlier run of this
    # cell (or already present in the CSV). Without this, the stale column
    # would collide with the new one in the merge below.
    pt_features = pt_features.drop([window_col], axis=1, errors='ignore')
    # Merge on patid only; patients with no rows in the window get NaN here...
    pt_features = pd.merge(pt_features, window_insom_events, how='left', on='patid')
    # ...which is turned into an integer 0. Assigning the result back (rather
    # than fillna(inplace=True) on a column selection) is required for the
    # update to reach pt_features under pandas copy-on-write.
    pt_features[window_col] = pt_features[window_col].fillna(0).astype(int)

0
1
2


In [227]:
# Confirm the insom_count_window* columns were added to pt_features.
pt_features.head(3)

Unnamed: 0,patid,gender,yob,pracid,index_date,isCase,final dementia medcode,data_start,data_end,matchid,insom_count_window0,insom_count_window1,insom_count_window2
0,12440330,2,1925,330,2010-03-30,True,4693.0,2000-03-22,2010-04-09,50163.0,0,0,0
1,16055443,2,1927,443,2010-12-20,True,1350.0,2000-12-15,2011-01-13,65618.0,0,0,0
2,2994148,2,1919,148,2007-02-15,True,1916.0,1997-01-20,2007-02-21,21685.0,0,1,1


In [221]:
# Disabled scratch line: when uncommented it removes the window0/window1 count
# columns from pt_features (presumably to reset state before re-running the
# window loop - confirm, and delete this cell if no longer needed).
# pt_features = pt_features.drop(['insom_count_window0','insom_count_window1'],axis=1)

In [228]:
# Show only the ten patients with the highest window-0 insomnia counts,
# instead of rendering the entire sorted table as cell output.
pt_features.sort_values(by='insom_count_window0', ascending=False).head(10)

Unnamed: 0,patid,gender,yob,pracid,index_date,isCase,final dementia medcode,data_start,data_end,matchid,insom_count_window0,insom_count_window1,insom_count_window2
7712,6762193,2,1917,193,2009-02-05,False,,1997-11-14,2013-04-22,29000.0,27,1,0
9452,5114043,2,1916,43,2008-11-08,True,42279.0,1996-01-05,2012-03-10,6073.0,11,0,0
413,2287025,2,1944,25,2011-03-22,True,6578.0,2000-04-27,2011-10-12,4002.0,9,0,0
10778,5842194,2,1939,194,2010-05-06,True,6578.0,1996-08-09,2013-05-10,29106.0,9,7,2
12518,8291322,2,1921,322,2009-10-08,False,,1995-09-21,2013-04-22,49130.0,9,28,21
6950,1570199,1,1928,199,2006-07-11,True,4693.0,1996-05-12,2011-05-20,30000.0,6,5,1
2424,10871398,2,1919,398,2009-04-21,True,4693.0,1997-11-28,2011-01-20,58999.0,6,0,0
8534,199159,2,1931,159,2009-03-30,True,1350.0,1997-07-27,2013-05-13,23587.0,6,3,1
3623,179091,1,1934,91,2006-06-15,True,7664.0,1995-08-18,2009-03-05,12680.0,5,2,11
6660,11218031,2,1920,31,2009-03-02,False,,1996-08-27,2011-07-06,5094.0,5,3,0


In [230]:
# Rows of pt_features flagged as cases (isCase is True).
is_case_mask = pt_features['isCase'] == True
cases = pt_features.loc[is_case_mask]

In [231]:
# Rows of pt_features flagged as controls (isCase is False).
is_control_mask = pt_features['isCase'] == False
controls = pt_features.loc[is_control_mask]

In [232]:
# Mean of insom_count_window0 across cases.
cases['insom_count_window0'].mean()

0.13300879396984924

In [233]:
# Mean of insom_count_window0 across controls, for comparison with the cases.
controls['insom_count_window0'].mean()

0.12060301507537688

In [234]:
# Mean of insom_count_window1 across cases.
cases['insom_count_window1'].mean()

0.13159547738693467

In [235]:
# Mean of insom_count_window1 across controls, for comparison with the cases.
controls['insom_count_window1'].mean()

0.11652010050251256

In [236]:
# Mean of insom_count_window2 across cases.
cases['insom_count_window2'].mean()

0.12327261306532664

In [237]:
# Mean of insom_count_window2 across controls, for comparison with the cases.
controls['insom_count_window2'].mean()

0.096262562814070349

In [238]:
# Sanity check: number of case rows and control rows.
len(cases),len(controls)

(6368, 6368)

In [239]:
# Persist the feature table, now including the insom_count_window* columns.
# NOTE(review): this overwrites the same CSV that this notebook reads
# pt_features from, so a later run starts from a table that already contains
# the insomnia count columns. Consider writing to a separate file.
pt_features.to_csv('data/pt_data/processed_data/pt_features_demins.csv',index=False)