In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2

In [2]:
cd ~/demres

/Users/zurfarosa/demres


In [3]:
import os
import sys

# Put the parent of the current working directory on the import path
# (added once only, so re-running the cell does not duplicate the entry).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
import numpy as np
from datetime import date, timedelta
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor 

import demres
from demres.common.constants import entry_type
from demres.common import codelists
from demres.common.process_pt_features import *
from demres.common.process_entries import *
from demres.demins.constants import Study_Design as sd
from demres.demins.statistical_functions import *
from common.helper_functions import *

In [4]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [5]:
# Lift the column display limit so wide DataFrames are shown without truncation.
pd.options.display.max_columns = None

In [17]:
# Load the processed patient-feature table (path is relative to the working
# directory set earlier). The three date columns are parsed to datetimes.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.x and only
# ever affected parsing speed, not the parsed values. `delimiter=','` is the
# read_csv default and is omitted as well.
pt_features = pd.read_csv(
    'data/pt_data/processed_data/pt_features_demins_all_dementia_10_to_5.csv',
    parse_dates=['index_date', 'data_end', 'data_start'],
)

In [28]:
# Columns of pt_features that must NOT end up in training_cols.
# NOTE(review): judging by their names, these are identifiers/dates/bookkeeping
# fields plus banded ("name:band") and "*_any" versions of predictors whose
# continuous counterparts are kept -- confirm against the feature-building code.
columns_not_for_inclusion = [
    # identifier, date and bookkeeping columns
    'patid', 'pracid', 'index_date', 'isCase', 'yob',
    'final dementia medcode', 'data_start', 'data_end', 'matchid',
    'reason_for_removal',
    'non_insomnia_GP_consultations',

    # insomnia_* columns
    'insomnia_any',
    'insomnia_count:0', 'insomnia_count:1_5', 'insomnia_count:above_5',

    # non_insomnia_GP_consultations:* columns
    'non_insomnia_GP_consultations:0',
    'non_insomnia_GP_consultations:1_10',
    'non_insomnia_GP_consultations:11_100',
    'non_insomnia_GP_consultations:101_1000',
    'non_insomnia_GP_consultations:above_1000',

    # age_at_index_date:* columns
    'age_at_index_date:65-69', 'age_at_index_date:70-74',
    'age_at_index_date:75-79', 'age_at_index_date:80-84',
    'age_at_index_date:85-89', 'age_at_index_date:90-99',
    'age_at_index_date:above_99',

    # antidepressant_pdds:* columns
    'antidepressant_pdds:00000', 'antidepressant_pdds:00001_10',
    'antidepressant_pdds:00011_100', 'antidepressant_pdds:00101_1000',
    'antidepressant_pdds:01001_10000', 'antidepressant_pdds:10000_and_above',

    # antipsychotic_pdds:* columns
    'antipsychotic_pdds:00000', 'antipsychotic_pdds:00001_10',
    'antipsychotic_pdds:00011_100', 'antipsychotic_pdds:00101_1000',
    'antipsychotic_pdds:01001_10000', 'antipsychotic_pdds:10000_and_above',

    # other_sedative_pdds:* columns
    'other_sedative_pdds:00000', 'other_sedative_pdds:00001_10',
    'other_sedative_pdds:00011_100', 'other_sedative_pdds:00101_1000',
    'other_sedative_pdds:01001_10000', 'other_sedative_pdds:10000_and_above',

    # benzodiazepine columns
    'benzodiazepines_any',
    'benzodiazepine_pdds:00000', 'benzodiazepine_pdds:00001_10',
    'benzodiazepine_pdds:00011_100', 'benzodiazepine_pdds:00101_1000',
    'benzodiazepine_pdds:01001_10000', 'benzodiazepine_pdds:10000_and_above',

    # lithium columns
    'lithium_any',
    'lithium_pdds:00000', 'lithium_pdds:00001_10',
    'lithium_pdds:00011_100', 'lithium_pdds:00101_1000',
    'lithium_pdds:01001_10000', 'lithium_pdds:10000_and_above',

    # z-drug columns
    'z_drugs_any',
    'z_drug_pdds:00000', 'z_drug_pdds:00001_10',
    'z_drug_pdds:00011_100', 'z_drug_pdds:00101_1000',
    'z_drug_pdds:01001_10000', 'z_drug_pdds:10000_and_above',

    # mood_stabilisers_and_AED_pdds:* columns
    'mood_stabilisers_and_AED_pdds:00000',
    'mood_stabilisers_and_AED_pdds:00001_10',
    'mood_stabilisers_and_AED_pdds:00011_100',
    'mood_stabilisers_and_AED_pdds:00101_1000',
    'mood_stabilisers_and_AED_pdds:01001_10000',
    'mood_stabilisers_and_AED_pdds:10000_and_above',
]

# Keep every remaining column, in the order it appears in pt_features.
_excluded = set(columns_not_for_inclusion)
training_cols = [name for name in pt_features.columns if name not in _excluded]

In [29]:
# Display the columns that survived the exclusion list.
training_cols

['female',
 'age_at_index_date',
 'insomnia',
 'stroke',
 'heart_failure',
 'sleep_apnoea',
 'chronic_pulmonary_disease',
 'epilepsy',
 'other_sedatives_100_pdds',
 'antipsychotics_100_pdds',
 'antidepressants_100_pdds',
 'mental_illness',
 'mood_stabilisers_and_AEDs_100_pdds',
 'lithium_100_pdds',
 'z_drugs_100_pdds',
 'benzodiazepines_100_pdds']

In [30]:
# Preview the selected predictor columns. Only the first rows are shown so the
# notebook does not render the entire patient table.
pt_features[training_cols].head(10)

Unnamed: 0,female,age_at_index_date,insomnia,stroke,heart_failure,sleep_apnoea,chronic_pulmonary_disease,epilepsy,other_sedatives_100_pdds,antipsychotics_100_pdds,antidepressants_100_pdds,mental_illness,mood_stabilisers_and_AEDs_100_pdds,lithium_100_pdds,z_drugs_100_pdds,benzodiazepines_100_pdds
0,1,76,0,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
1,0,79,0,1,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
2,0,91,0,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
3,0,83,0,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
4,0,78,0,0,0,0,1,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
5,1,93,0,0,0,0,0,0,0.000000,2.585689,0.000000,0,0.000000,0.0,0.000000,0.000000
6,1,70,0,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000
7,1,88,1,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,13.813873
8,0,71,0,0,0,0,0,0,0.000000,0.000000,0.000000,1,0.000000,0.0,0.000000,0.000000
9,1,86,0,0,0,0,0,0,0.000000,0.000000,0.000000,0,0.000000,0.0,0.000000,0.000000


In [31]:
# Predictor matrix: all rows of pt_features, restricted to the retained columns.
X = pt_features.loc[:, training_cols]

In [32]:
# Check which columns ended up in the predictor matrix.
X.columns

Index(['female', 'age_at_index_date', 'insomnia', 'stroke', 'heart_failure',
       'sleep_apnoea', 'chronic_pulmonary_disease', 'epilepsy',
       'other_sedatives_100_pdds', 'antipsychotics_100_pdds',
       'antidepressants_100_pdds', 'mental_illness',
       'mood_stabilisers_and_AEDs_100_pdds', 'lithium_100_pdds',
       'z_drugs_100_pdds', 'benzodiazepines_100_pdds'],
      dtype='object')

In [33]:
# No extra drop is needed here: 'non_insomnia_GP_consultations' is already listed in
# columns_not_for_inclusion, so it never reaches X.

In [34]:
# (rows, columns) of the predictor matrix.
X.shape

(30418, 16)

In [35]:
# Variance inflation factor for every predictor column in X.
#
# The design matrix is extracted once as a plain array and addressed by column
# position. The previous `X[variables]` looked up integer positions as column
# *labels*, which raises KeyError on current pandas because the columns are
# strings, and it rebuilt the array on every iteration.
#
# NOTE(review): statsmodels treats `exog` as the full regression design matrix,
# and X contains no constant column, so these VIFs are computed without an
# intercept. Confirm this is intended; adding a constant would change the values.
variables = list(range(X.shape[1]))
exog = X.values
vif = list(zip(X.columns, [variance_inflation_factor(exog, ix) for ix in variables]))
vif

[('female', 3.0452256544627345),
 ('age_at_index_date', 3.6589093164408881),
 ('insomnia', 1.0483274860849985),
 ('stroke', 1.1183370448067902),
 ('heart_failure', 1.0693980357716892),
 ('sleep_apnoea', 1.0038790210259516),
 ('chronic_pulmonary_disease', 1.2558156023691587),
 ('epilepsy', 1.1291951273862113),
 ('other_sedatives_100_pdds', 1.0100696396703424),
 ('antipsychotics_100_pdds', 1.0462395826324096),
 ('antidepressants_100_pdds', 1.1355115413635091),
 ('mental_illness', 1.5429053086205959),
 ('mood_stabilisers_and_AEDs_100_pdds', 1.1208531102667167),
 ('lithium_100_pdds', 1.0368995500179452),
 ('z_drugs_100_pdds', 1.036208083969288),
 ('benzodiazepines_100_pdds', 1.086226201166488)]