In [1]:
""" My first attempt looking at this data in the context of graphical models """

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import pgmpy

import warnings
warnings.filterwarnings("ignore")

sns.set(style="ticks")
sns.set_context(context="talk")

In [2]:
infile = "/Users/kmcmanus/Documents/classes/digitalhealth_project/data/formatted_data/20200628_sleep_pos_5S_cleaned.csv"
df = pd.read_csv(infile, index_col='datetime', parse_dates=True, infer_datetime_format=True)
df["sleep_night"] = pd.to_datetime(df["sleep_night"])
df = df[["ODI", "orient_bin", "Pulse Rate(bpm)", "hour"]]
df = df.dropna()
#df = df[df["hour"] < 5]
#df.loc[df["Pulse Rate(bpm)"] > 70, "Pulse Rate(bpm)"] = 70
#df.loc[df["Pulse Rate(bpm)"] < 70, "Pulse Rate(bpm)"] = 60
df.loc[df["hour"] < 12, "hour"] = 0
df.loc[df["hour"] > 12, "hour"] = 1
print("Total # rows: {}".format(df.shape[0]))
df.head()

Total # rows: 100818


Unnamed: 0_level_0,ODI,orient_bin,Pulse Rate(bpm),hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-21 20:59:00,0.0,1.0,65.0,1
2020-04-21 20:59:05,0.0,1.0,64.0,1
2020-04-21 20:59:10,0.0,1.0,64.0,1
2020-04-21 20:59:15,0.0,1.0,64.0,1
2020-04-21 20:59:20,0.0,1.0,70.0,1


In [3]:
from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.estimators import BayesianEstimator


# orient_bin -> ODI
# hour -> ODI
# Pulse Rate -> ODI
model = BayesianModel([('orient_bin', 'ODI'), ('hour', 'ODI'), ('Pulse Rate(bpm)', 'ODI')]) 

from pgmpy.estimators import ParameterEstimator # Literally had to go in an manually change this file
from pgmpy.estimators import BDeuScore 
pe = ParameterEstimator(model, df)
print("\n", pe.state_counts('orient_bin'))  # unconditional
print("\n", pe.state_counts('hour'))  # unconditional
print("\n", pe.state_counts('Pulse Rate(bpm)'))  # unconditional
print("\n", pe.state_counts('ODI'))  # conditional on orient_bin, hour, pulse rate


       orient_bin
-1.0       28653
 0.0       29534
 1.0       42631

     hour
0  62687
1  38131

       Pulse Rate(bpm)
47.0                1
50.0                1
51.0               31
52.0              116
53.0              592
54.0             2270
55.0             4540
56.0             6876
57.0             7631
58.0             8074
59.0             8908
60.0             9046
61.0             8799
62.0             7355
63.0             6810
64.0             6843
65.0             5490
66.0             4692
67.0             3387
68.0             2921
69.0             2094
70.0             1384
71.0              844
72.0              563
73.0              346
74.0              200
75.0              194
76.0              120
77.0              111
78.0              125
79.0               74
80.0               56
81.0               49
82.0               44
83.0               35
84.0               46
85.0               28
86.0               29
87.0               30
88.0               

In [4]:
pe.state_names['hour']

[0, 1]

In [16]:
#model.fit(df, estimator=BayesianEstimator, prior_type="BDeu") # default equivalent_sample_size=5
model.fit(df, estimator=BayesianEstimator, prior_type="K2")
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable))
    print(type(cpd))
    print(cpd)



CPD of orient_bin:
<class 'pgmpy.factors.discrete.CPD.TabularCPD'>
+------------------+----------+
| orient_bin(-1.0) | 0.284207 |
+------------------+----------+
| orient_bin(0.0)  | 0.292945 |
+------------------+----------+
| orient_bin(1.0)  | 0.422848 |
+------------------+----------+
CPD of ODI:
<class 'pgmpy.factors.discrete.CPD.TabularCPD'>
+-----------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+------

In [17]:
# Ok what do I do with this?
# I can use a graphical model to improve accuracy, once I have the features
#ok = model.get_cpds(node=["ODI", "Pulse Rate(bpm)", "hour", "orient_bin"])
ok = model.get_cpds(node="ODI")

In [18]:
#print(ok)
print(ok.variables)
print(ok.get_cardinality(variables=["ODI", "Pulse Rate(bpm)", "hour", "orient_bin"]))
#print(ok.maximize(["ODI"]))
okf = ok.to_factor()
#okf_max = okf.maximize(["hour"])
#print(okf_max)
print(okf.values)
#print(ok)
print(okf)
#print(okf.values[1]) # ODI x PulseRate x hour x orient_bin
from numpy import unravel_index
max_loc = unravel_index(np.argmax(okf.values[1]), okf.values[1].shape)
#okf.values[1][1][4][2] # ODI=1, PulseRate=2nd lowest, hour=4th hour, orient_bin=2nd bin
#print(np.argmax(okf.values[1]))

min_loc = unravel_index(np.argmin(okf.values[1]), okf.values[1].shape)
print(max_loc)
print(min_loc)

['ODI', 'Pulse Rate(bpm)', 'hour', 'orient_bin']
{'ODI': 2, 'Pulse Rate(bpm)': 45, 'hour': 2, 'orient_bin': 3}
[[[[5.00000000e-01 5.00000000e-01 6.66666667e-01]
   [5.00000000e-01 5.00000000e-01 5.00000000e-01]]

  [[6.66666667e-01 5.00000000e-01 5.00000000e-01]
   [5.00000000e-01 5.00000000e-01 5.00000000e-01]]

  [[5.00000000e-01 8.80000000e-01 8.88888889e-01]
   [5.00000000e-01 5.00000000e-01 3.33333333e-01]]

  [[9.09090909e-01 9.69230769e-01 9.76744186e-01]
   [5.00000000e-01 5.00000000e-01 2.00000000e-01]]

  [[9.90654206e-01 9.95867769e-01 9.95833333e-01]
   [6.66666667e-01 8.33333333e-01 3.33333333e-01]]

  [[9.98113208e-01 9.94784876e-01 9.98944034e-01]
   [9.61538462e-01 8.33333333e-01 5.00000000e-01]]

  [[9.98507463e-01 9.96501050e-01 9.99381571e-01]
   [9.90291262e-01 9.67741935e-01 9.68750000e-01]]

  [[9.98107852e-01 9.88246356e-01 9.98625115e-01]
   [9.90825688e-01 9.93333333e-01 9.90291262e-01]]

  [[9.97598463e-01 9.88726514e-01 9.95404412e-01]
   [9.84000000e-01 9.96

In [9]:
print("For max prob")
print("value {}".format(okf.values[1][3][1][2]))
print("Pulse : {}".format(pe.state_names['Pulse Rate(bpm)'][max_loc[0]]))
print("hour : {}".format(pe.state_names['hour'][max_loc[1]]))
print("orient_bin : {}".format(pe.state_names['orient_bin'][max_loc[2]]))

For max prob
value 0.9969325153374233
Pulse : 52.0
hour : 1
orient_bin : 1.0


In [10]:
print("For min prob")
print("value {}".format(okf.values[1][6][0][2]))
print("Pulse : {}".format(pe.state_names['Pulse Rate(bpm)'][min_loc[0]]))
print("hour : {}".format(pe.state_names['hour'][min_loc[1]]))
print("orient_bin : {}".format(pe.state_names['orient_bin'][min_loc[2]]))

For min prob
value 5.733221726617054e-06
Pulse : 55.0
hour : 0
orient_bin : 1.0


In [12]:
from pgmpy.estimators import ExhaustiveSearch
from pgmpy.estimators import BDeuScore, K2Score, BicScore

bic = BicScore(df)

es = ExhaustiveSearch(df, scoring_method=bic)
best_model = es.estimate()
print(best_model.edges())

print("\nAll DAGs by score:")
for score, dag in reversed(es.all_scores()):
    print(score, dag.edges())

[('ODI', 'hour'), ('ODI', 'orient_bin'), ('hour', 'Pulse Rate(bpm)'), ('orient_bin', 'hour')]

All DAGs by score:
-607075.6937359514 [('hour', 'ODI'), ('hour', 'Pulse Rate(bpm)'), ('orient_bin', 'ODI'), ('orient_bin', 'hour')]
-607075.6937359514 [('hour', 'orient_bin'), ('hour', 'ODI'), ('hour', 'Pulse Rate(bpm)'), ('orient_bin', 'ODI')]
-607075.6937359514 [('Pulse Rate(bpm)', 'hour'), ('hour', 'orient_bin'), ('hour', 'ODI'), ('orient_bin', 'ODI')]
-607075.6937359514 [('ODI', 'hour'), ('hour', 'Pulse Rate(bpm)'), ('orient_bin', 'ODI'), ('orient_bin', 'hour')]
-607075.6937359514 [('ODI', 'hour'), ('ODI', 'orient_bin'), ('hour', 'Pulse Rate(bpm)'), ('orient_bin', 'hour')]
-607075.6937359516 [('ODI', 'orient_bin'), ('hour', 'orient_bin'), ('hour', 'ODI'), ('hour', 'Pulse Rate(bpm)')]
-607075.6937359516 [('ODI', 'orient_bin'), ('Pulse Rate(bpm)', 'hour'), ('hour', 'orient_bin'), ('hour', 'ODI')]
-607075.6937359516 [('ODI', 'hour'), ('ODI', 'orient_bin'), ('hour', 'orient_bin'), ('hour', 'P

-632674.4317469234 [('ODI', 'Pulse Rate(bpm)'), ('orient_bin', 'ODI'), ('orient_bin', 'Pulse Rate(bpm)')]
-632674.4317469234 [('ODI', 'Pulse Rate(bpm)'), ('ODI', 'orient_bin'), ('orient_bin', 'Pulse Rate(bpm)')]
-632692.3402496314 [('Pulse Rate(bpm)', 'orient_bin')]
-632692.3402496316 [('orient_bin', 'Pulse Rate(bpm)')]
-632703.2098644257 [('ODI', 'orient_bin'), ('Pulse Rate(bpm)', 'orient_bin')]
-632773.6159295021 [('ODI', 'Pulse Rate(bpm)'), ('ODI', 'orient_bin'), ('Pulse Rate(bpm)', 'orient_bin'), ('hour', 'orient_bin')]
-632773.6159295022 [('ODI', 'orient_bin'), ('Pulse Rate(bpm)', 'orient_bin'), ('Pulse Rate(bpm)', 'ODI'), ('hour', 'orient_bin')]
-632802.3940470046 [('ODI', 'orient_bin'), ('Pulse Rate(bpm)', 'orient_bin'), ('hour', 'orient_bin')]
-632857.5540785142 [('ODI', 'Pulse Rate(bpm)'), ('hour', 'ODI'), ('orient_bin', 'Pulse Rate(bpm)')]
-632857.5540785142 [('ODI', 'Pulse Rate(bpm)'), ('ODI', 'hour'), ('orient_bin', 'Pulse Rate(bpm)')]
-632989.7885568164 [('ODI', 'Pulse Rat

In [15]:
# I want to find the highest probability combination, and compare that to the lowest
# probability combination

# Just get the matrix and identify the highest

#model.get_cpds(node='ODI')
model.maximize

AttributeError: 'BayesianModel' object has no attribute 'maximize'