## Explore Core Profile Data Elements

### Metadata

* *File name:* PatientCorePopulatedTable.txt
* *Title:*  Clinical Profile from 100-patient EMRbots Dataset
* *Source:* [EMRbots.org](http://www.emrbots.org)
* *Index column:* PatientID
* *Data description:* individual-level

### Script to Translate Patient Core Data into a Clinical Profile
#### Core data features 
* PatientPopulationPercentageBelowPoverty
* PatientRace
* PatientLanguage
* PatientGender
* PatientMaritalStatus

In [1]:
# Script to tabulate core data with EMRbots PatientCorePopulatedTable.txt

import json
import pandas

def _to_int(value):
    """Return ``int(value)``, or 0 when the conversion raises ValueError."""
    try:
        return int(value)
    except ValueError:
        # Unparseable cells are counted as 0 rather than dropped, so they
        # still contribute to count, mean, sd and the bin totals.
        return 0


def _bin_index(value, cuts):
    """Return the index of the first entry of *cuts* that *value* is below.

    *cuts* is scanned in the order given; ``len(cuts)`` is returned when
    *value* is >= every entry scanned.
    """
    n = 0
    while n < len(cuts) and value >= cuts[n]:
        n += 1
    return n


def _bin_counts(values, cuts):
    """Count how many *values* fall into each of the ``len(cuts) + 1`` bins."""
    bins = [0] * (len(cuts) + 1)
    for value in values:
        bins[_bin_index(value, cuts)] += 1
    return bins


def _percent_cut_values(ordered, percents):
    """Turn cumulative percentages into cut values over sorted data.

    For each running total of *percents*, the position ``total% * len`` is
    located in *ordered* (ascending) and the cut is placed halfway between
    the value at that position and the next distinct value. Positions past
    the end use ``largest value + 1``.
    """
    size = len(ordered)
    cut_values = []
    cumulative = 0
    for share in percents:
        cumulative += share
        pos = cumulative * size / 100.0
        lo = int(pos)
        hi = lo if lo == pos else lo + 1
        past_end = ordered[size - 1] + 1
        low_value = ordered[lo] if lo < size else past_end
        if hi >= size:
            high_value = past_end
        else:
            high_value = ordered[hi]
            if high_value == low_value:
                # Skip over a run of duplicates so the cut lands between two
                # different values instead of inside the run.
                while hi < size - 1 and ordered[hi] == low_value:
                    hi += 1
                    high_value = ordered[hi]
                if ordered[hi] == low_value:
                    # The run extends to the end of the data.
                    high_value = low_value + 1
        cut_values.append(low_value + (high_value - low_value) * 0.5)
    return cut_values


def _profile_enum(column, spec):
    """Summarise a categorical column: distinct values, counts, fractions."""
    fields = []  # distinct values in order of first appearance
    counts = {}
    for value in column.values():
        if value not in counts:
            fields.append(value)
            counts[value] = 0
        counts[value] += 1
    total = len(column)
    percents = {value: n * 1.0 / total for value, n in counts.items()}
    summary = {"type": "ENUM", "fields": fields, "counts": counts,
               "percents": percents}
    if "META" in spec:
        summary["meta"] = spec["META"]
    return summary


def _profile_int(column, spec):
    """Summarise an integer column: count, mean, min, max, sd, optional bins."""
    values = [_to_int(raw) for raw in column.values()]
    count = float(len(values))  # kept as float so the JSON output is unchanged
    mean = sum(values) / count
    squared_dev = 0.0
    for value in values:
        squared_dev = squared_dev + (value - mean) * (value - mean)
    sd = (squared_dev / count) ** 0.5  # population standard deviation
    # min/max are derived from this column only; they must not leak between
    # fields when several INT features are profiled in one call.
    summary = {"type": "INT", "count": count, "mean": mean,
               "min": min(values), "max": max(values), "sd": sd}
    if "META" in spec:
        summary["meta"] = spec["META"]
    if "VALUE_CUT" in spec:
        cuts = spec["VALUE_CUT"]
        summary["ValueBins"] = cuts
        summary["ValueBinCounts"] = _bin_counts(values, cuts)
    if "PERCENT_CUT" in spec:
        percents = spec["PERCENT_CUT"]
        cut_values = _percent_cut_values(sorted(values), percents)
        summary["percentCuts"] = percents
        summary["percentCutValues"] = cut_values
        summary["percentCutCounts"] = _bin_counts(values, cut_values)
    return summary


def process_pandas_object(f, features):
    """Build a per-feature summary profile from a tabular object.

    Args:
        f: object whose ``to_dict()`` returns a mapping of column name to a
            mapping of row label to cell value.
        features: mapping of output key to a spec dict. Each spec needs
            ``"TYPE"`` (``"ENUM"`` or ``"INT"``) and ``"NAME"`` (the source
            column). Optional keys: ``"META"`` (copied to ``"meta"``), and
            for INT specs ``"VALUE_CUT"`` (explicit bin edges) and
            ``"PERCENT_CUT"`` (percentages accumulated into percentile cuts).

    Returns:
        dict mapping each ENUM/INT feature key to its summary dict. Specs
        with any other ``"TYPE"`` are skipped.

    Raises:
        KeyError: if a spec lacks ``"TYPE"``/``"NAME"`` or names a column
            that is not present.
        ZeroDivisionError: if an INT column has no rows.
    """
    columns = f.to_dict()
    profile = {}
    for field, spec in features.items():
        kind = spec["TYPE"]
        if kind == "ENUM":
            profile[field] = _profile_enum(columns[spec["NAME"]], spec)
        elif kind == "INT":
            profile[field] = _profile_int(columns[spec["NAME"]], spec)
    return profile

In [5]:
# Build the clinical profile for the 100-patient sample and print it as JSON.

# Each entry maps an output key to its source column ("NAME") and how the
# column is summarised ("TYPE"), plus optional extras ("PERCENT_CUT", "META").
features = {
    "PatientPopulationPercentageBelowPoverty": {
        "TYPE": "INT",
        "NAME": "PatientPopulationPercentageBelowPoverty",
        "PERCENT_CUT": [10, 20, 20, 20, 20, 10],
    },
    "PatientRace": {
        "TYPE": "ENUM",
        "NAME": "PatientRace",
    },
    "PatientLanguage": {
        "TYPE": "ENUM",
        "NAME": "PatientLanguage",
    },
    "PatientGender": {
        "TYPE": "ENUM",
        "NAME": "PatientGender",
    },
    "PatientMaritalStatus": {
        "TYPE": "ENUM",
        "NAME": "PatientMaritalStatus",
        "META": {
            "name": "PatientCorePopulatedTable.txt",
            "title": "Clinical Profile from 100-patient EMRbots Dataset",
            "source": "EMRbots.org",
            "index": "PatientID",
            "description": "individual-level",
        },
    },
}

# Load the patient core table from the sample directory.
t = pandas.read_table("100-sample/PatientCorePopulatedTable.txt")

# Emit the whole profile as a single JSON document.
print(json.dumps(process_pandas_object(t, features)))


{"PatientPopulationPercentageBelowPoverty": {"type": "INT", "count": 100.0, "mean": 21.62, "min": 1, "max": 98, "sd": 23.24339906295978, "percentCuts": [10, 20, 20, 20, 20, 10], "percentCutValues": [10.5, 13.5, 15.5, 18.5, 82.5, 99.0], "percentCutCounts": [14, 19, 20, 26, 12, 9, 0]}, "PatientRace": {"type": "ENUM", "fields": ["Unknown", "African American", "Asian", "White"], "counts": {"Unknown": 13, "African American": 15, "Asian": 23, "White": 49}, "percents": {"Unknown": 0.13, "African American": 0.15, "Asian": 0.23, "White": 0.49}}, "PatientLanguage": {"type": "ENUM", "fields": ["Icelandic", "English", "Spanish", "Unknown"], "counts": {"Icelandic": 12, "English": 64, "Spanish": 18, "Unknown": 6}, "percents": {"Icelandic": 0.12, "English": 0.64, "Spanish": 0.18, "Unknown": 0.06}}, "PatientGender": {"type": "ENUM", "fields": ["Male", "Female"], "counts": {"Male": 48, "Female": 52}, "percents": {"Male": 0.48, "Female": 0.52}}, "PatientMaritalStatus": {"type": "ENUM", "fields": ["Marri