## Explore Core Profile Data Elements

### Metadata

* *File name:* PatientCorePopulatedTable.txt
* *Title:*  Clinical Profile from 100-patient EMRbots Dataset
* *Source:* [EMRbots.org](http://www.emrbots.org)
* *Index column:* PatientID
* *Data description:* individual-level

### Script to Translate Patient Core Data into Clinical Profile
#### Core data features 
* PatientPopulationPercentageBelowPoverty
* PatientRace
* PatientLanguage
* PatientGender
* PatientMaritalStatus

In [13]:
# Script to tabulate core data with EMRbots PatientCorePopulatedTable.txt

import json
import pandas

def get_crosstab_bins(profile):
    """Return the list of bin labels/boundaries described by *profile*.

    Lookup order:

    1. ``profile["fields"]`` -- returned as-is (the same list object).
    2. ``profile["valueBins"]`` -- returned with ``profile["min"]`` prepended.
    3. ``profile["percentCutValues"]`` -- returned with ``profile["min"]``
       prepended.

    If none of these can be built (the key, or the accompanying ``"min"``,
    is missing) an empty list is returned.

    The cut lists stored in *profile* are never modified: a new list is
    built for cases 2 and 3, so calling this function repeatedly always
    yields the same result.
    """
    try:
        return profile["fields"]
    except KeyError:
        pass
    for key in ("valueBins", "percentCutValues"):
        try:
            cuts = profile[key]
            # Build a fresh list instead of insert()-ing into the stored one,
            # which would grow the profile's own data on every call.
            return [profile["min"]] + list(cuts)
        except KeyError:
            # Either the cut list or "min" is absent; try the next source.
            pass
    return []

def get_bin_number(profile, value):
    """Return the index of the bin in *profile* that *value* falls into.

    * If *profile* has ``"fields"`` (enumerated values), the index of the
      first entry whose ``str()`` equals ``str(value)`` is returned, or
      ``-1`` when there is no match.
    * Otherwise, if it has ``"valueBins"`` or ``"percentCutValues"``
      (checked in that order), *value* is converted with ``float()`` --
      falling back to ``0`` when that raises ``ValueError`` -- and the
      result is the number of leading cut points that are ``<=`` the value.
      Index ``0`` therefore means "below the first cut" and
      ``len(cuts)`` means "at or above the last cut".
    * If *profile* has none of these keys, ``-1`` is returned.
    """
    if "fields" in profile:
        # Compare string forms so 1 and "1" select the same category.
        for n, candidate in enumerate(profile["fields"]):
            if str(value) == str(candidate):
                return n
        return -1

    for key in ("valueBins", "percentCutValues"):
        if key not in profile:
            continue
        cuts = profile[key]
        # Both numeric branches coerce identically; previously the
        # percentCutValues branch compared the raw value, so a string
        # cell raised TypeError on Python 3.
        try:
            number = float(value)
        except ValueError:
            number = 0
        n = 0
        while n < len(cuts) and number >= float(cuts[n]):
            n = n + 1
        return n

    return -1

def get_first_crosstab_index(binlist):
    """Return the starting crosstab index: one ``0`` per entry of *binlist*."""
    return [0 for _ in binlist]

def get_next_crosstab_index(binlist, ind):
    """Advance the multi-dimensional index *ind* to the next cell, in place.

    *binlist* holds one bin list per dimension and *ind* one position per
    dimension.  The index is incremented like an odometer whose first
    position varies fastest: position ``n`` is bumped, and if it reaches
    ``len(binlist[n])`` it is reset to ``0`` and the carry moves on to
    position ``n + 1``.  After the last cell the index wraps around to all
    zeros.

    Returns the same list object that was passed in as *ind*.
    """
    for n, bins in enumerate(binlist):
        ind[n] = ind[n] + 1
        if ind[n] != len(bins):
            return ind
        # Overflow in this dimension: store the reset digit (the original
        # only reset a local copy, leaving an out-of-range index behind)
        # and carry into the next dimension.
        ind[n] = 0
    return ind

def get_crosstab_value(matrix, ind):
    """Look up the cell of nested-list *matrix* addressed by *ind*.

    Each entry of *ind* selects one nesting level.  ``0`` is returned when
    any index is negative or lies beyond the current level's length, and
    also when the element finally reached is an empty list.  Otherwise the
    element reached is returned unchanged.
    """
    node = matrix
    for position in ind:
        # Negative means "no bin"; too-large means the cell was never set.
        if position < 0 or len(node) <= position:
            return 0
        node = node[position]
    return 0 if node == [] else node

def set_crosstab_value(matrix, ind, value):
    """Store *value* in nested-list *matrix* at the cell addressed by *ind*.

    The matrix is grown on demand: missing intermediate levels are padded
    with empty lists and missing leaf cells with ``0``.  *matrix* is
    modified in place and also returned, so callers can write
    ``m = set_crosstab_value(m, ind, v)``.

    An empty *ind* means "replace the whole node": *value* itself is
    returned.

    A negative index at any level denotes "no bin" (the same convention
    under which ``get_crosstab_value`` reports 0); such a write is ignored
    and the matrix is returned unchanged, rather than overwriting the last
    cell or raising ``IndexError`` on an empty level.
    """
    if ind == []:
        return value
    head = ind[0]
    rest = ind[1:]
    if head < 0:
        return matrix
    while len(matrix) <= head:
        # Inner levels hold sub-lists; the innermost level holds counts.
        if rest:
            matrix.append([])
        else:
            matrix.append(0)
    matrix[head] = set_crosstab_value(matrix[head], rest, value)
    return matrix

def _to_int_or_zero(raw):
    """Coerce one raw cell to ``int``; cells that raise ValueError count as 0."""
    try:
        return int(raw)
    except ValueError:
        return 0


def _bin_counts(cuts, values):
    """Histogram *values* into the ``len(cuts) + 1`` bins delimited by *cuts*.

    A value lands in bin ``n`` where ``n`` is the number of leading cut
    points that are ``<=`` the value.
    """
    counts = [0] * (len(cuts) + 1)
    for v in values:
        n = 0
        while n < len(cuts) and v >= cuts[n]:
            n = n + 1
        counts[n] = counts[n] + 1
    return counts


def _percent_cut_values(ordered, percent_cuts):
    """Translate cumulative percentages into cut values over sorted data.

    *ordered* is the ascending list of values and *percent_cuts* the width
    of each percentile band (e.g. ``[10, 20, 20, 20, 20, 10]``).  For each
    cumulative percentage the cut is placed halfway between the value at
    that rank and the next *different* value, so equal values are never
    split across two bins.  Ranks past the end of the data use
    ``largest value + 1``.
    """
    size = len(ordered)
    last = size - 1
    cut_values = []
    cumulative = 0
    for band in percent_cuts:
        cumulative = cumulative + band
        pos = cumulative * size / 100.0
        lo = int(pos)
        # When the rank is fractional, look at the element just above it.
        hi = lo if lo == pos else lo + 1
        if lo >= size:
            v1 = ordered[last] + 1
        else:
            v1 = ordered[lo]
        if hi >= size:
            v2 = ordered[last] + 1
        else:
            v2 = ordered[hi]
            if v2 == v1:
                # Skip over a run of equal values to find the next larger one.
                while hi < last and ordered[hi] == v1:
                    hi = hi + 1
                    v2 = ordered[hi]
                if ordered[hi] == v1:
                    # The run extends to the end of the data.
                    v2 = v1 + 1
        cut_values.append(v1 + (v2 - v1) * 0.5)
    return cut_values


def _enum_profile(column, spec):
    """Build the ENUM profile entry (distinct values, counts, fractions)."""
    fields = []
    counts = {}
    total = 0
    for key in column.keys():
        total = total + 1
        label = str(column[key])
        if label not in counts:
            # Keep first-seen order of categories.
            fields.append(label)
            counts[label] = 0
        counts[label] = counts[label] + 1
    percent = {}
    for label in counts.keys():
        percent[label] = counts[label] * 1.0 / total
    entry = {"type": "ENUM", "fields": fields, "counts": counts, "percents": percent}
    if "META" in spec:
        entry["meta"] = spec["META"]
    return entry


def _int_profile(column, spec):
    """Build the INT profile entry (summary statistics plus optional bins)."""
    # Coerce once; every statistic below works on the same integer list.
    values = [_to_int_or_zero(column[key]) for key in column.keys()]
    count = float(len(values))
    mean = sum(values, 0.0) / count
    # Population standard deviation (divides by count, not count - 1).
    sd = (sum(((v - mean) * (v - mean) for v in values), 0.0) / count) ** 0.5
    # min/max are computed per column; they must not carry over from a
    # previously processed field.
    entry = {"type": "INT", "count": count, "mean": mean, "min": min(values),
             "max": max(values), "sd": sd}
    if "META" in spec:
        entry["meta"] = spec["META"]
    if "VALUE_CUT" in spec:
        entry["valueBins"] = spec["VALUE_CUT"]
        entry["valueBinCounts"] = _bin_counts(spec["VALUE_CUT"], values)
    if "PERCENT_CUT" in spec:
        cut_values = _percent_cut_values(sorted(values), spec["PERCENT_CUT"])
        entry["percentCuts"] = spec["PERCENT_CUT"]
        entry["percentCutValues"] = cut_values
        entry["percentCutCounts"] = _bin_counts(cut_values, values)
    return entry


def process_pandas_object(f, features):
    """Summarise the columns of *f* into a JSON-LD style clinical profile.

    Parameters
    ----------
    f
        Object exposing ``to_dict()`` that returns
        ``{column name: {row key: value}}`` (e.g. a pandas DataFrame).
    features
        Mapping of output field name to a spec dict.  Every spec has a
        ``"TYPE"`` of ``"ENUM"``, ``"INT"`` or ``"CROSSTAB"``.  ENUM and INT
        specs name their source column in ``"NAME"``; INT specs may carry
        ``"VALUE_CUT"`` (explicit bin boundaries) and/or ``"PERCENT_CUT"``
        (percentile band widths); CROSSTAB specs list previously defined
        field names in ``"CONSTITUENTS"``.  Any spec may carry ``"META"``,
        which is copied to the output entry as ``"meta"``.

    Returns
    -------
    dict
        One entry per feature plus ``"@context"``, ``"@id"`` and ``"@type"``.

    Raises
    ------
    KeyError
        If a spec lacks a required key or names a column not present in *f*.
    ZeroDivisionError
        If an INT column has no rows.

    Notes
    -----
    INT cells for which ``int()`` raises ``ValueError`` are counted as 0.
    CROSSTAB entries are built in a second pass so their constituents'
    profiles already exist, and rely on the module-level helpers
    ``get_bin_number``, ``get_crosstab_value`` and ``set_crosstab_value``.
    """
    d = f.to_dict()
    profile = {}
    context = { #"version":"1.0",
                "schema":"http://clinicalProfile.ncats.io/",
                "fields":{"@id":"schema:enumValues", "@container":"@list"},
                "counts":{"@id":"schema:binCountValue", "@container":"@list"},
                "valueBins":{"@id":"schema:valueBin", "@container":"@list"},
                "valueBinCounts":{"@id":"schema:valueBinCount", "@container":"@list"},
                "percentCutCounts":{"@id":"schema:percentCutCount", "@container":"@list"},
                "percentCutValues":{"@id":"schema:percentCutValue", "@container":"@list"},
                "percentCuts":{"@id":"schema:percentCut", "@container":"@list"},
                "constituents":{"@id":"schema:field", "@container":"@list"},
                "type":"schema:fieldType",
                "min":"schema:minValue",
                "max":"schema:maxValue",
                "count":"schema:countValue",
                "mean":"schema:meanValue",
                "sd":"schema:sdValue",
                "percents":{"@id":"schema:percent", "@container":"@list"},
              }

    # Pass 1: single-column summaries.
    for field in features.keys():
        spec = features[field]
        if spec["TYPE"] == "ENUM":
            context[field] = "schema:enumEntry"
            profile[field] = _enum_profile(d[spec["NAME"]], spec)
        if spec["TYPE"] == "INT":
            context[field] = "schema:intEntry"
            profile[field] = _int_profile(d[spec["NAME"]], spec)

    # Pass 2: cross-tabulations over the bins established in pass 1.
    for field in features.keys():
        spec = features[field]
        if spec["TYPE"] != "CROSSTAB":
            continue
        context[field] = "schema:crosstabEntry"
        constituents = spec["CONSTITUENTS"]
        columns = [d[features[c]["NAME"]] for c in constituents]
        counts = []
        for row in columns[0].keys():
            # One bin number per constituent addresses the cell to increment.
            ind = [get_bin_number(profile[name], column[row])
                   for name, column in zip(constituents, columns)]
            tally = get_crosstab_value(counts, ind) + 1
            counts = set_crosstab_value(counts, ind, tally)
        crosstab = {"type": "CROSSTAB", "tabulations": counts,
                    "constituents": constituents}
        if "META" in spec:
            crosstab["meta"] = spec["META"]
        profile[field] = crosstab

    profile["@context"] = context
    profile["@id"] = "http://clinicalProfile.ncats.io/cp"
    profile["@type"] = "http://clinicalProfile/cp_type"
    return profile

In [14]:
# Tabulate data

# Feature specifications: output field name -> how to summarise the column.
features = {
    "PatientPopulationPercentageBelowPoverty": {
        "TYPE": "INT",
        "NAME": "PatientPopulationPercentageBelowPoverty",
        "PERCENT_CUT": [10, 20, 20, 20, 20, 10],
    },
    "PatientRace": {"TYPE": "ENUM", "NAME": "PatientRace"},
    "PatientLanguage": {"TYPE": "ENUM", "NAME": "PatientLanguage"},
    "PatientGender": {"TYPE": "ENUM", "NAME": "PatientGender"},
    "PatientMaritalStatus": {
        "TYPE": "ENUM",
        "NAME": "PatientMaritalStatus",
        "META": {
            "name": "PatientCorePopulatedTable.txt",
            "title": "Clinical Profile from 100-patient EMRbots Dataset",
            "source": "EMRbots.org",
            "index": "PatientID",
            "description": "individual-level",
        },
    },
    # The original cell ended with a bare "MaritalxGen" key and no value,
    # which is a syntax error in a dict literal.  It is completed here as a
    # marital-status x gender cross-tabulation.
    # NOTE(review): constituents are inferred from the key name -- confirm.
    "MaritalxGen": {
        "TYPE": "CROSSTAB",
        "CONSTITUENTS": ["PatientMaritalStatus", "PatientGender"],
    },
}

# Path is relative to the notebook's working directory.
t = pandas.read_table("100-sample/PatientCorePopulatedTable.txt")

print(json.dumps(process_pandas_object(t, features)))


{"@type": "http://clinicalProfile/cp_type", "PatientPopulationPercentageBelowPoverty": {"type": "INT", "count": 100.0, "sd": 23.24339906295978, "max": 98, "percentCutValues": [10.5, 13.5, 15.5, 18.5, 82.5, 99.0], "mean": 21.62, "percentCuts": [10, 20, 20, 20, 20, 10], "percentCutCounts": [14, 19, 20, 26, 12, 9, 0], "min": 1}, "@context": {"percentCutCounts": {"@id": "schema:percentCutCount", "@container": "@list"}, "percents": {"@id": "schema:percent", "@container": "@list"}, "max": "schema:maxValue", "valueBins": {"@id": "schema:valueBin", "@container": "@list"}, "PatientPopulationPercentageBelowPoverty": "schema:intEntry", "mean": "schema:meanValue", "PatientRace": "schema:enumEntry", "fields": {"@id": "schema:enumValues", "@container": "@list"}, "PatientMaritalStatus": "schema:enumEntry", "PatientGender": "schema:enumEntry", "type": "schema:fieldType", "count": "schema:countValue", "valueBinCounts": {"@id": "schema:valueBinCount", "@container": "@list"}, "sd": "schema:sdValue", "min