In [None]:
%config IPCompleter.greedy=True

In [None]:
import numpy as np
import pandas as pd
from scipy import stats, optimize
import csv
import matplotlib.pyplot as plt
import pymare
import itertools
from pprint import pprint
import pickle

In [None]:
class Curve:
    """Continuous effect curve of one feature, with an optional Legendre fit.

    Stores the raw (x, y) points of the curve together with the feature's
    gain, cover, frequency and rank values.  Typical use is
    ``normalizeCurve`` -> ``fitLegendre`` -> ``getFit`` / ``plotCurve``.
    """

    def __init__(self, inputFeature, inputGain, inputCover, inputFrequency, inputRank, inputX, inputY):
        """Store the feature name, its summary values and the raw points.

        inputX / inputY are used with numpy reductions and element-wise
        arithmetic, so they are presumably numpy arrays or pandas Series
        of equal length -- TODO confirm against callers.
        """
        self.feature = inputFeature
        self.gain = inputGain
        self.cover = inputCover
        self.frequency = inputFrequency
        self.rank = inputRank
        self.x = inputX
        self.y = inputY

    def getMinX(self):
        """Return the smallest raw x value."""
        return np.min(self.x)

    def getMaxX(self):
        """Return the largest raw x value."""
        return np.max(self.x)

    def normalizeCurve(self, minJ, maxJ):
        """Linearly map x from [minJ, maxJ] onto [-1, 1].

        The result is stored in ``self.norX``; the bounds are remembered in
        ``self.minX`` / ``self.maxX`` so ``plotCurve`` can map back.
        Note that ``maxJ == minJ`` divides by zero and yields non-finite
        values.
        """
        self.norX = (self.x-minJ)/(maxJ-minJ)*2-1
        self.minX = minJ
        self.maxX = maxJ

    def fitLegendre(self, deg):
        """Least-squares fit of a degree-``deg`` Legendre series to (norX, y).

        Requires ``normalizeCurve`` to have been called first.  Because of
        ``full=True`` the stored ``self.fit`` is ``[series, diagnostics]``.
        """
        self.fit = np.polynomial.legendre.Legendre.fit(self.norX, self.y, deg, domain = [-1,1], full=True)

    def getFit(self):
        """Return the fitted Legendre series (first element of ``self.fit``)."""
        return self.fit[0]

    def plotCurve(self, show=True):
        """Plot the raw points and the fitted series on pyplot figure 1.

        Parameters
        ----------
        show : bool
            When true, call ``plt.show()`` before returning.

        Returns
        -------
        The ``matplotlib.pyplot`` module (also stored in ``self.plt``).
        """
        fx, fy = self.fit[0].linspace()
        # The series lives on [-1, 1]; map the sample grid back to raw x units.
        fx = (fx+1)/2*(self.maxX-self.minX)+self.minX
        plt.figure(1)
        plt.title("Fitted Legendre Curve", fontsize='16')
        plt.xlabel("X", fontsize='13')
        plt.ylabel("log(oddRatio)", fontsize='13')

        # Label each artist explicitly: passing a bare label list to
        # legend() pairs labels with artists by internal ordering, which can
        # attach 'raw' to the fitted line and 'fitted' to the scatter.
        plt.scatter(self.x, self.y, label='raw')
        plt.plot(fx, fy, label='fitted')

        plt.legend(loc='best')
        plt.grid()
        plt.xlim(np.min(self.x), np.max(self.x))
        plt.ylim(np.min(self.y), np.max(self.y))
        if show:
            plt.show()
        self.plt = plt
        return plt
        

In [None]:
class DiscreteCurve:
    """Effect values of a feature that has too few x points to fit a curve.

    Plain record: keeps the feature name, its gain / cover / frequency /
    rank values and the raw x and y values exactly as given.
    """

    def __init__(self, inputFeature, inputGain, inputCover, inputFrequency, inputRank, inputX, inputY):
        """Store every argument on the instance without modification."""
        # Raw points
        self.x, self.y = inputX, inputY
        # Feature identity and summary values
        self.feature, self.rank = inputFeature, inputRank
        self.gain, self.cover, self.frequency = inputGain, inputCover, inputFrequency

    def getY(self):
        """Return the stored y values (the same object that was passed in)."""
        return self.y

In [None]:
class Site:
    """All feature curves of one (site, prediction task, feature-set type).

    Continuous features are kept in ``self.curve`` and features with at
    most two x points in ``self.discreteCurve``; both are keyed by feature
    name.
    """

    def __init__(self, inputSite, inputPredTask, inputFsType):
        """Create an empty container for the given site / task / fs type."""
        self.curve = dict()
        self.discreteCurve = dict()
        self.site = inputSite
        self.pred_task = inputPredTask
        self.fs_type = inputFsType

    def addCurve(self, inputFeature, inputGain, inputCover, inputFrequency, inputRank, inputX, inputY):
        """Register a feature as a Curve (more than 2 points) or a DiscreteCurve."""
        if len(inputX) > 2:
            self.curve[inputFeature] = Curve(inputFeature, inputGain, inputCover, inputFrequency, inputRank, inputX, inputY)
        else:
            self.discreteCurve[inputFeature] = DiscreteCurve(inputFeature, inputGain, inputCover, inputFrequency, inputRank, inputX, inputY)

    def getCurve(self):
        """Return the dict of continuous curves keyed by feature name."""
        return self.curve

    def getMetaCood(self, topFeatures):
        """Concatenate the per-feature coordinates into one 1-D array.

        For each feature in ``topFeatures`` (in order) this appends the
        fitted Legendre coefficients if the feature is continuous, or its
        raw y values if it is discrete.

        Raises
        ------
        KeyError
            If a feature is known neither as a continuous nor as a
            discrete curve.
        """
        cood = list()
        for f in topFeatures:
            if f in self.curve:
                cood.extend(self.curve[f].getFit().coef)
            elif f in self.discreteCurve:
                # Discrete features live in their own dict; looking them up
                # in self.curve could never succeed.
                cood.extend(self.discreteCurve[f].getY())
            else:
                raise KeyError(f)
        return np.array(cood)

    def getTopCurve(self, inputRank):
        """Return the names of continuous curves whose rank is <= inputRank."""
        return [key for key, value in self.curve.items() if value.rank <= inputRank]

In [None]:
class Task:
    """One meta-analysis task: (pred_task, fs_type, grp, overall_meas).

    Collects a value and a variance entry per site, and optionally the
    result object attached later via ``addResult``.
    """

    def __init__(self, inputPredTask, inputFsType, inputGrp, inputOverallMeasure):
        """Remember the task identifiers and start with empty per-site dicts."""
        # Per-site inputs, keyed by site name
        self.value, self.var = {}, {}
        # Task identifiers
        self.pred_task, self.fs_type = inputPredTask, inputFsType
        self.grp, self.overall_meas = inputGrp, inputOverallMeasure

    def addResult(self, inputResult):
        """Attach the result object for this task."""
        self.result = inputResult

    def addValue(self, site, value, var):
        """Record ``value`` and ``var`` for ``site`` (overwrites earlier entries)."""
        self.var[site] = var
        self.value[site] = value

In [None]:
# Define necessary parameters

# Containers filled by later cells:
#   data     -> Site objects keyed by (site, pred_task, fs_type)
#   taskList -> Task objects keyed by (pred_task, fs_type, grp, overall_meas)
data = {}
taskList = {}

# Axes of the experiment grid
siteList = [
    'KUMC2018',
    'MCRI2018',
    'MCW2018',
    'MU2018',
    'UNMC2018',
    'UTSW2018',
]
pred_task = ['stg01', 'stg02up', 'stg12up']
fs_type = ['full', 'rm']
grp = ['Overall']
overall_meas = [
    'opt_npv5',
    'opt_ppv',
    'opt_sens',
    'opt_spec',
    'prauc1',
    'roauc',
]

# Root directory holding one sub-directory of CSV files per site
basepath = './data/'

# Tunables
topCurveNum = 5   # rank threshold passed to Site.getTopCurve
deg = 5           # degree of the Legendre fit
crit = 0.0        # not referenced by the cells visible in this notebook

In [None]:
# Read data from disk if exist
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# keep this cache only if "tempdata.p" is always written by this notebook.
# The cache is not invalidated when siteList / pred_task / fs_type change --
# delete the file by hand after editing them.
try:
    # `with` closes the handle even if unpickling fails.
    with open("tempdata.p", "rb") as cacheFile:
        data = pickle.load(cacheFile)
except IOError:
    # Read curve from csv file
    for datakey in itertools.product(*[siteList, pred_task, fs_type]):
        filepath = basepath+datakey[0]+"/1d_"+datakey[2]+'_'+datakey[1]+".csv"
        siteData = Site(datakey[0], datakey[1], datakey[2])
        tempData = pd.read_csv(filepath)
        featureList = pd.unique(tempData['Feature'])
        for fea in featureList:
            tempData2 = tempData[tempData['Feature'] == fea]
            # Gain / Cover / Frequency / rank are read from the feature's
            # first row only; val / eff_mean supply the curve points.
            siteData.addCurve(fea, tempData2['Gain'].iloc[0], tempData2['Cover'].iloc[0], tempData2['Frequency'].iloc[0], tempData2['rank'].iloc[0], tempData2['val'], tempData2['eff_mean'])
        data[datakey] = siteData
    # Save data to temp file for reuse
    with open("tempdata.p", "wb") as cacheFile:
        pickle.dump(data, cacheFile)

In [None]:
# Create task list and top Feature List
# One Task per (pred_task, fs_type, grp, overall_meas) combination.
taskList.update(
    (taskKey, Task(*taskKey))
    for taskKey in itertools.product(pred_task, fs_type, grp, overall_meas)
)

# Fill every task with the per-site summary rows of the matching combination.
for s in siteList:
    sitePerf = pd.read_csv(basepath + s + "/site_perfsumm_1d.csv")
    for p, f, g, o in itertools.product(pred_task, fs_type, grp, overall_meas):
        rowMask = (
            (sitePerf['pred_task'] == p)
            & (sitePerf['fs_type'] == f)
            & (sitePerf['grp'] == g)
            & (sitePerf['overall_meas'] == o)
        )
        matched = sitePerf[rowMask]
        pprint(matched['meas_mean'])
        # The stored objects are pandas Series (the matching rows), not scalars.
        # NOTE(review): the 'meas_sd' column is stored as Task.var and later
        # passed as `v` to pymare.meta_regression -- confirm whether it
        # should be squared first.
        taskList[(p, f, g, o)].addValue(s, matched['meas_mean'], matched['meas_sd'])

In [None]:
#Select only top N features from all site and consolidate into one list
topFeatures = dict()
topFeaturesSite = dict()
for datakey in itertools.product(pred_task, fs_type):
    # Pool the top-ranked continuous features of every site for this
    # (pred_task, fs_type), then de-duplicate (np.unique also sorts).
    pooled = [
        fea
        for s in siteList
        for fea in data[(s,) + datakey].getTopCurve(topCurveNum)
    ]
    topFeatures[datakey] = np.unique(np.array(pooled))

In [None]:
# Check if all top features exists in all site (if not remove feature from consideration)
# May use imputation technique in future
missingfea = dict()
for datakey in itertools.product(pred_task, fs_type):
    # feature -> list of sites that have no continuous curve for it
    absentAt = dict()
    for tf in topFeatures[datakey]:
        sitesWithout = [
            s for s in siteList
            if tf not in data[(s,) + datakey].getCurve()
        ]
        if sitesWithout:
            absentAt[tf] = sitesWithout
    missingfea[datakey] = absentAt
    # Keep only the features every site provides.
    topFeatures[datakey] = [x for x in topFeatures[datakey] if x not in absentAt]

In [None]:
# Normalize curve domain to [-1,1]
for datakey in itertools.product(pred_task, fs_type):
    # Sites that belong to this (pred_task, fs_type) combination.
    sameTask = [siteData for k, siteData in data.items() if (k[1], k[2]) == datakey]
    for tf in topFeatures[datakey]:
        curves = [siteData.getCurve()[tf] for siteData in sameTask]
        # Shared x range across sites, so every site's curve of this
        # feature is rescaled with the same bounds.  The +/-inf seeds keep
        # the result defined when no site matches.
        minJ = min([float('inf')] + [c.getMinX() for c in curves])
        maxJ = max([float('-inf')] + [c.getMaxX() for c in curves])
        for c in curves:
            c.normalizeCurve(minJ, maxJ)

In [None]:
# Fit Legendre
# data keys are (site, pred_task, fs_type); k[1:] selects the matching
# topFeatures entry.
for k, siteData in data.items():
    siteCurves = siteData.getCurve()
    for tf in topFeatures[k[1:]]:
        siteCurves[tf].fitLegendre(deg)

In [None]:
# Display fitting
# Prints the Legendre coefficients of every retained feature, site by site.
# To inspect a fit visually, call plotCurve(show=True) on the same curve.
for k in itertools.product(siteList, pred_task, fs_type):
    for tf in topFeatures[k[1:]]:
        fitted = data[k].getCurve()[tf].getFit()
        print(fitted.coef)

In [None]:
# Meta Regression
# For every task: regress the per-site values on the per-site curve
# coordinates (Legendre coefficients of the retained features).
for k, t in taskList.items():
    featureKey = (t.pred_task, t.fs_type)
    y = np.array([t.value[s] for s in siteList])
    # NOTE(review): t.var holds the 'meas_sd' column (see the task-list
    # cell); confirm it is really a variance before trusting the weights.
    v = np.array([t.var[s] for s in siteList])
    # One row per site.  Stacking the whole list at once always yields a
    # 2-D design matrix, including when siteList has a single entry.
    X = np.vstack([
        data[(s,) + featureKey].getMetaCood(topFeatures[featureKey])
        for s in siteList
    ])

    t.addResult(pymare.meta_regression(y, v, X, add_intercept=True, method='REML'))

In [None]:
# Show the regression table of the last task in taskList.  The task is looked
# up explicitly rather than through a loop variable left over from another
# cell, so this cell does not depend on hidden kernel state.
print(list(taskList.values())[-1].result.to_df())

In [None]:
# Debug aid: print the fitted coefficients of feature '2160-0' for every
# task and site.
# NOTE(review): the first entry of siteList is skipped -- confirm that this
# is intended.
for k, t in taskList.items():
    for site in siteList[1:]:
        siteCurves = data[(site, t.pred_task, t.fs_type)].getCurve()
        pprint(siteCurves['2160-0'].getFit().coef)