# In [2]:
import pandas as ps
import globalConstants as gConst
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from scipy import stats

# In [1]:
class MTurkAnalyzer:
    def __init__(self,timeCutoff):
        """Create an analyzer.

        timeCutoff -- threshold compared against the 'mt_elapsed' column when
                      the time flag is assigned (see addTimeFlag).
        """
        # Outcome label names are taken from the shared constants module.
        self.OUTCOME_LABELS = gConst.OUTCOME_LABELS
        self.timeCutoff = timeCutoff
        
    def setRecords(self,rawRecords):
        self.allRecords = rawRecords
        
    def setRecordsByCSV(self,csvFilepath):
        self.allRecords = ps.read_csv(csvFilepath)
        
    def saveRecordsToCSV(self,csvFilepath):
        self.allRecords.to_csv(csvFilepath,index=False)
        
    def calcTSByState(self,mtRecords,label,stateName):
        stateSubset = mtRecords[mtRecords['locale']==stateName]
        stateSubset = stateSubset.sort_values(by=['mt_accept'])
        
    def getBinaryVal(self,inputVal):
        if(inputVal < 50):
            return 0
        return 1
    
    def calcProbRepeatFail(self,nRepeat):
        return pow(0.5,nRepeat)
    
    def calcQAStats(self):
        numFailQA = self.allRecords['QA_flag'].sum()
        numFailedWorker = self.allRecords['worker_flag'].sum()
        numFailTime = self.allRecords['time_flag'].sum()
        numRepeatHits = self.cleanRecords['mt_work_id'].value_counts().value_counts()
    
    def getOrdVal(self,inputVal):
        if(inputVal < 16):
            return 0
        if(inputVal < 33):
            return 1
        if(inputVal < 50):
            return 2
        if(inputVal ==50):
            return 3
        if(inputVal < 67):
            return 4
        if(inputVal < 84):
            return 5
        return 6
    
    def calcAbsMeasuresOneLabel(self,queryLabel):
        queryString = "abs_cont_outcome_" + queryLabel
        abs_ord = []
        abs_bin = []
        contVal = self.allRecords[queryString]
        for curVal in contVal:
            abs_bin.append(self.getBinaryVal(curVal))
            abs_ord.append(self.getOrdVal(curVal))
        self.allRecords['binary_outcome_' + queryLabel] = abs_bin
        self.allRecords['abs_ord_outcome_' + queryLabel] = abs_ord
    
    def calcAbsMeasuresAllLabels(self):
        for queryLabel in self.OUTCOME_LABELS:
            self.calcAbsMeasuresOneLabel(queryLabel)
    
    def calcQAFlagOneRecord(self,vote,leftCarCat):
        if(vote <50 and leftCarCat == True): return 0
        if(vote > 50 and leftCarCat == False): return 0
        return 1
    
    def calcQAFlagAllRecords(self):
        QA_flag = []
        carVote = list(self.allRecords['abs_cont_outcome_cars'])
        carCat = list(self.allRecords['car_cat'])
        for index in range(len(carVote)):
            curVote = carVote[index]
            curCat = carCat[index]
            QA_flag.append(self.calcQAFlagOneRecord(curVote,curCat))
        self.allRecords['QA_flag'] = QA_flag
    
    def getSingleTVIntensity(self,index,flag,windowSize=500):
        startIndex = max(0,index-windowSize)
        endIndex = min(len(self.allRecords['mt_accept']),index+windowSize)
        intensityToAdjust = self.getIntensity(self.allRecords['abs_cont_outcome_cars'][index])
        dataWindow = self.allRecords.iloc[startIndex:endIndex]
        validCarVotes = (dataWindow[dataWindow[flag]==0])['abs_cont_outcome_cars']
        windowIntensity = []
        for vote in validCarVotes:
            windowIntensity.append(self.getIntensity(vote))
        minIntensity = min(windowIntensity)
        maxIntensity = max(windowIntensity)
        adjustedIntensity = (intensityToAdjust - minIntensity)/(maxIntensity-minIntensity)
        normAdjust = min(max(0.5,1.5 - adjustedIntensity),1.5)
        return([adjustedIntensity,normAdjust])
    
    def calcMiscLabelIntensities(self):
        dF = self.allRecords
        newDF = ps.DataFrame()
        newLabel = "abs_cont_outcome_" + self.OUTCOME_LABELS[0]
        newDF['sumIntensities'] = np.maximum(dF[newLabel]-50,50 - dF[newLabel])
        for label in self.OUTCOME_LABELS[1:]:
            newLabel = "abs_cont_outcome_" + label
            tempVals = np.maximum(dF[newLabel]-50,50 - dF[newLabel])
            newDF['sumIntensities'] = newDF['sumIntensities'] + tempVals
        newDF['sumIntensities'] = newDF['sumIntensities']/(len(self.OUTCOME_LABELS)*50)
        newDF['sumIntensities'] = 1.5 - newDF['sumIntensities']
        return(newDF)
    
    
    def calcMiscLabelIntensitiesTskillAdjust(self):
        dF = self.allRecords
        newDF = ps.DataFrame()
        newLabel = "abs_cont_outcome_" + self.OUTCOME_LABELS[0]
        newDF['sumIntensities'] = np.maximum(dF[newLabel]-50,50 - dF[newLabel])
        for label in self.OUTCOME_LABELS[1:]:
            newLabel = "abs_cont_outcome_" + label
            tempVals = np.maximum(dF[newLabel]-50,50 - dF[newLabel])
            newDF['sumIntensities'] = newDF['sumIntensities'] + tempVals
        newDF['sumIntensities'] = newDF['sumIntensities']/(len(self.OUTCOME_LABELS)*50)
        newDF['sumIntensities'] = 1.5 - newDF['sumIntensities']
        return(newDF)
    
    def getNumRecordsByState(self):
        """Bar-plot and save the number of clean records per 'mt_locale'.

        Rows whose 'mt_locale' is 'Other' are left out. The counts are drawn
        as a bar chart and written to gConst.NUM_RECORDS_BY_STATE_CSV.
        """
        plt.rcParams['font.size'] = 16.0
        plt.rcParams['figure.figsize'] = (20,15)
        isNamedLocale = self.cleanRecords['mt_locale']!='Other'
        localeColumn = self.cleanRecords[isNamedLocale]['mt_locale']
        localeColumn.value_counts().plot(kind='bar')
        recordsPerLocale = localeColumn.value_counts()
        recordsPerLocale.to_csv(gConst.NUM_RECORDS_BY_STATE_CSV)
    
    def getTVIntensities(self,flag,windowSize=500):
        normIntensities = []
        normAdjust = []
        for index in range(len(df['mt_accept'])):
            tempIntensity,tempAdjust = self.getSingleTVIntensity(index,df,flag,windowSize)
            normIntensities.append(tempIntensity)
            normAdjust.append(tempAdjust)
        self.allRecords['normCar'] = normIntensities
        self.allRecords['normAdjust'] = normAdjust
    
    def calcPVal(self,samp1,samp2):
        N = len(samp1)
        var_1 = samp1.var(ddof=1)
        var_2 = samp2.var(ddof=1)
        s = np.sqrt((var_1 + var_2 )/2)
        t = (samp1.mean() - samp2.mean())/(s*np.sqrt(2/N))
        df = 2*N - 2
        p = 1 - stats.t.cdf(t,df=df)
        return(p)
    
    def getAdjustFig(self,saveFile=False):
        """Show three side-by-side histograms taken from cleanRecords.

        Panels, left to right: 'normAdjust', 'abs_cont_outcome_safe' and
        'rel_cont_outcome_safe'. When saveFile is true the figure is saved to
        gConst.NORM_ADJUST_FIG before being shown.
        """
        fig,axs = plt.subplots(1,3,figsize=(20,5),dpi=100)
        fig.tight_layout(pad=5.0)
        plt.rcParams.update({'font.size': 14})
        xLabels = [r'normality adjustment',r'absolute safe vote',r'relative safe vote']
        histCat = ['normAdjust','abs_cont_outcome_safe','rel_cont_outcome_safe']
        for panelIndex, panelLabel in enumerate(xLabels):
            panel = axs[panelIndex]
            panel.hist(self.cleanRecords[histCat[panelIndex]], bins='auto',color='green')
            panel.set_xlabel(panelLabel,fontsize=14)
            panel.set_ylabel(r'count',fontsize=14)
            # Fixed y-range so the three panels are directly comparable.
            panel.set_ylim(0,2200)
        if saveFile:
            plt.savefig(gConst.NORM_ADJUST_FIG)
        plt.show()
        
    def createHistograms(self,saveFile=False):
        """Draw a 4x2 grid of vote histograms from cleanRecords.

        For each label in gConst.OUTCOME_LABELS the absolute vote column is
        drawn in the left cell of a row and the relative vote column in the
        right cell. When saveFile is true the figure is written to
        gConst.VOTE_HISTOGRAM_FIG.
        """
        categories = [
            prefix + outcome
            for outcome in gConst.OUTCOME_LABELS
            for prefix in ('abs_cont_outcome_', 'rel_cont_outcome_')
        ]
        curData = self.cleanRecords
        fig, axs = plt.subplots(4, 2,figsize=(15, 20), dpi=100)
        fig.tight_layout(pad=5.0)
        plt.rcParams.update({'font.size': 14})

        for curIndex, curCat in enumerate(categories):
            # Fill the grid row by row, two histograms per row.
            rowIndex, colIndex = divmod(curIndex, 2)
            curAxs = axs[rowIndex,colIndex]
            curAxs.hist(curData[curCat], bins=20,color="green")
            curAxs.title.set_text(curCat)
            curAxs.title.set_fontsize(16)
            curAxs.set_ylim(0,2200)
            curAxs.set_xlabel(r'vote',fontsize=14)
            curAxs.set_ylabel(r'count',fontsize=14)

        if saveFile:
            fig.savefig(gConst.VOTE_HISTOGRAM_FIG)
    
    def stratifyEarlyLateByState(self,minRecordNum=30):
        curData = self.cleanRecords
        curData = curData.sort_values(by=['mt_submit'])
        numSoFar = curData['mt_locale'].value_counts()
        q1 = curData[curData['mt_locale']=='Other']
        q2 = curData[curData['mt_locale']=='Other']
        q3 = curData[curData['mt_locale']=='Other']
        q4 = curData[curData['mt_locale']=='Other']
        bin_labels = ['Q1', 'Q2', 'Q3', 'Q4']
        for state in numSoFar.keys():
            if(numSoFar[state]) >=minRecordNum:
                stateSubset = curData[curData['mt_locale']==state]
                numRecordsInQuartile = int(numSoFar[state]/4)
                q1 = q1.append(stateSubset.iloc[0:numRecordsInQuartile])
                q2 = q2.append(stateSubset.iloc[numRecordsInQuartile:numRecordsInQuartile*2])
                q3 = q3.append(stateSubset.iloc[numRecordsInQuartile*2:numRecordsInQuartile*3])
                q4 = q4.append(stateSubset.iloc[numRecordsInQuartile*3:numRecordsInQuartile*4])
        q1=q1[q1['mt_locale']!='Other']
        q2=q2[q2['mt_locale']!='Other'] 
        q3=q3[q3['mt_locale']!='Other']
        q4=q4[q4['mt_locale']!='Other'] 
        return([q1,q2,q3,q4])         
    
    def getStaticIntensities(self,flag):
        validData = self.allRecords[self.allRecords[flag]==0]
        validCarVotes = validData['abs_cont_outcome_cars']
        validIntensities = list(map(self.getIntensity,validCarVotes))
        minIntensity = min(validIntensities)
        maxIntensity = max(validIntensities)
        normIntensities = []
        for vote in self.allRecords['abs_cont_outcome_cars']:
            curNormIntensity = (self.getIntensity(vote)-minIntensity)/(maxIntensity-minIntensity)
            normIntensities.append(curNormIntensity)
        otherLabelIntensities = self.calcMiscLabelIntensities()
        normAdjust = []
        for normIndex in range(len(normIntensities)):
            carNormContrib = min(max(0.5,1.5 - normIntensities[normIndex]),1.5)
            otherVoteContrib = float(min(max(0.5,otherLabelIntensities['sumIntensities'].iloc[normIndex]),1.5))
            #normAdjust.append(max(carNormContrib,(carNormContrib*0.5+otherVoteContrib*0.5)))
            #normAdjust.append(max(carNormContrib,otherVoteContrib))
            normAdjust.append(otherVoteContrib)
        self.allRecords['normCar'] = normIntensities
        self.allRecords['normAdjust'] = normAdjust
    
    def calcCarNorms(self,flag='n_flags',timeVarying=False,windowSize=500,timeSort='mt_accept'):
        self.allRecords = self.allRecords.sort_values(by=[timeSort])
        if(timeVarying):
            self.getTVIntensities(flag,windowSize)
        else:
            self.getStaticIntensities(flag)
    
    def getIntensity(self,inputVal):
        return(abs(inputVal-50))
    
    def getNumRecords(self,allRecords=False):
        numRecords = len(self.allRecords['normCar']) if allRecords else len(self.cleanRecords['normCar'])
        return numRecords
    
    def getRecordAtIndex(self,index,allRecords=False):
        recordToReturn = self.allRecords.iloc[index] if allRecords else self.cleanRecords.iloc[index]
        return(recordToReturn)
    
    def convertIntensityToScale(self,curVal,adjIntensity):
        returnVal = max(0,50 - adjIntensity) if curVal < 50 else min(50+adjIntensity,100)
        return(returnVal)
    
    def getRelCont(self,curVal,normAdjust):
        absIntensity = self.getIntensity(curVal)
        adjIntensity = absIntensity*normAdjust
        return(self.convertIntensityToScale(curVal,adjIntensity))
    
    def calcRelMeasureOneLabel(self,queryLabel):
        queryString = "abs_cont_outcome_" + queryLabel
        adjKey = 'normAdjust2' if 'normAdjust2' in list(self.allRecords.keys()) else 'normAdjust'
        normAdj = self.allRecords[adjKey]
        absMeasure = self.allRecords[queryString]
        rel_cont = []
        rel_ord = []
        for index in range(len(list(absMeasure))):
            curVal = absMeasure[index]
            normVal = normAdj[index]
            rel_cont_cur = self.getRelCont(curVal,normVal)
            rel_cont.append(rel_cont_cur)
            rel_ord.append(self.getOrdVal(rel_cont_cur))
        self.allRecords['rel_cont_outcome_' + queryLabel] = rel_cont
        self.allRecords['rel_ord_outcome_' + queryLabel] = rel_ord
    
    def calcRelMeasuresAllLabels(self):
        for queryLabel in self.OUTCOME_LABELS:
            self.calcRelMeasureOneLabel(queryLabel)
            
    def calcProbRepeatFail(self,nRepeat):
        return pow(0.5,int(nRepeat))
    
    def estUncaughtErr(self,inData,percentFail,returnDict=False):
        #percentFail = inData['QA_flag'].mean()*2
        repeatCounts = inData['mt_work_id'].value_counts().value_counts()
        errStatsDict = ps.DataFrame(repeatCounts)
        errStatsDict.columns = ['n']
        errStatsDict['repeat'] = list(repeatCounts.keys())
        errStatsDict['probFail'] = list(map(self.calcProbRepeatFail,list(errStatsDict['repeat'])))
        errStatsDict['probContrib'] = errStatsDict['probFail']*errStatsDict['n']*errStatsDict['repeat']
        errStatsDict['nRecords'] = errStatsDict['n']*errStatsDict['repeat']
        sumRecords = int(errStatsDict['nRecords'].sum())
        errStatsDict['percentContrib'] = errStatsDict['nRecords']/(sumRecords*0.01)
        percentErr = (percentFail*errStatsDict.sum()['probContrib']/sumRecords)*100
        if(returnDict):
            return([errStatsDict,percentErr])
        return percentErr
    
    def calcErrStats(self):
        n,err,labels = [],[],[]
        err.append(self.allRecords['QA_flag'].mean()*2*100)
        n.append(self.allRecords['QA_flag'].count())
        labels.append("All Records")
        err.append(err[0]/2)
        n.append(self.allRecords[self.allRecords['QA_flag']==0]['QA_flag'].count())
        labels.append("QA Screened")
        err.append(self.estUncaughtErr(self.allRecords[self.allRecords['worker_flag']==0],self.allRecords['QA_flag'].mean()*2))
        #err.append(self.estUncaughtErr(self.allRecords))
        n.append(self.allRecords[self.allRecords['worker_flag']==0]['QA_flag'].count())
        labels.append("Worker Screened")
        timeScreen = self.allRecords[self.allRecords['mt_elapsed']>=30]
        percErr = timeScreen['QA_flag'].mean()*2
        timePercErr = self.estUncaughtErr(timeScreen[timeScreen['n_flags']==0],percErr)
        #timePercErr = self.estUncaughtErr(timeScreen)
        err.append(timePercErr)
        n.append(self.cleanRecords['QA_flag'].count())
        labels.append("Time Screened")
        errStats = ps.DataFrame({"dataset":labels,"err rate":err,"n":n})
        repeatStats,errVal = self.estUncaughtErr(self.cleanRecords,0,returnDict=True)
        repeatStats = repeatStats.sort_values(by="repeat")
        repeatStats = repeatStats[['n','repeat','percentContrib']]
        repeatStats.columns=['n participants','records/participant','% contribution']
        return([errStats,repeatStats])
    
    def calcTimeQuartileDiffs(self,absVal=False,cutOff = 55):
        """Compare vote intensity across submit-time quartiles for each label.

        absVal -- True analyses 'abs_cont_outcome_<label>', False (default)
                  analyses 'rel_cont_outcome_<label>'
        cutOff -- only records whose absolute-vote intensity is below this
                  value are kept

        Returns a dict keyed by the analysed column name. Each value is a
        DataFrame with one row per quartile and the columns 'mean', 'std',
        'n', 'pVal' and 'quartile'. 'pVal' is calcPVal(first quartile, this
        quartile); the first quartile's own entry is the placeholder -9999.
        """
        quartileSet = self.stratifyEarlyLateByState()
        prefix = 'abs_cont_outcome_' if absVal else 'rel_cont_outcome_'
        statsDict = {}
        for label in gConst.OUTCOME_LABELS:
            cat = prefix + label
            cutoffCat = 'abs_cont_outcome_' + label
            means, stds, n = [], [], []
            p = [-9999]
            refSet = None
            for position, quartile in enumerate(quartileSet):
                intensity = np.asarray(abs(quartile[cat]-50))
                intensityCutoff = np.asarray(abs(quartile[cutoffCat]-50))
                # The filter is always based on the absolute vote, even when
                # the relative vote is the quantity being analysed.
                intensity = intensity[intensityCutoff<cutOff]
                means.append(intensity.mean())
                stds.append(intensity.std())
                n.append(len(intensity))
                if position == 0:
                    refSet = intensity
                else:
                    p.append(self.calcPVal(refSet,intensity))
            statsDict[cat] = ps.DataFrame({
                'mean': means,
                'std': stds,
                'n': n,
                'pVal': p,
                'quartile': ['q1','q2','q3','q4'],
            })
        return statsDict
    
    def identifyBadWorkers(self):
        flaggedRecords = self.allRecords[self.allRecords['QA_flag']==1]
        flaggedWorkers = set(flaggedRecords['mt_work_id'])
        flaggedWorkerRecords = self.allRecords[self.allRecords['mt_work_id'].isin(flaggedWorkers)]
        cleanWorkerRecords = self.allRecords[~self.allRecords['mt_work_id'].isin(flaggedWorkers)]
        cleanWorkers = set(cleanWorkerRecords['mt_work_id'])
        flaggedWorkerRecords['worker_flag']=1
        cleanWorkerRecords['worker_flag']=0
        self.allRecords = flaggedWorkerRecords.append(cleanWorkerRecords)
    
    def addTimeFlag(self):
        belowCutoffRecords = self.allRecords[self.allRecords['mt_elapsed']<self.timeCutoff]
        aboveCutoffRecords = self.allRecords[self.allRecords['mt_elapsed']>=self.timeCutoff]
        belowCutoffRecords['time_flag'] = 1
        aboveCutoffRecords['time_flag'] = 0 
        self.allRecords = belowCutoffRecords.append(aboveCutoffRecords)
    
    def addQualityFlags(self):
        self.calcQAFlagAllRecords()
        self.identifyBadWorkers()
        self.addTimeFlag()
        self.allRecords['n_flags'] = self.allRecords['time_flag'] + self.allRecords['QA_flag'] + self.allRecords['worker_flag']
    
    def cleanUpRecords(self):
        self.calcAbsMeasuresAllLabels()
        self.addQualityFlags()
        self.calcCarNorms()
        self.calcRelMeasuresAllLabels()
        self.cleanRecords = self.allRecords[self.allRecords['n_flags']==0]
    
    def recordsToCSV(self,outputFilename):
        self.allRecords.to_csv(outputFilename,index=False)
        
    def getAllRecords(self):
        return self.allRecords
    
    def getCleanRecords(self):
        return self.cleanRecords