# Edges - Data loading

In [None]:
import pandas as pd
from plotnine import *
from plotnine.data import *
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, precision_recall_curve
import numpy as np

def prepareDF(df):
    """Annotate an edge table in place and return its rows of interest.

    Two columns are added to ``df`` itself:

    * ``edgeID``: ``vertex1ID`` and ``vertex2ID`` joined by ``'_'``.
    * ``edgeIDIsSorted``: whether ``vertex1ID`` compares greater than
      ``vertex2ID``.

    Returns the rows whose ``evidence`` is positive and whose ``cliqueSize``
    exceeds one.
    """
    first, second = df['vertex1ID'], df['vertex2ID']
    df['edgeID'] = first + '_' + second
    df['edgeIDIsSorted'] = first > second
    # One combined mask selects the same rows as filtering twice in a row.
    keep = (df['evidence'] > 0) & (df['cliqueSize'] > 1)
    return df[keep]

def readDF(weightedConfig,unweightedConfig,datasetName):
    """Load the edge tables of one dataset and register them in ``allDfs``.

    Three ``edges.csv`` files are read from
    ``data/cliqueData/<datasetName>/<config>/``: the ``max_recall`` run, the
    weighted run and the unweighted run. Each is passed through ``prepareDF``.
    The result is stored as ``allDfs[datasetName]``, keyed by ``'maxRecall'``
    and by the two configuration names.
    """
    def loadEdges(configDir):
        # Read and prepare the edge table of a single configuration.
        csvPath = 'data/cliqueData/' + datasetName + '/' + configDir + '/edges.csv'
        return prepareDF(pd.read_csv(csvPath))

    allDfs[datasetName] = {
        'maxRecall': loadEdges('max_recall'),
        weightedConfig: loadEdges(weightedConfig),
        unweightedConfig: loadEdges(unweightedConfig),
    }

# Result directories of the weighted and the unweighted run per data source,
# and the datasets evaluated for each source.
weightedConfigSocrata = "alpha_3.1E-5"
unweightedConfigSocrata = "baselineNoWeight"
datasetNamesSocrata = ["austintexas","chicago", "gov.maryland",  "oregon",  "utah"]
weightedConfigWikipedia = "alpha_5.18E-4"
unweightedConfigWikipedia = "baselineNoWeight"
datasetNamesWikipedia = ["education","football","military", "politics", "tv_and_film"]

# The evaluation and export cells further down look the configurations up
# under these names; without the aliases they fail with a NameError when the
# notebook is run from a fresh kernel.
configNameSocrataOurApproach = weightedConfigSocrata
configNameSocrataBaselineNoWeight = unweightedConfigSocrata
configNameWikipediaOurApproach = weightedConfigWikipedia
configNameWikipediaBaselineNoWeight = unweightedConfigWikipedia

# dataset name -> {'maxRecall' | configuration name -> prepared edge table};
# filled by readDF.
allDfs = {}

for datasetName in datasetNamesSocrata:
    print("Reading",datasetName)
    readDF(weightedConfigSocrata,unweightedConfigSocrata,datasetName)

for datasetName in datasetNamesWikipedia:
    print("Reading",datasetName)
    readDF(weightedConfigWikipedia,unweightedConfigWikipedia,datasetName)

# Sanity check: show the column types of one loaded table.
print(allDfs[datasetNamesSocrata[0]]["maxRecall"].dtypes)

Reading austintexas




Reading chicago
Reading gov.maryland
Reading oregon
Reading utah
Reading education


# Precision, Recall, and F1

In [3]:
def getPrecision(df,filterByInteresting):
    """Return the fraction of rows whose ``remainsValid`` flag is true.

    Every row counts as a positive prediction, so this fraction is the
    precision of the returned edge set.

    Parameters:
        df: table with a boolean ``remainsValid`` column (and an ``evidence``
            column when ``filterByInteresting`` is set).
        filterByInteresting: when true, only rows with ``evidence > 0`` are
            taken into account.

    Returns:
        The fraction as a float, or NaN when no rows remain.

    ``df`` is not modified. No helper column is written, which also avoids a
    SettingWithCopyWarning when ``df`` is a filtered slice.
    """
    dfToUse = df[df['evidence'] > 0] if filterByInteresting else df
    truth = dfToUse['remainsValid']
    if len(truth) == 0:
        return float('nan')
    # With every row predicted positive, accuracy against an all-True vector
    # is simply the share of True labels.
    return float(truth.mean())

def getNumCorrectReturnedAllEdges(df,filterByInteresting):
    """Count the rows of ``df`` whose ``remainsValid`` flag is set.

    When ``filterByInteresting`` is true, only rows with ``evidence > 0`` are
    counted.
    """
    candidates = df[df['evidence'] > 0] if filterByInteresting else df
    stillValid = candidates[candidates['remainsValid']]
    return stillValid.shape[0]

def getValidEdgeSet(df):
    """Return the set of ``'<vertex1ID>_<vertex2ID>'`` strings for all rows of ``df``.

    NOTE(review): despite the name, no validity filter is applied here;
    presumably callers pass an already filtered table. Confirm.
    """
    sources = df['vertex1ID']
    targets = df['vertex2ID']
    edgeIDs = sources + '_' + targets
    return set(edgeIDs)

def printSingleCSVString(dsName,configName,newline):
    """Print precision, recall and F1 of one configuration as CSV fields.

    Recall is measured against the valid edges of the ``'maxRecall'`` table of
    the dataset. Precision is taken over the configuration's edges with
    ``evidence > 0``.

    Parameters:
        dsName: key of the dataset in ``allDfs``.
        configName: key of the configuration within ``allDfs[dsName]``.
        newline: when true, print ``precision,recall,f1`` followed by a line
            break. Otherwise print ``dsName,precision,recall,f1,`` without a
            line break, so that a second call can complete the row.

    All three figures are rounded to two decimals for display. F1 is computed
    from the unrounded precision and recall, so rounding errors do not
    compound. Recall is reported as 0 when there are no valid max-recall
    edges, and F1 as 0 when precision and recall are both 0.
    """
    dfMaxRecall = allDfs[dsName]['maxRecall']
    dfConfigResult = allDfs[dsName][configName]
    dfConfigResult = dfConfigResult[dfConfigResult['evidence'] > 0]

    allPositives = set(dfMaxRecall[dfMaxRecall['remainsValid']]['edgeID'])
    truePositives = set(dfConfigResult[dfConfigResult['remainsValid']]['edgeID'])

    if allPositives:
        rawRecall = len(allPositives & truePositives) / len(allPositives)
    else:
        rawRecall = 0.0
    rawPrecision = getPrecision(dfConfigResult, True)

    denominator = rawRecall + rawPrecision
    if denominator == 0:
        rawF1 = 0.0
    else:
        rawF1 = 2 * rawRecall * rawPrecision / denominator

    precision = round(rawPrecision, 2)
    recall = round(rawRecall, 2)
    f1 = round(rawF1, 2)
    if newline:
        print(precision,recall,f1,sep=",")
    else:
        print(dsName,precision,recall,str(f1) + ",",sep=",",end="")

def printCSVString(dsName,configName,configNameBaseline):
    """Print one CSV row for ``dsName``: baseline metrics, then ``configName`` metrics."""
    # The baseline call leaves the line open; the second call terminates it.
    for config, endsRow in ((configNameBaseline, False), (configName, True)):
        printSingleCSVString(dsName, config, endsRow)

def processDS(dsName,configName):
    """Print unrounded precision, recall and F-measure of one configuration.

    Recall is measured against the valid edges of the ``'maxRecall'`` table of
    the dataset. Precision is taken over the configuration's edges with
    ``evidence > 0``.

    Parameters:
        dsName: key of the dataset in ``allDfs``.
        configName: key of the configuration within ``allDfs[dsName]``.

    Recall is reported as 0 when there are no valid max-recall edges, and the
    F-measure as 0 when precision and recall are both 0, instead of raising
    ZeroDivisionError.
    """
    print("--------------------------------------------------------------------------------")
    print(dsName,configName)
    dfMaxRecall = allDfs[dsName]['maxRecall']
    dfConfigResult = allDfs[dsName][configName]
    dfConfigResult = dfConfigResult[dfConfigResult['evidence'] > 0]

    allPositives = set(dfMaxRecall[dfMaxRecall['remainsValid']]['edgeID'])
    truePositives = set(dfConfigResult[dfConfigResult['remainsValid']]['edgeID'])

    if allPositives:
        recall = len(allPositives & truePositives) / len(allPositives)
    else:
        recall = 0.0
    precision = getPrecision(dfConfigResult, True)

    denominator = recall + precision
    if denominator == 0:
        f1 = 0.0
    else:
        f1 = 2 * recall * precision / denominator

    print("Precision",precision)
    print("Recall",recall)
    print("F-Measure",f1)

# One CSV row per dataset (baseline metrics first, then our approach):
# Socrata datasets first, Wikipedia datasets afterwards.
for datasetName in datasetNamesSocrata:
    printCSVString(
        dsName=datasetName,
        configName=configNameSocrataOurApproach,
        configNameBaseline=configNameSocrataBaselineNoWeight,
    )
for datasetName in datasetNamesWikipedia:
    printCSVString(
        dsName=datasetName,
        configName=configNameWikipediaOurApproach,
        configNameBaseline=configNameWikipediaBaselineNoWeight,
    )

austintexas,0.83,0.22,0.35,0.94,0.6,0.73
chicago,0.85,0.87,0.86,0.9,0.8,0.85
gov.maryland,0.23,0.17,0.2,0.25,0.17,0.2
oregon,0.57,0.34,0.43,0.35,0.85,0.5
utah,0.91,0.98,0.94,0.93,0.93,0.93
education,0.79,0.94,0.86,0.79,0.91,0.85
football,0.68,0.57,0.62,0.69,0.53,0.6
military,0.53,0.27,0.36,0.61,0.58,0.59
politics,0.53,0.5,0.51,0.57,0.66,0.61
tv_and_film,0.72,0.39,0.51,0.72,0.52,0.6


# Exporting Role Matchings for Exploration

In [4]:
# Interesting failures: edges labelled as invalid, sorted by descending score.

def exportInvalidEdges(configName,datasetName):
    """Write the invalid edges of one run to ``exportedData/invalidEdges/<datasetName>.csv``.

    The rows of ``allDfs[datasetName][configName]`` whose ``remainsValid``
    flag is false are written, ordered by ``score`` from highest to lowest.
    """
    result = allDfs[datasetName][configName]
    invalid = result[~result['remainsValid']].sort_values('score',ascending=False)
    invalid.to_csv("exportedData/invalidEdges/"+datasetName+".csv")

for dsName in datasetNamesWikipedia:
    exportInvalidEdges(configNameWikipediaOurApproach,dsName)

for dsName in datasetNamesSocrata:
    exportInvalidEdges(configNameSocrataOurApproach,dsName)
    

In [9]:
#exporting max recall edge set:

def exportEdgeIDs(configName,datasetName):
    """Write the ``edgeID`` column of one run to ``exportedData/maxRecallEdges/<datasetName>.csv``.

    The table is taken from ``allDfs[datasetName][configName]``. The output
    directory is fixed and does not depend on ``configName``.
    """
    edgeIDs = allDfs[datasetName][configName]['edgeID']
    target = 'exportedData/maxRecallEdges/' + datasetName + ".csv"
    edgeIDs.to_csv(target)

# Export the edge IDs of every dataset, Socrata first and Wikipedia second.
# NOTE(review): the baseline configuration is exported here, although the
# target directory is named maxRecallEdges -- confirm this is intended.
for datasetName in datasetNamesSocrata:
    exportEdgeIDs(
        configName=configNameSocrataBaselineNoWeight,
        datasetName=datasetName,
    )

for datasetName in datasetNamesWikipedia:
    exportEdgeIDs(
        configName=configNameWikipediaBaselineNoWeight,
        datasetName=datasetName,
    )


In [10]:
#exporting all true positive edges
def exportTruePositiveEdgeIDs(configName,datasetName):
    """Write the endpoints of all valid edges of one run to CSV.

    Rows of ``allDfs[datasetName][configName]`` with a true ``remainsValid``
    flag are selected. Their ``vertex1ID`` and ``vertex2ID`` columns are
    written to ``exportedData/truePositiveEdges/<datasetName>.csv``.
    """
    edges = allDfs[datasetName][configName]
    validEdges = edges[edges['remainsValid']]
    endpoints = validEdges[['vertex1ID','vertex2ID']]
    endpoints.to_csv('exportedData/truePositiveEdges/' + datasetName + ".csv")

# Export the valid edges found by our approach, Socrata first and Wikipedia second.
for datasetName in datasetNamesSocrata:
    exportTruePositiveEdgeIDs(
        configName=configNameSocrataOurApproach,
        datasetName=datasetName,
    )

for datasetName in datasetNamesWikipedia:
    exportTruePositiveEdgeIDs(
        configName=configNameWikipediaOurApproach,
        datasetName=datasetName,
    )

# Clique Precision

In [4]:
#header: ComponentID,Method,cliqueID,cliqueSize,edgesTotal,validEdges,totalEvidence,fractionOfVerticesWithEvidence,score,alpha

def readCliqueDF(configName,baselineConfigName,datasetName):
    """Load the clique tables of one dataset and register them in ``allCliqueDfs``.

    ``cliques.csv`` is read from ``data/cliqueData/<datasetName>/<config>/``
    for ``configName`` and for ``baselineConfigName``. Each table gets a
    ``remainsValid`` column that is true when all edges of the clique are
    valid (``validEdges == edgesTotal``), and is then restricted to cliques of
    size greater than one. A summary line for the ``configName`` table is
    printed.
    """
    def loadCliques(config):
        # Read one cliques.csv, flag fully valid cliques, drop singletons.
        frame = pd.read_csv('data/cliqueData/' +datasetName + '/' + config +'/cliques.csv')
        frame['remainsValid'] = (frame['validEdges'] == frame['edgesTotal'])
        return frame[frame['cliqueSize']>1]

    cliqueDF = loadCliques(configName)
    baselineDF = loadCliques(baselineConfigName)

    numCliques = len(cliqueDF.index)
    numEdges = sum(cliqueDF.edgesTotal)
    numValidEdges = sum(cliqueDF.validEdges)
    print("Dataset",datasetName,"has ",numCliques,"cliques and", numEdges," edges of which ",numValidEdges,"are valid")
    allCliqueDfs[datasetName] = {configName:cliqueDF,baselineConfigName:baselineDF}

# Result directories and dataset lists for the clique evaluation.
# Note that this rebinds datasetNamesSocrata / datasetNamesWikipedia with a
# different ordering than the edge-loading cell uses.
configNameSocrata = "alpha_3.1E-5"
configNameSocrataBaseline = "baselineNoWeight"
datasetNamesSocrata = [
    "chicago",
    "austintexas",
    "gov.maryland",
    "oregon",
    "utah",
]
configNameWikipedia = "alpha_5.18E-4"
configNameWikipediaBaseline = "baselineNoWeight"
datasetNamesWikipedia = [
    "politics",
    "military",
    "tv_and_film",
    "education",
    "football",
]

# dataset name -> {configuration name -> clique table}; filled by readCliqueDF.
allCliqueDfs = {}

for datasetName in datasetNamesSocrata:
    print("Reading",datasetName)
    readCliqueDF(
        configName=configNameSocrata,
        baselineConfigName=configNameSocrataBaseline,
        datasetName=datasetName,
    )

for datasetName in datasetNamesWikipedia:
    print("Reading",datasetName)
    readCliqueDF(
        configName=configNameWikipedia,
        baselineConfigName=configNameWikipediaBaseline,
        datasetName=datasetName,
    )

# Sanity check: show the column types of one loaded table.
print(allCliqueDfs['chicago'][configNameSocrata].dtypes)

Reading chicago
Dataset chicago has  20317 cliques and 23297  edges of which  21025 are valid
Reading austintexas
Dataset austintexas has  27076 cliques and 81834  edges of which  76675 are valid
Reading gov.maryland
Dataset gov.maryland has  1292 cliques and 2420  edges of which  606 are valid
Reading oregon
Dataset oregon has  4741 cliques and 38381  edges of which  14296 are valid
Reading utah
Dataset utah has  13106 cliques and 22172  edges of which  21729 are valid
Reading politics
Dataset politics has  10853 cliques and 69428  edges of which  45764 are valid
Reading military
Dataset military has  16449 cliques and 141711  edges of which  93018 are valid
Reading tv_and_film




Dataset tv_and_film has  50704 cliques and 584225  edges of which  477343 are valid
Reading education
Dataset education has  19213 cliques and 1545031  edges of which  1307725 are valid
Reading football
Dataset football has  83985 cliques and 819385  edges of which  603066 are valid
ComponentID                        object
Method                             object
cliqueID                            int64
cliqueSize                          int64
edgesTotal                          int64
validEdges                          int64
totalEvidence                       int64
fractionOfVerticesWithEvidence    float64
score                             float64
alpha                             float64
remainsValid                         bool
dtype: object




In [5]:
def processCliqueDS(dsName,configName,datasource):
    """Compute the clique precision of one run, print it and record it.

    Only cliques with ``totalEvidence > 0`` are considered. Precision is the
    share of those cliques whose ``remainsValid`` flag is true.

    Parameters:
        dsName: key of the dataset in ``allCliqueDfs``.
        configName: key of the configuration within ``allCliqueDfs[dsName]``.
        datasource: label stored in the ``datasource`` field of the result row.

    The precision rounded to two decimals is printed, and a row holding the
    unrounded value is appended to ``resultDFRows``. When no clique has
    evidence the precision is NaN rather than a ZeroDivisionError.
    """
    dfConfigResult = allCliqueDfs[dsName][configName]
    dfConfigResult = dfConfigResult[dfConfigResult['totalEvidence'] > 0]

    numCliques = len(dfConfigResult.index)
    numValidCliques = len(dfConfigResult[dfConfigResult['remainsValid']].index)
    precision = numValidCliques / numCliques if numCliques else float('nan')

    print(round(precision,2))
    resultRow = {'datasource':datasource,'dataset':dsName,'configName':configName,'Precision':precision}
    resultDFRows.append(resultRow)

# Result rows collected by processCliqueDS: first our configuration for all
# datasets, then (after a separator line in the output) the baseline.
resultDFRows = []

for datasetName in datasetNamesSocrata:
    processCliqueDS(dsName=datasetName, configName=configNameSocrata, datasource="socrata")

for datasetName in datasetNamesWikipedia:
    processCliqueDS(dsName=datasetName, configName=configNameWikipedia, datasource="wikipedia")

print("--------------------------------------------------------------------------------")

for datasetName in datasetNamesSocrata:
    processCliqueDS(dsName=datasetName, configName=configNameSocrataBaseline, datasource="socrata")

for datasetName in datasetNamesWikipedia:
    processCliqueDS(dsName=datasetName, configName=configNameWikipediaBaseline, datasource="wikipedia")

# One row per (dataset, configuration) with the unrounded precision.
resultCliqueDF = pd.DataFrame(resultDFRows)

#plot = (ggplot(resultCliqueDF, aes(x='dataset',ymin=0,ymax=1, y='Precision',fill='dataset')) 
#      + geom_col(show_legend=False)
#      #+ geom_point(size=3)
#      #+ scale_x_continuous(name="Matching Time [%]")
#      #+ scale_y_continuous(name="Validity")
#      + theme(text=element_text(size=14),axis_text_x=element_text(rotation=45, hjust=1))
#      #+ theme(axis_text_x=element_text(size=12),axis_text_y=element_text(size=12))
#      + labs(title='',x=''))
#print(plot)
#fname = 'exportedPlots/cliquePrecision.jpg'
#plot.save(filename = fname)
        
        
        

0.9
0.89
0.24
0.72
0.93
0.37
0.33
0.45
0.36
0.43
--------------------------------------------------------------------------------
0.87
0.85
0.26
0.67
0.91
0.33
0.3
0.42
0.31
0.36
