# Edges - Data loading

In [1]:
import pandas as pd
from plotnine import *
from plotnine.data import *
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, precision_recall_curve
import numpy as np


#####################################################################
#### REPLACE THIS WITH THE PATH TO YOUR DATA ########################
pathToData = '/san2/data/change-exploration/roleMerging/finalExperiments/finalResultFiles/' 
#####################################################################
#####################################################################

def prepareDF(df):
    df['edgeID'] = df['vertex1ID'] + '_' + df['vertex2ID']
    df['edgeIDIsSorted'] = df['vertex1ID'] > df['vertex2ID']
    filtered=df[df['evidence']>0]
    return filtered[filtered['cliqueSize']>1]

def readDF(weightedConfig,unweightedConfig,datasetName):
    maxRecallConfig = 'max_recall'
    pathMaxRecall=pathToData + datasetName + '/' + maxRecallConfig + '/edges.csv'
    pathWeighted=pathToData + datasetName + '/' + weightedConfig +'/edges.csv'
    pathUnweighted=pathToData + datasetName + '/' + unweightedConfig +'/edges.csv'
    dfEdgesMaxRecall = prepareDF(pd.read_csv(pathMaxRecall))
    dfEdgesWeighted = prepareDF(pd.read_csv(pathWeighted))
    dfEdgesUnweighted = prepareDF(pd.read_csv(pathUnweighted))
    datasetResult = {'maxRecall':dfEdgesMaxRecall,weightedConfig:dfEdgesWeighted,unweightedConfig:dfEdgesUnweighted}
    allDfs[datasetName] = datasetResult

weightedConfig = "weighted"
unweightedConfig = "unweighted"
datasetNames = ["austintexas","chicago", "gov.maryland",  "oregon",  "utah", "education","football","military", "politics", "tv_and_film"]

allDfs = {}

for datasetName in datasetNames:
    print("Reading",datasetName)
    readDF(weightedConfig,unweightedConfig,datasetName)


Reading austintexas




Reading chicago




Reading gov.maryland
Reading oregon
Reading utah
Reading education
Reading football




Reading military




Reading politics
Reading tv_and_film




NameError: name 'datasetNamesSocrata' is not defined

# Error Exploration

In [31]:
for ds in ["education","football","military", "politics", "tv_and_film"]:
    print("------------------------------")
    print(ds)
    oldFootballDF = prepareDF(pd.read_csv("/san2/data/change-exploration/roleMerging/optimization/evaluation/"+ds+"/alpha_5.18E-4/edges.csv"))
    newFootballDF = allDfs[ds]['weighted']
    edgesOld = set(oldFootballDF['edgeID'])
    edgesNew = set(newFootballDF['edgeID'])
    print(len(edgesOld))
    print(len(edgesNew))
    inBoth = edgesOld.intersection(edgesNew)
    print(len(inBoth))
    
for ds in ["austintexas","chicago", "gov.maryland",  "oregon",  "utah"]:
    print("------------------------------")
    print(ds)
    oldFootballDF = prepareDF(pd.read_csv("/san2/data/change-exploration/roleMerging/optimization/evaluation/"+ds+"/alpha_3.1E-5/edges.csv"))
    newFootballDF = allDfs[ds]['weighted']
    edgesOld = set(oldFootballDF['edgeID'])
    edgesNew = set(newFootballDF['edgeID'])
    print(len(edgesOld))
    print(len(edgesNew))
    inBoth = edgesOld.intersection(edgesNew)
    print(len(inBoth))

------------------------------
education
1109776
1110531
1108254
------------------------------
football




689315
478501
1604
------------------------------
military
124510
182448
63509
------------------------------
politics
54416
47745
592
------------------------------
tv_and_film




386086
395445
105
------------------------------
austintexas
80808
118932
77977
------------------------------
chicago
22890
193796
22105
------------------------------
gov.maryland
2411
53082
1830
------------------------------
oregon
37063
37300
36807
------------------------------
utah
6269
6695
6223


689315
478501
1604


In [28]:
politics= allDfs['politics']['unweighted']
print(politics[politics['cliqueID']==6773])

Empty DataFrame
Columns: [ComponentID, Method, cliqueID, cliqueSize, vertex1ID, vertex2ID, remainsValid, evidence, score, edgeID, edgeIDIsSorted]
Index: []


# Precision Recall and F1:

In [5]:
def getPrecision(df,filterByInteresting):
    dfToUse=df
    if(filterByInteresting):
        dfToUse=df[df['evidence']>0]
    truth=dfToUse['remainsValid']
    dfToUse['predicted']=True
    predicted=dfToUse['predicted']
    return accuracy_score(truth, predicted)

def printSingleCSVString(dsName,configName,newline):
    dfMaxRecall=allDfs[dsName]['maxRecall']
    dfConfigResult=allDfs[dsName][configName]
    dfConfigResult=dfConfigResult[dfConfigResult['evidence']>0]
    validEdgesFromMaxRecall = dfMaxRecall[dfMaxRecall['remainsValid']]
    validEdgesFromConfig = dfConfigResult[dfConfigResult['remainsValid']]
    allPositives = set(validEdgesFromMaxRecall['edgeID'])
    truePositives = set(validEdgesFromConfig['edgeID'])
    recall = round(len(allPositives.intersection(truePositives)) / len(allPositives),2)
    precision = round(getPrecision(dfConfigResult,True),2)
    f1 = round(2*recall*precision/(recall+precision),2)
    if(newline):
        print(precision,recall,f1,sep=",")
    else:
        print(dsName,precision,recall,str(f1) + ",",sep=",",end="")

def printCSVString(dsName,weightedConfigName,unweightedConfigName):
    printSingleCSVString(dsName,unweightedConfigName,False)
    printSingleCSVString(dsName,weightedConfigName,True)

print("Dataset","unweighted","unweighted","unweighted","weighted","weighted","weighted",sep=",")

print("","precision","recall","f1","precision","recall","f1",sep=",")

for datasetName in datasetNames:
    printCSVString(datasetName,weightedConfig,unweightedConfig)

Dataset,unweighted,unweighted,unweighted,weighted,weighted,weighted
,precision,recall,f1,precision,recall,f1
austintexas,0.83,0.22,0.35,0.7,0.64,0.67
chicago,0.85,0.87,0.86,0.31,0.84,0.45
gov.maryland,0.23,0.17,0.2,0.15,0.53,0.23
oregon,0.57,0.34,0.43,0.35,0.86,0.5
utah,0.91,0.98,0.94,0.89,0.95,0.92
education,0.79,0.94,0.86,0.79,0.91,0.85
football,0.68,0.57,0.62,



0.0,0.0,nan
military,0.53,0.27,0.36,0.62,0.5,0.55
politics,0.53,0.5,0.51,0.01,0.01,0.01
tv_and_film,0.72,0.39,0.51,0.0,0.0,nan




# Exporting Role Matchings for Exploration

In [11]:
#Intersting failures (edges labelled as invalid sorted descendingly by score)


###########################################################################
#### REPLACE THIS WITH THE PATH TO YOUR Output Dir ########################
exportPath = 'exportedData/' 
###########################################################################
###########################################################################

for dsName in datasetNamesWikipedia:
    df = allDfs[dsName][weightedConfigWikipedia]
    invalid = df[df['remainsValid']==False]
    invalid = invalid.sort_values('score',ascending=False)
    invalid.to_csv( exportPath + "invalidEdges_"+dsName+".csv")
    
for dsName in datasetNamesSocrata:
    df = allDfs[dsName][weightedConfigSocrata]
    invalid = df[df['remainsValid']==False]
    invalid = invalid.sort_values('score',ascending=False)
    invalid.to_csv(exportPath + "invalidEdges_"+dsName+".csv")    
    

In [12]:
#exporting max recall edge set:

def exportEdgeIDs(configName,datasetName):
    ds = allDfs[datasetName][configName]
    path = exportPath + 'maxRecallEdges_' + datasetName + ".csv"
    ds['edgeID'].to_csv(path)

for datasetName in datasetNamesSocrata:
    exportEdgeIDs(unweightedConfigSocrata,datasetName) #does not matter which config we pass here, max recall is the same for both
    
for datasetName in datasetNamesWikipedia:
    exportEdgeIDs(unweightedConfigWikipedia,datasetName)


In [13]:
#exporting all true positive edges
def exportTruePositiveEdgeIDs(configName,datasetName):
    ds = allDfs[datasetName][configName]
    ds = ds[ds['remainsValid']]
    path = exportPath + 'truePositiveEdges_' + datasetName + ".csv"
    ds[['vertex1ID','vertex2ID']].to_csv(path)

for datasetName in datasetNamesSocrata:
    exportTruePositiveEdgeIDs(weightedConfigSocrata,datasetName)
    
for datasetName in datasetNamesWikipedia:
    exportTruePositiveEdgeIDs(weightedConfigWikipedia,datasetName)

# Clique Precision

In [8]:
#header: ComponentID,Method,cliqueID,cliqueSize,edgesTotal,validEdges,totalEvidence,fractionOfVerticesWithEvidence,score,alpha

def readCliqueDF(weightedConfig,unweightedConfig,datasetName):
    pathWeighted=pathToData +datasetName + '/' + weightedConfig +'/cliques.csv'
    weightedDF = pd.read_csv(pathWeighted)
    weightedDF['remainsValid'] = (weightedDF['validEdges'] == weightedDF['edgesTotal'])
    weightedDF = weightedDF[weightedDF['cliqueSize']>1]
    
    pathUnweighted=pathToData +datasetName + '/' + unweightedConfig +'/cliques.csv'
    unweightedDF = pd.read_csv(pathUnweighted)
    unweightedDF['remainsValid'] = (unweightedDF['validEdges'] == unweightedDF['edgesTotal'])
    unweightedDF = unweightedDF[unweightedDF['cliqueSize']>1]
    
    datasetResult = {weightedConfig:weightedDF,unweightedConfig:unweightedDF}
    allCliqueDfs[datasetName] = datasetResult

weightedConfig = "weighted"
unweightedConfig = "unweighted"
datasetNames = ["austintexas","chicago", "gov.maryland",  "oregon",  "utah", "education","football","military", "politics", "tv_and_film"]

allCliqueDfs = {}

for datasetName in datasetNames:
    print("Reading",datasetName)
    readCliqueDF(weightedConfig,unweightedConfig,datasetName)
    
print(allCliqueDfs['chicago'][weightedConfig].dtypes)

Reading austintexas
Reading chicago
Reading gov.maryland
Reading oregon
Reading utah
Reading education
Reading football




Reading military
Reading politics
Reading tv_and_film
ComponentID                        object
Method                             object
cliqueID                            int64
cliqueSize                          int64
edgesTotal                          int64
validEdges                          int64
totalEvidence                       int64
fractionOfVerticesWithEvidence    float64
score                             float64
alpha                             float64
remainsValid                         bool
dtype: object




In [10]:
def printPrecision(dsName,configName):
    df=allCliqueDfs[dsName][configName]
    df=df[df['totalEvidence']>0]
    validEdgesFromConfig = df[df['remainsValid']]
    precision = len(validEdgesFromConfig.index) / len(df.index)
    weightedPrecision = sum(validEdgesFromConfig['edgesTotal']) / sum(df['edgesTotal'])
    print(dsName,round(precision,2),sep=",")
    resultRow = {'dataset':dsName,'configName':configName,'Precision':precision}
    resultDFRows.append(resultRow)

resultDFRows=[]

print("---------------------------------Weighted-----------------------------------------------")
print("Dataset","CliquePrecision",sep=",")
for datasetName in datasetNames:
    printPrecision(datasetName,weightedConfig)

print("---------------------------------Unweighted-----------------------------------------------")
print("Dataset","CliquePrecision",sep=",")
print("Dataset","CliquePrecision",sep=",")
for datasetName in datasetNames:
    printPrecision(datasetName,unweightedConfig)

resultCliqueDF = pd.DataFrame(resultDFRows)
        
        

---------------------------------Weighted-----------------------------------------------
Dataset,CliquePrecision
austintexas,0.92
chicago,0.92
gov.maryland,0.26
oregon,0.71
utah,0.93
education,0.36
football,0.0
military,0.29
politics,0.01
tv_and_film,0.0
---------------------------------Unweighted-----------------------------------------------
Dataset,CliquePrecision
Dataset,CliquePrecision
austintexas,0.85
chicago,0.87
gov.maryland,0.26
oregon,0.67
utah,0.91
education,0.31
football,0.36
military,0.3
politics,0.33
tv_and_film,0.42


In [15]:
sum(allCliqueDfs['football']['weighted']['remainsValid']) / len(allCliqueDfs['football']['weighted'].index) #59072,MDMCP,174607,2,1,1,0,0.0,2.4126667995005846E-5,5.18E-4

0.03128255984735316