# Edges - Data loading

In [8]:
import pandas as pd
from plotnine import *
from plotnine.data import *
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix, precision_recall_curve
import numpy as np


#####################################################################
#### REPLACE THIS WITH THE PATH TO YOUR DATA ########################
# Root directory of the input data; readDF appends
# '<datasetName>/<config>/edges.csv' to it, so keep the trailing slash.
pathToData = 'data/cliqueData/' 
#####################################################################
#####################################################################

def prepareDF(df):
    """Annotate an edge table and return only the rows of interest.

    Two columns are added to ``df`` itself (the caller's frame is modified):
    ``edgeID`` is ``vertex1ID + '_' + vertex2ID`` and ``edgeIDIsSorted`` is
    True where ``vertex1ID > vertex2ID``.

    Returns the rows with ``evidence > 0`` and ``cliqueSize > 1``.
    """
    firstVertex = df['vertex1ID']
    secondVertex = df['vertex2ID']
    df['edgeID'] = firstVertex + '_' + secondVertex
    df['edgeIDIsSorted'] = firstVertex > secondVertex
    # Both conditions combined into a single row mask.
    keepRow = (df['evidence'] > 0) & (df['cliqueSize'] > 1)
    return df[keepRow]

def readDF(weightedConfig,unweightedConfig,datasetName):
    """Load the edge tables of one dataset and register them in ``allDfs``.

    Reads ``edges.csv`` from the ``max_recall``, ``weightedConfig`` and
    ``unweightedConfig`` sub-directories of ``pathToData + datasetName``,
    runs each through ``prepareDF`` and stores the three frames under
    ``allDfs[datasetName]`` with the keys ``'maxRecall'``, ``weightedConfig``
    and ``unweightedConfig``.
    """
    datasetDir = pathToData + datasetName + '/'
    # (key in the result dict, sub-directory name), in the order the files are read.
    sources = (
        ('maxRecall', 'max_recall'),
        (weightedConfig, weightedConfig),
        (unweightedConfig, unweightedConfig),
    )
    loaded = {}
    for resultKey, configDir in sources:
        edgesPath = datasetDir + configDir + '/edges.csv'
        loaded[resultKey] = prepareDF(pd.read_csv(edgesPath))
    allDfs[datasetName] = loaded

# Result sub-directory names of the weighted / unweighted runs, and the
# datasets belonging to each data source.
weightedConfigSocrata = "alpha_3.1E-5"
unweightedConfigSocrata = "baselineNoWeight"
datasetNamesSocrata = [
    "austintexas",
    "chicago",
    "gov.maryland",
    "oregon",
    "utah",
]
weightedConfigWikipedia = "alpha_5.18E-4"
unweightedConfigWikipedia = "baselineNoWeight"
datasetNamesWikipedia = [
    "education",
    "football",
    "military",
    "politics",
    "tv_and_film",
]

# Filled by readDF: dataset name -> {result key -> edge DataFrame}.
allDfs = {}

for datasetName in datasetNamesSocrata:
    print(f"Reading {datasetName}")
    readDF(
        weightedConfig=weightedConfigSocrata,
        unweightedConfig=unweightedConfigSocrata,
        datasetName=datasetName,
    )

for datasetName in datasetNamesWikipedia:
    print(f"Reading {datasetName}")
    readDF(
        weightedConfig=weightedConfigWikipedia,
        unweightedConfig=unweightedConfigWikipedia,
        datasetName=datasetName,
    )

# Show the column types of one loaded table as a sanity check.
print(allDfs[datasetNamesSocrata[0]]["maxRecall"].dtypes)

Reading austintexas




Reading chicago
Reading gov.maryland
Reading oregon
Reading utah
Reading education
Reading football




Reading military




Reading politics
Reading tv_and_film




ComponentID        object
Method             object
cliqueID            int64
cliqueSize          int64
vertex1ID          object
vertex2ID          object
remainsValid         bool
evidence            int64
score             float64
edgeID             object
edgeIDIsSorted       bool
dtype: object


# Precision, Recall, and F1

In [19]:
def getPrecision(df,filterByInteresting):
    """Return the fraction of edges in ``df`` whose ``remainsValid`` flag is True.

    Every edge in the frame counts as a positive prediction, so this fraction
    is the precision of the edge set. It equals the accuracy against an
    all-True prediction vector, but is computed directly so that no
    ``predicted`` column is written onto the caller's frame or onto a filtered
    slice of it.

    Parameters:
        df: edge table with a boolean ``remainsValid`` column (and an
            ``evidence`` column when ``filterByInteresting`` is set).
        filterByInteresting: if truthy, only rows with ``evidence > 0`` are
            considered.

    Returns:
        The precision as a float; NaN when no rows are left to evaluate.
    """
    dfToUse = df[df['evidence'] > 0] if filterByInteresting else df
    # The mean of a boolean column is the share of True values.
    return float(dfToUse['remainsValid'].mean())

def printSingleCSVString(dsName,configName,newline):
    """Print precision, recall and F1 of one configuration as CSV fields.

    Only edges of the configuration with ``evidence > 0`` are evaluated.
    Recall is the share of valid max-recall edge IDs that also appear among
    the configuration's valid edge IDs. Precision comes from ``getPrecision``.
    F1 is computed from the already rounded precision and recall.

    Parameters:
        dsName: dataset key in ``allDfs``.
        configName: configuration key in ``allDfs[dsName]``.
        newline: if truthy, print ``precision,recall,f1`` followed by a line
            break; otherwise print ``dsName,precision,recall,f1,`` and stay on
            the same line so that another call can complete the row.

    Undefined metrics no longer raise ZeroDivisionError: recall is NaN when
    the max-recall set has no valid edges, and F1 is 0.0 when precision and
    recall are both 0.
    """
    dfMaxRecall = allDfs[dsName]['maxRecall']
    dfConfigResult = allDfs[dsName][configName]
    dfConfigResult = dfConfigResult[dfConfigResult['evidence'] > 0]

    allPositives = set(dfMaxRecall.loc[dfMaxRecall['remainsValid'], 'edgeID'])
    truePositives = set(dfConfigResult.loc[dfConfigResult['remainsValid'], 'edgeID'])

    if allPositives:
        recall = round(len(allPositives & truePositives) / len(allPositives), 2)
    else:
        # No valid edge in the reference set: recall is undefined.
        recall = float('nan')
    precision = round(getPrecision(dfConfigResult, True), 2)

    if recall + precision == 0:
        # Harmonic mean of two zeros; avoid dividing by zero.
        f1 = 0.0
    else:
        f1 = round(2 * recall * precision / (recall + precision), 2)

    if newline:
        print(precision, recall, f1, sep=",")
    else:
        print(dsName, precision, recall, str(f1) + ",", sep=",", end="")

def printCSVString(dsName,weightedConfigName,unweightedConfigName):
    """Print one complete CSV row for a dataset.

    The unweighted metrics come first and leave the line open; the weighted
    metrics follow and terminate the line.
    """
    rowParts = (
        (unweightedConfigName, False),
        (weightedConfigName, True),
    )
    for configName, endsLine in rowParts:
        printSingleCSVString(dsName, configName, endsLine)

# Two-line CSV header: the first line groups the metric columns by
# configuration, the second names the individual metrics.
print("Dataset","unweighted","unweighted","unweighted","weighted","weighted","weighted",sep=",")

print("","precision","recall","f1","precision","recall","f1",sep=",")

for datasetName in datasetNamesSocrata:
    printCSVString(datasetName,weightedConfigSocrata,unweightedConfigSocrata)
for datasetName in datasetNamesWikipedia:
    # Wikipedia rows use the Wikipedia baseline config (previously the Socrata
    # one was passed here; both currently hold the same name).
    printCSVString(datasetName,weightedConfigWikipedia,unweightedConfigWikipedia)

Dataset,unweighted,unweighted,unweighted,weighted,weighted,weighted
,precision,recall,f1,precision,recall,f1
chicago,0.85,0.87,0.86,0.9,0.8,0.85
austintexas,0.83,0.22,0.35,0.94,0.6,0.73
gov.maryland,0.23,0.17,0.2,0.25,0.17,0.2
oregon,0.57,0.34,0.43,0.35,0.85,0.5
utah,0.91,0.98,0.94,0.93,0.93,0.93
politics,0.53,0.5,0.51,0.57,0.66,0.61
military,0.53,0.27,0.36,0.61,0.58,0.59
tv_and_film,0.72,0.39,0.51,0.72,0.52,0.6
education,0.79,0.94,0.86,0.79,0.91,0.85
football,0.68,0.57,0.62,0.69,0.53,0.6


# Exporting Role Matchings for Exploration

In [11]:
# Interesting failures: edges labelled as invalid, sorted by descending score.


###########################################################################
#### REPLACE THIS WITH THE PATH TO YOUR OUTPUT DIRECTORY ##################
# File names are appended directly to this string, so keep the trailing slash.
exportPath = 'exportedData/'
###########################################################################
###########################################################################

# Wikipedia datasets: invalid edges of the weighted configuration.
for dsName in datasetNamesWikipedia:
    df = allDfs[dsName][weightedConfigWikipedia]
    invalid = df.loc[df['remainsValid'].eq(False)].sort_values(by='score', ascending=False)
    invalid.to_csv(f"{exportPath}invalidEdges_{dsName}.csv")

# Socrata datasets: same export with the Socrata weighted configuration.
for dsName in datasetNamesSocrata:
    df = allDfs[dsName][weightedConfigSocrata]
    invalid = df.loc[df['remainsValid'].eq(False)].sort_values(by='score', ascending=False)
    invalid.to_csv(f"{exportPath}invalidEdges_{dsName}.csv")
    

In [12]:
#exporting max recall edge set:

def exportEdgeIDs(configName,datasetName):
    """Write the edge IDs of a dataset's max-recall edge set to CSV.

    The output file is ``exportPath + 'maxRecallEdges_' + datasetName + '.csv'``.

    Parameters:
        configName: kept for backward compatibility and ignored. The
            max-recall table is stored per dataset under the ``'maxRecall'``
            key of ``allDfs``, independent of any configuration. Previously
            the table of ``configName`` was exported under the max-recall
            file name.
        datasetName: dataset key in ``allDfs``.
    """
    ds = allDfs[datasetName]['maxRecall']
    path = exportPath + 'maxRecallEdges_' + datasetName + ".csv"
    ds['edgeID'].to_csv(path)

# One export per dataset: Socrata first, then Wikipedia.
# NOTE(review): the original remark here says the choice of config does not
# matter because max recall is the same for both - confirm against
# exportEdgeIDs.
for datasetName in datasetNamesSocrata:
    exportEdgeIDs(configName=unweightedConfigSocrata, datasetName=datasetName)

for datasetName in datasetNamesWikipedia:
    exportEdgeIDs(configName=unweightedConfigWikipedia, datasetName=datasetName)


In [13]:
#exporting all true positive edges
def exportTruePositiveEdgeIDs(configName,datasetName):
    """Write the vertex pairs of all valid edges of one configuration to CSV.

    Rows of ``allDfs[datasetName][configName]`` with a true ``remainsValid``
    flag are exported (columns ``vertex1ID`` and ``vertex2ID``) to
    ``exportPath + 'truePositiveEdges_' + datasetName + '.csv'``.
    """
    edges = allDfs[datasetName][configName]
    validPairs = edges.loc[edges['remainsValid'], ['vertex1ID', 'vertex2ID']]
    validPairs.to_csv(exportPath + 'truePositiveEdges_' + datasetName + ".csv")

# Export the valid edges of the weighted configuration for every dataset,
# Socrata first, then Wikipedia.
for datasetName in datasetNamesSocrata:
    exportTruePositiveEdgeIDs(configName=weightedConfigSocrata, datasetName=datasetName)

for datasetName in datasetNamesWikipedia:
    exportTruePositiveEdgeIDs(configName=weightedConfigWikipedia, datasetName=datasetName)

# Clique Precision

In [15]:
#header: ComponentID,Method,cliqueID,cliqueSize,edgesTotal,validEdges,totalEvidence,fractionOfVerticesWithEvidence,score,alpha

def _loadCliqueDF(path):
    """Read one cliques.csv, flag fully valid cliques and keep cliqueSize > 1."""
    cliqueDF = pd.read_csv(path)
    # A clique remains valid only if every one of its edges is valid.
    cliqueDF['remainsValid'] = cliqueDF['validEdges'] == cliqueDF['edgesTotal']
    return cliqueDF[cliqueDF['cliqueSize'] > 1]


def readCliqueDF(weightedConfig,unweightedConfig,datasetName):
    """Load the clique tables of one dataset and register them in ``allCliqueDfs``.

    Reads ``cliques.csv`` from the ``weightedConfig`` and ``unweightedConfig``
    sub-directories of ``pathToData + datasetName`` and stores the two frames
    under ``allCliqueDfs[datasetName]``, keyed by the configuration names.

    The data root is taken from ``pathToData`` (as in ``readDF``) instead of a
    hard-coded ``'data/cliqueData/'``, so changing ``pathToData`` also moves
    the clique input.
    """
    datasetDir = pathToData + datasetName + '/'
    allCliqueDfs[datasetName] = {
        weightedConfig: _loadCliqueDF(datasetDir + weightedConfig + '/cliques.csv'),
        unweightedConfig: _loadCliqueDF(datasetDir + unweightedConfig + '/cliques.csv'),
    }

# Configuration directory names and dataset lists used for the clique files.
# The assignments below (re)define the module-level names for this cell.
weightedConfigSocrata = "alpha_3.1E-5"
unweightedConfigSocrata = "baselineNoWeight"
datasetNamesSocrata = [
    "chicago",
    "austintexas",
    "gov.maryland",
    "oregon",
    "utah",
]
weightedConfigWikipedia = "alpha_5.18E-4"
unweightedConfigWikipedia = "baselineNoWeight"
datasetNamesWikipedia = [
    "politics",
    "military",
    "tv_and_film",
    "education",
    "football",
]

# Filled by readCliqueDF: dataset name -> {config name -> clique DataFrame}.
allCliqueDfs = {}

for datasetName in datasetNamesSocrata:
    print(f"Reading {datasetName}")
    readCliqueDF(
        weightedConfig=weightedConfigSocrata,
        unweightedConfig=unweightedConfigSocrata,
        datasetName=datasetName,
    )

for datasetName in datasetNamesWikipedia:
    print(f"Reading {datasetName}")
    readCliqueDF(
        weightedConfig=weightedConfigWikipedia,
        unweightedConfig=unweightedConfigWikipedia,
        datasetName=datasetName,
    )

# Show the column types of one loaded table as a sanity check.
print(allCliqueDfs['chicago'][weightedConfigSocrata].dtypes)

Reading chicago
Reading austintexas
Reading gov.maryland
Reading oregon
Reading utah
Reading politics
Reading military
Reading tv_and_film
Reading education
Reading football




ComponentID                        object
Method                             object
cliqueID                            int64
cliqueSize                          int64
edgesTotal                          int64
validEdges                          int64
totalEvidence                       int64
fractionOfVerticesWithEvidence    float64
score                             float64
alpha                             float64
remainsValid                         bool
dtype: object




In [23]:
def printPrecision(dsName,configName,datasource):
    """Print and record the clique precision of one dataset/configuration.

    Only cliques with ``totalEvidence > 0`` are considered. Precision is the
    share of those cliques whose ``remainsValid`` flag is True. The value is
    printed as ``dsName,precision`` (rounded to two decimals) and appended,
    unrounded, as a row dict to ``resultDFRows``.

    Parameters:
        dsName: dataset key in ``allCliqueDfs``.
        configName: configuration key in ``allCliqueDfs[dsName]``.
        datasource: label stored in the ``datasource`` field of the row.

    When no clique has evidence the precision is reported as NaN instead of
    raising ZeroDivisionError.
    """
    df = allCliqueDfs[dsName][configName]
    df = df[df['totalEvidence'] > 0]
    cliqueCount = len(df.index)
    validCliqueCount = len(df[df['remainsValid']].index)
    precision = validCliqueCount / cliqueCount if cliqueCount else float('nan')
    print(dsName, round(precision, 2), sep=",")
    resultDFRows.append({
        'datasource': datasource,
        'dataset': dsName,
        'configName': configName,
        'Precision': precision,
    })

# Rows appended by printPrecision; turned into a DataFrame once all sections ran.
resultDFRows = []

# (banner line, Socrata config, Wikipedia config) per report section.
reportSections = (
    ("---------------------------------Weighted-----------------------------------------------",
     weightedConfigSocrata, weightedConfigWikipedia),
    ("---------------------------------Unweighted-----------------------------------------------",
     unweightedConfigSocrata, unweightedConfigWikipedia),
)

for sectionBanner, socrataConfig, wikipediaConfig in reportSections:
    print(sectionBanner)
    print("Dataset","CliquePrecision",sep=",")
    for datasetName in datasetNamesSocrata:
        printPrecision(datasetName, socrataConfig, "socrata")

    for datasetName in datasetNamesWikipedia:
        printPrecision(datasetName, wikipediaConfig, "wikipedia")

resultCliqueDF = pd.DataFrame(resultDFRows)
        
        

---------------------------------Weighted-----------------------------------------------
Dataset,CliquePrecision
chicago,0.9
austintexas,0.89
gov.maryland,0.24
oregon,0.72
utah,0.93
politics,0.37
military,0.33
tv_and_film,0.45
education,0.36
football,0.43
---------------------------------Unweighted-----------------------------------------------
Dataset,CliquePrecision
chicago,0.87
austintexas,0.85
gov.maryland,0.26
oregon,0.67
utah,0.91
politics,0.33
military,0.3
tv_and_film,0.42
education,0.31
football,0.36
