In [2]:
import datetime
import os
import sys

import DWM10_GetParms
import DWM20_TokenizerFunctions
import DWM25_Global_Token_Replace_NewDict ## added to import token replace code
import DWM30_BuildRefList
import DWM40_BuildBlocks
import DWM50_IterateBlocks
import DWM70_GeneratePairs
import DWM80_TransitiveClosure
import DWM90_IterateClusters
import DWM97_ClusterProfile
import DWM99_ERmetrics

In [4]:
# Main Driver for Refactored Data Washing Machine
# Version 1.20 creates a log file with same information being written to console
# Version 1.30 creates cluster profile at end of program and evaluates ER statistics
# Version 1.40 FK - added module DWM25 to do global level token replacement
#              JRT - added DWM65_ScoringMatrix to allow ScoringMatrix as a comparator type
# NOTE(review): version is a float, so 1.40 is displayed as "1.4"; a string
# would keep the trailing zero -- confirm nothing relies on it being numeric.
version = 1.40
# The current date and time label the log file. Every field is zero-padded so
# the tag is unambiguous (un-padded, Jan 11 and Nov 1 would both end in "111")
# and log files sort chronologically by name.
now = datetime.datetime.now()
tag = now.strftime('%Y%m%d_%H_%M')
# logFile stays open for the rest of the script; it is closed at the very end.
logFile = open('DWM_Log_'+tag+'.txt','w')
# Every message is written twice: once to the console and once to the log file
print("Data Washing Machine Refactor Version",version)
print("Data Washing Machine Refactor Version",version, file=logFile)
# Ask for the parameter file and load the run parameters from it
parmFileName = input('Enter Parameter File Name->')
parms = DWM10_GetParms.getParms(parmFileName)
# Passed to the tokenizer here and to the block builder later in the script
tokenFreqDict = {}
inputFileName = parms['inputFileName']
# Split off the extension with os.path.splitext so that a name without an
# extension keeps its last character, and a period in a directory part of the
# path (e.g. "./data/S2G") is not mistaken for the extension separator.
inputPrefix, inputExtension = os.path.splitext(inputFileName)
inputSuffix = inputExtension[1:]   # extension without its leading period
# Output file names are derived from the input file's prefix
tokenizedFileName = inputPrefix+'-Tokenized.txt'
DWM20_TokenizerFunctions.tokenizeInput(logFile, parms, tokenFreqDict, tokenizedFileName)
#print("** Token Frequency Dictionary \n",tokenFreqDict)
# Settings for the global token replacement step, read from the parameters
runReplacement = parms['runReplacement']
minFreqStdToken, minLenStdToken, maxFreqErrToken = (
    parms['minFreqStdToken'],
    parms['minLenStdToken'],
    parms['maxFreqErrToken'],
)

# When configured, run the global replacement and point tokenizedFileName at
# the '-TokenReplace.txt' file so the reference list is built from that file
if runReplacement:
    DWM25_Global_Token_Replace_NewDict.globalReplace(
        logFile,
        inputPrefix,
        minFreqStdToken,
        minLenStdToken,
        maxFreqErrToken,
    )
    tokenizedFileName = inputPrefix+'-TokenReplace.txt'
refList = DWM30_BuildRefList.buildRefList(logFile, tokenizedFileName)
# State carried through the iteration loop
moreToDo = True
linkIndex = []
# print() writes to sys.stdout when file is None, so looping over
# (None, logFile) sends each message to the console first and then to the log
for stream in (None, logFile):
    print('\n>>Starting Iterations', file=stream)
# Read each iteration parameter and echo it as soon as it has been read
mu = parms['mu']
for stream in (None, logFile):
    print('mu start value=', mu, file=stream)
muIterate = parms['muIterate']
for stream in (None, logFile):
    print('mu iterate value=', muIterate, file=stream)
epsilon = parms['epsilon']
for stream in (None, logFile):
    print('epsilon start value=', epsilon, file=stream)
epsilonIterate = parms['epsilonIterate']
for stream in (None, logFile):
    print('epsilon iterate value=', epsilonIterate, file=stream)
comparator = parms['comparator']
for stream in (None, logFile):
    print('comparator =', comparator, file=stream)

# Each pass builds blocks, compares within them, links pairs at the current mu,
# closes the links into clusters and hands the clusters to DWM90. The loop
# stops when a stage returns nothing or once mu has been raised above 1.0.
# print() writes to sys.stdout when file is None, so looping over
# (None, logFile) sends each message to the console first and then to the log.
while moreToDo:
    for stream in (None, logFile):
        print('\n****New Iteration\nSize of refList =', len(refList), 'Size of linkIndex =', len(linkIndex), file=stream)
    blockList = DWM40_BuildBlocks.buildBlocks(logFile, refList, parms, tokenFreqDict)
    if len(blockList) == 0:
        for stream in (None, logFile):
            print('--Ending because blockList is empty', file=stream)
        break
    blockList.sort()
    compareCache = DWM50_IterateBlocks.iterateBlocks(logFile, comparator, mu, blockList)
    pairList = DWM70_GeneratePairs.generatePairs(logFile, mu, compareCache)
    if len(pairList) == 0:
        for stream in (None, logFile):
            print('Ending because pairList is empty', file=stream)
        break
    clusterList = DWM80_TransitiveClosure.transitiveClosure(logFile, pairList)
    if len(clusterList) == 0:
        for stream in (None, logFile):
            print('--Ending because clusterList is empty', file=stream)
        break
    DWM90_IterateClusters.iterateClusters(logFile, epsilon, clusterList, refList, linkIndex)
    for stream in (None, logFile):
        print('\n>>End of Iteration, Resetting mu and epsilon', file=stream)
    # Step mu, rounding to two decimals after the addition
    mu = round(mu + muIterate, 2)
    for stream in (None, logFile):
        print('>>>New Value of mu = ', mu, file=stream)
    # epsilon is stepped without rounding
    epsilon = epsilon + epsilonIterate
    for stream in (None, logFile):
        print('>>>New Value of epsilon = ', epsilon, file=stream)
    if mu > 1.0:
        moreToDo = False
        for stream in (None, logFile):
            print('Ending because mu > 1.0', file=stream)
# End of iterations
# Add unclustered references to linkIndex: every reference still in refList
# is linked to itself, i.e. stored as the pair (refID, refID)
for x in refList:
    refID = x[1]
    linkIndex.append((refID, refID))
# sort linkIndex by cluster IDs (the first element of each pair)
linkIndex.sort()
# write out linkFile, but put RefID first and ClusterID second
linkFileName = inputPrefix+'-LinkIndex.txt'
# the with-block closes the file even if a write fails part-way through
with open(linkFileName,'w') as linkFile:
    linkFile.write('RefID, ClusterID\n')
    for c in linkIndex:
        linkFile.write(c[1]+','+c[0]+'\n')
print('Record written to',linkFileName, '=',len(linkIndex))
print('Record written to',linkFileName, '=',len(linkIndex), file=logFile)
# Build the cluster profile from linkIndex and report it as a table.
# print() writes to sys.stdout when file is None, so looping over
# (None, logFile) sends each line to the console first and then to the log.
profile = DWM97_ClusterProfile.generateProfile(linkIndex)
for stream in (None, logFile):
    print('\nCluster Profile', file=stream)
for stream in (None, logFile):
    print('Size\tCount', file=stream)
# Running sum of size * count over every row of the profile
total = 0
for size in sorted(profile.keys()):
    count = profile[size]
    sizeTotal = size*count
    total = total + sizeTotal
    for stream in (None, logFile):
        print(size, '\t', count, '\t', sizeTotal, file=stream)
for stream in (None, logFile):
    print('\tTotal\t', total, file=stream)
# Generate ER metrics only when the parameters contain a non-blank truthFileName
if 'truthFileName' in parms:
    truthFileName = parms['truthFileName'].strip()
    if truthFileName:
        DWM99_ERmetrics.generateMetrics(logFile, linkIndex, truthFileName)
# Final message goes to the console first (file=None means sys.stdout) and
# then to the log, after which the log file is closed
for stream in (None, logFile):
    print("End of Program", file=stream)
logFile.close()

Data Washing Machine Refactor Version 1.4
Enter Parameter File Name->S2-parms-nr.txt

>> Starting DWM20
Input Reference File Name = S2G.txt
Input File has Header Records = True
Input File Delimiter = ,
Tokenizer Function Type = Splitter
Remove Duplicate Reference Tokens = True
Tokenized Reference Output File Name = S2G-Tokenized.txt

Total References Read= 100
Total Tokens Found = 1247
Total Unique Tokens = 464

>>Starting DWM30
Total References Read from  S2G-Tokenized.txt = 100

>>Starting Iterations
mu start value= 0.5
mu iterate value= 0.1
epsilon start value= 3.5
epsilon iterate value= 0.0
comparator = Cosine

****New Iteration
Size of refList = 100 Size of linkIndex = 0

>>Starting DWM40
beta = 6
sigma = 12
Stop Words excluded= 391
Total Blocking Records Created 564

>>Starting DWM50
Total Blocks Processed = 214
Total Pairs in Compare Cache = 234

>>Starting DWM70
Total Pairs Linked = 54  at mu= 0.5

>>Starting DWM80
Total Closure Iterations = 2

>>Starting DWM90
Total Clusters P