Separated relation classifier into build and use scripts

jakelever · Mar 15, 2017 · c5fcb97 · c5fcb97
1 parent 73a87c5
commit c5fcb97
Show file tree

Hide file tree

Showing 3 changed files with 366 additions and 0 deletions.
diff --git a/core/BuildRelationModel.py b/core/BuildRelationModel.py
@@ -0,0 +1,131 @@
+import sys
+import fileinput
+import argparse
+import time
+import itertools
+import pickle
+import random
+import codecs
+from collections import defaultdict
+from sklearn import svm
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction import DictVectorizer
+from scipy.sparse import coo_matrix, hstack, vstack
+import numpy as np
+import json
+
+from ClassifierStuff import *
+from SentenceModel import *
+
+from CandidateBuilder import generateRelationCandidates,findTrigger
+
+def createRelationClassifier(sentenceAndEventData,targetRelations,targetArguments,parameters=None,generateClassifier=True,sentenceRange=0,doFiltering=False):
+	classes,examples,relTypes = generateRelationCandidates(sentenceAndEventData,targetRelations,targetArguments,sentenceRange,doFiltering)
+	assert min(classes) == 0, "Expecting negative cases in relation examples"
+	assert max(classes) > 0, "Expecting positive cases in relation examples"
+
+	vectors,vectorizer,featureSelector = buildVectorizer(classes,examples,parameters)
+
+	classifier = None
+	if generateClassifier:
+		classifier = buildClassifierFromVectors(classes,vectors,parameters)
+
+	data = (classes,examples,vectors,relTypes)
+	return data,vectorizer,featureSelector,classifier
+
+# It's the main bit. Yay!
+if __name__ == "__main__":
+	parser = argparse.ArgumentParser(description='VERSE Relation Extraction tool')
+
+	parser.add_argument('--trainingFile', required=True, type=str, help='Parsed-text file containing the training data')
+	parser.add_argument('--relationDescriptions', required=True, type=str, help='Description file containing list of relation types with arguments to predict')
+	parser.add_argument('--parameters', type=str, help='Parameters to use for feature construction, selection and classification')
+	parser.add_argument('--modelFile', type=str, help='Output filename for data with predicted modifications')
+	args = parser.parse_args()
+
+	parameters = {}
+	if args.parameters:
+		for arg in args.parameters.split(';'):
+			name,value = arg.strip().split(":")
+			parameters[name.strip()] = value.strip()
+
+	sentenceRange = 0
+	if "sentenceRange" in parameters:
+		sentenceRange = int(parameters["sentenceRange"])
+
+	trainFilename = args.trainingFile
+	with open(trainFilename, 'r') as f:
+		trainingSentenceAndEventData = pickle.load(f)
+	print "Loaded " + trainFilename
+
+	tmpTargetRelations = set()
+	for filename,data in trainingSentenceAndEventData.iteritems():
+		sentenceData = data[0]
+		relations = data[1]
+
+		for (relName,id1,id2) in relations:
+			sentenceid1,locs1 = findTrigger(sentenceData,id1)
+			sentenceid2,locs2 = findTrigger(sentenceData,id2)
+			type1 = sentenceData[sentenceid1].locsToTriggerTypes[tuple(locs1)]
+			type2 = sentenceData[sentenceid2].locsToTriggerTypes[tuple(locs2)]
+			tmpTargetRelations.add((relName,type1,type2))
+
+	print "#"*30
+	for relName,type1,type2 in tmpTargetRelations:
+		print "%s\t%s\t%s" % (relName,type1,type2)
+	print "#"*30
+
+	doFiltering = False
+	if 'doFiltering' in parameters and parameters['doFiltering'] == 'True':
+		doFiltering = True
+
+	#targetRelations = []
+	targetRelations,targetArguments = set(),set()
+	#typeLookup = {}
+	with open(args.relationDescriptions,'r') as f:
+		for line in f:
+			nameAndArgs,type1,type2 = line.strip().split('\t')
+
+			# Pull out the name of arguments and sort by the argument names
+			nameAndArgsSplit = nameAndArgs.split(';')
+
+			# Basically don't do anything if we aren't given the argument names
+			if len(nameAndArgsSplit) == 1:
+				targetRelations.add(tuple(nameAndArgsSplit))
+				targetArguments.add((type1,type2))
+			else: # Or do sort by argument names (if they are provided)
+				relName,argName1,argName2 = nameAndArgs.split(';')
+				relArgs = [(argName1,type1),(argName2,type2)]
+				relArgs = sorted(relArgs)
+
+				targetRelations.add((relName,relArgs[0][0],relArgs[1][0]))
+				targetArguments.add((relArgs[0][1],relArgs[1][1]))
+
+	targetRelations = list(targetRelations)
+	targetRelations = sorted(targetRelations)
+
+	targetRelationsToIDs = { arg:i+1 for i,arg in enumerate(targetRelations) }
+
+	print "-"*30
+	for targetRelation in targetRelations:
+		print targetRelation
+	print "-"*30
+	for targetArgument in targetArguments:
+		print targetArgument
+	print "-"*30
+
+	relData,argVec,argFS,argClf = createRelationClassifier(trainingSentenceAndEventData,targetRelationsToIDs,targetArguments,parameters,True,sentenceRange,doFiltering)
+
+	model = {}
+
+	model['parameters'] = parameters;
+	model['targetRelations'] = targetRelations;
+	model['targetRelationsToIDs'] = targetRelationsToIDs;
+	model['targetArguments'] = targetArguments;
+
+	model['argVec'] = argVec;
+	model['argFS'] = argFS;
+	model['argClf'] = argClf;
+
+	with open(args.modelFile,'w') as f:
+		pickle.dump(model,f)
diff --git a/core/CandidateBuilder.py b/core/CandidateBuilder.py
@@ -0,0 +1,111 @@
+import itertools
+from ClassifierStuff import Example
+
+def findEventTrigger(sentenceData,triggerid):
+	for sentenceid, sentence in enumerate(sentenceData):
+		if triggerid in sentence.predictedEntityLocs:
+			return sentenceid,sentence.predictedEntityLocs[triggerid]
+	raise RuntimeError('Unable to find location of event trigger ID ('+str(triggerid)+') in sentences')
+
+def findArgumentTrigger(sentenceData,triggerid):
+	for sentenceid, sentence in enumerate(sentenceData):
+		if triggerid in sentence.knownEntityLocs:
+			return sentenceid,sentence.knownEntityLocs[triggerid]
+	raise RuntimeError('Unable to find location of argument trigger ID ('+str(triggerid)+') in sentences')
+
+def findTrigger(sentenceData,triggerid):
+	for sentenceid, sentence in enumerate(sentenceData):
+		if triggerid in sentence.predictedEntityLocs:
+			return sentenceid,sentence.predictedEntityLocs[triggerid]
+		if triggerid in sentence.knownEntityLocs:
+			return sentenceid,sentence.knownEntityLocs[triggerid]
+	raise RuntimeError('Unable to find location of trigger ID ('+str(triggerid)+') in sentences')
+
+
+def generateRelationCandidates(sentenceAndEventData,targetRelations,targetArguments,sentenceRange,doFiltering):
+	examples = []
+	classes = []
+	relTypes = []
+
+	for filename in sentenceAndEventData:
+		#print filename
+		(sentenceData,relations,modifiers) = sentenceAndEventData[filename]
+
+		positiveRelations = {}
+		positiveRelationsProcessed = []
+		for (relName,id1,id2) in relations:
+			sentenceid1,locs1 = findTrigger(sentenceData,id1)
+			sentenceid2,locs2 = findTrigger(sentenceData,id2)
+
+			type1 = sentenceData[sentenceid1].locsToTriggerTypes[tuple(locs1)]
+			type2 = sentenceData[sentenceid2].locsToTriggerTypes[tuple(locs2)]
+			#if sentenceid1 != sentenceid2:
+			#	print "WARNING: Relation split across sentences (%s and %s)" % (id1,id2)
+			#	continue
+			#sentenceid = sentenceid1
+
+			#print "POSITIVE", relName, type1, type2
+
+			#key = (relName,type1,type2)
+			#key = relName
+
+			#print relName
+			if not relName in targetRelations:
+				continue
+
+			key = (sentenceid1,tuple(locs1),sentenceid2,tuple(locs2))
+			classid = targetRelations[relName]
+			positiveRelations[key] = classid
+			#positiveRelations[key] = True
+
+
+		# Now we go through all sentences and create examples for all possible token combinations
+		# Then check if any are already marked as positive and add to the appropriate list of examples
+		for sentenceid1 in range(len(sentenceData)):
+			for sentenceid2 in range(max(sentenceid1-sentenceRange,0),min(sentenceid1+sentenceRange+1,len(sentenceData))):
+				#print sentenceid1,sentenceid2
+				sentence1,sentence2 = sentenceData[sentenceid1],sentenceData[sentenceid2]
+
+				eventLocsAndTypes1 = [ (sentence1.predictedEntityLocs[id],sentence1.predictedEntityTypes[id]) for id in sentence1.predictedEntityTypes ]
+				argsLocsAndTypes1 = [ (sentence1.knownEntityLocs[id],sentence1.knownEntityTypes[id]) for id in sentence1.knownEntityTypes ]
+				possibleLocsAndTypes1 = eventLocsAndTypes1 + argsLocsAndTypes1
+
+				eventLocsAndTypes2 = [ (sentence2.predictedEntityLocs[id],sentence2.predictedEntityTypes[id]) for id in sentence2.predictedEntityTypes ]
+				argsLocsAndTypes2 = [ (sentence2.knownEntityLocs[id],sentence2.knownEntityTypes[id]) for id in sentence2.knownEntityTypes ]
+				possibleLocsAndTypes2 = eventLocsAndTypes2 + argsLocsAndTypes2
+
+				for (locs1,type1),(locs2,type2) in itertools.product(possibleLocsAndTypes1,possibleLocsAndTypes2):
+					if sentenceid1 == sentenceid2 and locs1 == locs2:
+						continue
+
+					key = (type1,type2)
+					if doFiltering and not key in targetArguments:
+						continue
+
+					#print "POTENTIAL", type1, type2
+
+					key = (sentenceid1,tuple(locs1),sentenceid2,tuple(locs2))
+					example = Example(filename, sentenceData, arg1_sentenceid=sentenceid1, arg1_locs=locs1, arg2_sentenceid=sentenceid2, arg2_locs=locs2)
+					examples.append(example)
+
+					thisClass = 0
+					if key in positiveRelations:
+						thisClass = positiveRelations[key]
+						#thisClass = 1
+						positiveRelationsProcessed.append(key)
+					classes.append(thisClass)
+					relTypes.append((type1,type2))
+
+		#print filename
+		for key in positiveRelations:
+			#assert key in allArgTriggerLocsProcessed, 'Unprocessed event trigger found: ' + str(key)
+			if not key in positiveRelationsProcessed:
+				print 'WARNING: Unprocessed argument trigger found: %s in file: %s' % (str(key), filename) 
+
+	#for c,e in zip(classes,examples):
+	#	print c,e
+
+	#sys.exit(0)
+
+	return classes, examples, relTypes
+
diff --git a/core/UseRelationModel.py b/core/UseRelationModel.py
@@ -0,0 +1,124 @@
+import sys
+import fileinput
+import argparse
+import time
+import itertools
+import pickle
+import random
+import codecs
+from collections import defaultdict
+from sklearn import svm
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction import DictVectorizer
+from scipy.sparse import coo_matrix, hstack, vstack
+import numpy as np
+import json
+
+from ClassifierStuff import *
+from SentenceModel import *
+
+from CandidateBuilder import generateRelationCandidates
+
+# It's the main bit. Yay!
+if __name__ == "__main__":
+	parser = argparse.ArgumentParser(description='VERSE Relation Extraction tool')
+
+	parser.add_argument('--modelFile', required=True, type=str, help='')
+	parser.add_argument('--testingFile', required=True, type=str, help='Parsed-text file containing the test data to predict modifications for')
+	parser.add_argument('--outFile', type=str, help='Output filename for data with predicted modifications')
+	args = parser.parse_args()
+
+	with open(args.modelFile) as f:
+		model = pickle.load(f)
+
+	parameters = model['parameters'];
+	targetRelations = model['targetRelations'];
+	targetRelationsToIDs = model['targetRelationsToIDs'];
+	targetArguments = model['targetArguments'];
+
+	argVec = model['argVec'];
+	argFS = model['argFS'];
+	argClf = model['argClf'];
+
+	sentenceRange = 0
+	if "sentenceRange" in parameters:
+		sentenceRange = int(parameters["sentenceRange"])
+
+	doFiltering = False
+	if 'doFiltering' in parameters and parameters['doFiltering'] == 'True':
+		doFiltering = True
+
+	with open(args.testingFile, 'r') as f:
+		testingSentenceAndEventData = pickle.load(f)
+	print "Loaded " + args.testingFile
+
+	# Empty the test data of any existing predictions (in case we load the wrong test file)
+	for filename in testingSentenceAndEventData:
+		(sentenceData,relations,modifiers) = testingSentenceAndEventData[filename]
+		# Empty relations
+		relations = []
+
+		testingSentenceAndEventData[filename] = (sentenceData,relations,modifiers)
+
+	print "generate Argument Examples..."
+	_,aExamples,aTypes = generateRelationCandidates(testingSentenceAndEventData,targetRelationsToIDs,targetArguments,sentenceRange,doFiltering)
+
+	print "vectorize, trim and predict..."
+
+	aVectors = argVec.vectorize(aExamples)
+	if not argFS is None:
+		aVectors = argFS.transform(aVectors)
+	aVectors = coo_matrix(aVectors)
+
+	aPredictions = argClf.predict(aVectors)
+	aProbs = argClf.predict_proba(aVectors)
+	probColumns = { c:i for i,c in enumerate(argClf.classes_) }
+
+	#predictedEventID = 1
+	predictedTriggerID = 1000
+
+	predictedEventIDPerFile = Counter()
+
+	for i,(p,example) in enumerate(zip(aPredictions,aExamples)):
+		if p != 0:
+			relType = targetRelations[p-1]
+
+			#eventType = thisRelation[1]
+			#argTypes = thisRelation[2:]
+			#assert len(argTypes) == 2, "Only processing binary relations for triggerless events"
+
+			#eventType = thisRelation[0]
+
+			sentenceFilename = example.filename
+			sentenceID1,arg1Locs = example.arguments[0]
+			sentenceID2,arg2Locs = example.arguments[1]
+
+
+			sentence1 = testingSentenceAndEventData[sentenceFilename][0][sentenceID1]
+			sentence2 = testingSentenceAndEventData[sentenceFilename][0][sentenceID2]
+
+			sentence1.invertTriggers()
+			sentence2.invertTriggers()
+
+			arg1ID = sentence1.locsToTriggerIDs[tuple(arg1Locs)]
+			arg2ID = sentence2.locsToTriggerIDs[tuple(arg2Locs)]
+
+			type1ID = sentence1.locsToTriggerTypes[tuple(arg1Locs)]
+			type2ID = sentence2.locsToTriggerTypes[tuple(arg2Locs)]
+
+			#relType = typeLookup[type1ID]
+
+			relations = testingSentenceAndEventData[sentenceFilename][1]
+
+			prob = aProbs[i,probColumns[p]]
+
+			newR = (relType,arg1ID,arg2ID,prob)
+			#print "ADDING", newR
+			relations.append(newR)
+			#print "TEST",sentenceFilename,sentenceID1,sentenceID2,arg1Locs,arg2Locs,relType
+
+
+	with open(args.outFile, 'w') as f:
+		pickle.dump(testingSentenceAndEventData,f)
+
+	print "Complete."