Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Separated relation classifier into build and use scripts
- Loading branch information
Showing
3 changed files
with
366 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
import sys | ||
import fileinput | ||
import argparse | ||
import time | ||
import itertools | ||
import pickle | ||
import random | ||
import codecs | ||
from collections import defaultdict | ||
from sklearn import svm | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.feature_extraction import DictVectorizer | ||
from scipy.sparse import coo_matrix, hstack, vstack | ||
import numpy as np | ||
import json | ||
|
||
from ClassifierStuff import * | ||
from SentenceModel import * | ||
|
||
from CandidateBuilder import generateRelationCandidates,findTrigger | ||
|
||
def createRelationClassifier(sentenceAndEventData,targetRelations,targetArguments,parameters=None,generateClassifier=True,sentenceRange=0,doFiltering=False):
    """Generate relation candidates, vectorize them and optionally train a classifier.

    Args:
        sentenceAndEventData: Corpus data, passed unchanged to
            generateRelationCandidates.
        targetRelations: Relation lookup, passed unchanged to
            generateRelationCandidates.
        targetArguments: Allowed argument type pairs, passed unchanged to
            generateRelationCandidates.
        parameters: Settings handed to buildVectorizer and
            buildClassifierFromVectors.
        generateClassifier: When false, no classifier is trained and None is
            returned in its place.
        sentenceRange: Passed unchanged to generateRelationCandidates.
        doFiltering: Passed unchanged to generateRelationCandidates.

    Returns:
        A tuple (data, vectorizer, featureSelector, classifier) where data is
        (classes, examples, vectors, relTypes).

    Raises:
        AssertionError: If the generated candidates contain no negative
            (class 0) example or no positive (class > 0) example.
    """
    classes,examples,relTypes = generateRelationCandidates(sentenceAndEventData,targetRelations,targetArguments,sentenceRange,doFiltering)

    # Raised explicitly (rather than via `assert`) so the checks still run
    # under `python -O`, and so an empty candidate list gives a clear message
    # instead of a ValueError from min()/max().
    if len(classes) == 0 or min(classes) != 0:
        raise AssertionError("Expecting negative cases in relation examples")
    if not max(classes) > 0:
        raise AssertionError("Expecting positive cases in relation examples")

    vectors,vectorizer,featureSelector = buildVectorizer(classes,examples,parameters)

    classifier = None
    if generateClassifier:
        classifier = buildClassifierFromVectors(classes,vectors,parameters)

    data = (classes,examples,vectors,relTypes)
    return data,vectorizer,featureSelector,classifier
|
||
# It's the main bit. Yay!
# Builds a relation classifier from pickled training data and writes the
# trained model (vectorizer, feature selector, classifier and settings) to disk.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='VERSE Relation Extraction tool')

    parser.add_argument('--trainingFile', required=True, type=str, help='Parsed-text file containing the training data')
    parser.add_argument('--relationDescriptions', required=True, type=str, help='Description file containing list of relation types with arguments to predict')
    parser.add_argument('--parameters', type=str, help='Parameters to use for feature construction, selection and classification')
    # The model is always written at the end of the run, so require the
    # filename up front instead of failing after training has completed.
    parser.add_argument('--modelFile', required=True, type=str, help='Output filename for the trained relation classifier model')
    args = parser.parse_args()

    # Parameters arrive as a single string of the form "name1:value1;name2:value2"
    parameters = {}
    if args.parameters:
        for arg in args.parameters.split(';'):
            arg = arg.strip()
            if not arg:
                # Tolerate a trailing or doubled semicolon
                continue
            # Split on the first colon only so values may themselves contain ':'
            name,value = arg.split(":",1)
            parameters[name.strip()] = value.strip()

    sentenceRange = int(parameters.get("sentenceRange",0))

    trainFilename = args.trainingFile
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    # Pickle data is bytes, so the file is opened in binary mode.
    with open(trainFilename, 'rb') as f:
        trainingSentenceAndEventData = pickle.load(f)
    print("Loaded " + trainFilename)

    # Report every (relation, type1, type2) combination seen in the training
    # annotations. This is informational output only.
    tmpTargetRelations = set()
    for data in trainingSentenceAndEventData.values():
        sentenceData = data[0]
        relations = data[1]

        for (relName,id1,id2) in relations:
            sentenceid1,locs1 = findTrigger(sentenceData,id1)
            sentenceid2,locs2 = findTrigger(sentenceData,id2)
            type1 = sentenceData[sentenceid1].locsToTriggerTypes[tuple(locs1)]
            type2 = sentenceData[sentenceid2].locsToTriggerTypes[tuple(locs2)]
            tmpTargetRelations.add((relName,type1,type2))

    print("#"*30)
    for relName,type1,type2 in tmpTargetRelations:
        print("%s\t%s\t%s" % (relName,type1,type2))
    print("#"*30)

    # Only the exact string 'True' switches filtering on
    doFiltering = parameters.get('doFiltering') == 'True'

    # Each description line is: name[;argName1;argName2] <TAB> type1 <TAB> type2
    targetRelations,targetArguments = set(),set()
    with open(args.relationDescriptions,'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines rather than failing on the unpack below
                continue
            nameAndArgs,type1,type2 = line.split('\t')

            # Pull out the name of arguments and sort by the argument names
            nameAndArgsSplit = nameAndArgs.split(';')

            # Basically don't do anything if we aren't given the argument names
            if len(nameAndArgsSplit) == 1:
                targetRelations.add(tuple(nameAndArgsSplit))
                targetArguments.add((type1,type2))
            else: # Or do sort by argument names (if they are provided)
                relName,argName1,argName2 = nameAndArgsSplit
                relArgs = sorted([(argName1,type1),(argName2,type2)])

                targetRelations.add((relName,relArgs[0][0],relArgs[1][0]))
                targetArguments.add((relArgs[0][1],relArgs[1][1]))

    # Sort so that the class IDs assigned below are deterministic
    targetRelations = sorted(targetRelations)

    # Class 0 is reserved for negative examples, so relation IDs start at 1
    targetRelationsToIDs = { arg:i+1 for i,arg in enumerate(targetRelations) }

    print("-"*30)
    for targetRelation in targetRelations:
        print(targetRelation)
    print("-"*30)
    for targetArgument in targetArguments:
        print(targetArgument)
    print("-"*30)

    # The training data tuple (first return value) is not stored in the model
    _,argVec,argFS,argClf = createRelationClassifier(trainingSentenceAndEventData,targetRelationsToIDs,targetArguments,parameters,True,sentenceRange,doFiltering)

    model = {
        'parameters': parameters,
        'targetRelations': targetRelations,
        'targetRelationsToIDs': targetRelationsToIDs,
        'targetArguments': targetArguments,
        'argVec': argVec,
        'argFS': argFS,
        'argClf': argClf,
    }

    with open(args.modelFile,'wb') as f:
        pickle.dump(model,f)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
import itertools | ||
from ClassifierStuff import Example | ||
|
||
def findEventTrigger(sentenceData,triggerid):
    """Locate a trigger ID among the predicted entities of the sentences.

    Returns a (sentence index, locations) pair for the first sentence whose
    predictedEntityLocs contains triggerid.

    Raises:
        RuntimeError: If no sentence holds the ID.
    """
    for position, candidate in enumerate(sentenceData):
        locsById = candidate.predictedEntityLocs
        if triggerid not in locsById:
            continue
        return position, locsById[triggerid]

    message = 'Unable to find location of event trigger ID ('+str(triggerid)+') in sentences'
    raise RuntimeError(message)
|
||
def findArgumentTrigger(sentenceData,triggerid):
    """Locate a trigger ID among the known entities of the sentences.

    Returns a (sentence index, locations) pair for the first sentence whose
    knownEntityLocs contains triggerid.

    Raises:
        RuntimeError: If no sentence holds the ID.
    """
    for position, candidate in enumerate(sentenceData):
        locsById = candidate.knownEntityLocs
        if triggerid not in locsById:
            continue
        return position, locsById[triggerid]

    message = 'Unable to find location of argument trigger ID ('+str(triggerid)+') in sentences'
    raise RuntimeError(message)
|
||
def findTrigger(sentenceData,triggerid):
    """Locate a trigger ID among either predicted or known entities.

    Sentences are scanned in order; within each sentence predictedEntityLocs
    is consulted before knownEntityLocs. Returns a (sentence index, locations)
    pair for the first match.

    Raises:
        RuntimeError: If no sentence holds the ID in either lookup.
    """
    lookupOrder = ('predictedEntityLocs','knownEntityLocs')
    for position, candidate in enumerate(sentenceData):
        for attributeName in lookupOrder:
            # Fetched lazily so the second lookup is only touched when the
            # first one misses
            locsById = getattr(candidate,attributeName)
            if triggerid in locsById:
                return position, locsById[triggerid]

    message = 'Unable to find location of trigger ID ('+str(triggerid)+') in sentences'
    raise RuntimeError(message)
|
||
|
||
def generateRelationCandidates(sentenceAndEventData,targetRelations,targetArguments,sentenceRange,doFiltering):
    """Enumerate candidate entity pairs and label each with its relation class.

    For every file, every ordered pair of entities (predicted and known) lying
    in sentences at most sentenceRange apart becomes one candidate. A candidate
    gets the class ID of the annotated relation between those two entities, or
    0 when no targeted relation is annotated.

    Args:
        sentenceAndEventData: Dict mapping filename to a
            (sentenceData, relations, modifiers) tuple. relations is an
            iterable of (relName, id1, id2).
        targetRelations: Mapping from relation name to class ID. Relations
            whose name is not in this mapping are ignored.
        targetArguments: Collection of (type1, type2) pairs. Only consulted
            when doFiltering is true.
        sentenceRange: Maximum distance, in sentences, between the two
            entities of a candidate. 0 restricts pairs to a single sentence.
        doFiltering: When true, skip pairs whose (type1, type2) is not in
            targetArguments.

    Returns:
        A tuple (classes, examples, relTypes) of three parallel lists: the
        class ID, the Example object and the (type1, type2) pair of each
        candidate.

    Raises:
        RuntimeError: From findTrigger, when an annotated relation refers to
            an entity ID that is in no sentence.
    """
    examples = []
    classes = []
    relTypes = []

    for filename in sentenceAndEventData:
        (sentenceData,relations,_modifiers) = sentenceAndEventData[filename]

        # Map the location of each targeted, annotated relation to its class ID
        positiveRelations = {}
        for (relName,id1,id2) in relations:
            sentenceid1,locs1 = findTrigger(sentenceData,id1)
            sentenceid2,locs2 = findTrigger(sentenceData,id2)

            if relName not in targetRelations:
                continue

            key = (sentenceid1,tuple(locs1),sentenceid2,tuple(locs2))
            positiveRelations[key] = targetRelations[relName]

        # Gather the (locs, type) of every entity once per sentence instead of
        # rebuilding the same lists for each sentence pairing. Predicted
        # entities are listed before known ones.
        locsAndTypesPerSentence = []
        for sentence in sentenceData:
            eventLocsAndTypes = [ (sentence.predictedEntityLocs[entityid],sentence.predictedEntityTypes[entityid]) for entityid in sentence.predictedEntityTypes ]
            argsLocsAndTypes = [ (sentence.knownEntityLocs[entityid],sentence.knownEntityTypes[entityid]) for entityid in sentence.knownEntityTypes ]
            locsAndTypesPerSentence.append(eventLocsAndTypes + argsLocsAndTypes)

        # A set gives constant-time membership checks in the warning pass below
        positiveRelationsProcessed = set()

        # Now we go through all sentences and create examples for all possible token combinations
        # Then check if any are already marked as positive and add to the appropriate list of examples
        sentenceCount = len(sentenceData)
        for sentenceid1 in range(sentenceCount):
            windowStart = max(sentenceid1-sentenceRange,0)
            windowEnd = min(sentenceid1+sentenceRange+1,sentenceCount)
            for sentenceid2 in range(windowStart,windowEnd):
                possibleLocsAndTypes1 = locsAndTypesPerSentence[sentenceid1]
                possibleLocsAndTypes2 = locsAndTypesPerSentence[sentenceid2]

                for (locs1,type1),(locs2,type2) in itertools.product(possibleLocsAndTypes1,possibleLocsAndTypes2):
                    # An entity cannot be in a relation with itself
                    if sentenceid1 == sentenceid2 and locs1 == locs2:
                        continue

                    if doFiltering and (type1,type2) not in targetArguments:
                        continue

                    key = (sentenceid1,tuple(locs1),sentenceid2,tuple(locs2))
                    example = Example(filename, sentenceData, arg1_sentenceid=sentenceid1, arg1_locs=locs1, arg2_sentenceid=sentenceid2, arg2_locs=locs2)
                    examples.append(example)

                    # Class 0 marks a negative example
                    thisClass = 0
                    if key in positiveRelations:
                        thisClass = positiveRelations[key]
                        positiveRelationsProcessed.add(key)
                    classes.append(thisClass)
                    relTypes.append((type1,type2))

        # Annotated relations that never became a candidate (too far apart or
        # filtered out) are reported
        for key in positiveRelations:
            if key not in positiveRelationsProcessed:
                print('WARNING: Unprocessed argument trigger found: %s in file: %s' % (str(key), filename))

    return classes, examples, relTypes
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import sys | ||
import fileinput | ||
import argparse | ||
import time | ||
import itertools | ||
import pickle | ||
import random | ||
import codecs | ||
from collections import defaultdict | ||
from sklearn import svm | ||
from sklearn.feature_extraction.text import CountVectorizer | ||
from sklearn.feature_extraction import DictVectorizer | ||
from scipy.sparse import coo_matrix, hstack, vstack | ||
import numpy as np | ||
import json | ||
|
||
from ClassifierStuff import * | ||
from SentenceModel import * | ||
|
||
from CandidateBuilder import generateRelationCandidates | ||
|
||
# It's the main bit. Yay!
# Loads a previously built relation model, predicts relations for the test
# data and writes the test data, with predicted relations added, back to disk.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='VERSE Relation Extraction tool')

    parser.add_argument('--modelFile', required=True, type=str, help='Pickled model file containing the trained relation classifier')
    parser.add_argument('--testingFile', required=True, type=str, help='Parsed-text file containing the test data to predict modifications for')
    # The output is always written at the end of the run, so require the
    # filename up front instead of failing after prediction has completed.
    parser.add_argument('--outFile', required=True, type=str, help='Output filename for data with predicted modifications')
    args = parser.parse_args()

    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    # Pickle data is bytes, so files are opened in binary mode.
    with open(args.modelFile,'rb') as f:
        model = pickle.load(f)

    parameters = model['parameters']
    targetRelations = model['targetRelations']
    targetRelationsToIDs = model['targetRelationsToIDs']
    targetArguments = model['targetArguments']

    argVec = model['argVec']
    argFS = model['argFS']
    argClf = model['argClf']

    sentenceRange = int(parameters.get("sentenceRange",0))

    # Only the exact string 'True' switches filtering on
    doFiltering = parameters.get('doFiltering') == 'True'

    with open(args.testingFile, 'rb') as f:
        testingSentenceAndEventData = pickle.load(f)
    print("Loaded " + args.testingFile)

    # Empty the test data of any existing predictions (in case we load the wrong test file)
    for filename in testingSentenceAndEventData:
        (sentenceData,_,modifiers) = testingSentenceAndEventData[filename]
        testingSentenceAndEventData[filename] = (sentenceData,[],modifiers)

    print("generate Argument Examples...")
    # The returned classes are ignored: relations were emptied above, so no
    # candidate can be labelled positive
    _,aExamples,aTypes = generateRelationCandidates(testingSentenceAndEventData,targetRelationsToIDs,targetArguments,sentenceRange,doFiltering)

    print("vectorize, trim and predict...")

    aVectors = argVec.vectorize(aExamples)
    if argFS is not None:
        aVectors = argFS.transform(aVectors)
    aVectors = coo_matrix(aVectors)

    aPredictions = argClf.predict(aVectors)
    aProbs = argClf.predict_proba(aVectors)
    # Map each class label to its column in the probability matrix
    probColumns = { c:i for i,c in enumerate(argClf.classes_) }

    for i,(p,example) in enumerate(zip(aPredictions,aExamples)):
        # Class 0 means no relation was predicted for this candidate
        if p == 0:
            continue

        # Class IDs start at 1, so shift down to index the relation list
        relType = targetRelations[p-1]

        sentenceFilename = example.filename
        sentenceID1,arg1Locs = example.arguments[0]
        sentenceID2,arg2Locs = example.arguments[1]

        sentence1 = testingSentenceAndEventData[sentenceFilename][0][sentenceID1]
        sentence2 = testingSentenceAndEventData[sentenceFilename][0][sentenceID2]

        # presumably (re)builds the locsToTriggerIDs lookup used below - TODO confirm in SentenceModel
        sentence1.invertTriggers()
        sentence2.invertTriggers()

        arg1ID = sentence1.locsToTriggerIDs[tuple(arg1Locs)]
        arg2ID = sentence2.locsToTriggerIDs[tuple(arg2Locs)]

        relations = testingSentenceAndEventData[sentenceFilename][1]

        # Probability the classifier assigned to the class it predicted
        prob = aProbs[i,probColumns[p]]

        relations.append((relType,arg1ID,arg2ID,prob))

    with open(args.outFile, 'wb') as f:
        pickle.dump(testingSentenceAndEventData,f)

    print("Complete.")