In [287]:
import networkx as nx
import os
import csv
from pycorenlp import StanfordCoreNLP
import matplotlib.pyplot as plt
import pylab
import subprocess
import re
import operator

# basedir = os.path.abspath(os.path.dirname(__file__))
# Client for a CoreNLP server expected at localhost:9000
# (the start-up commands are listed in the comment block below).
nlp = StanfordCoreNLP('http://localhost:9000')
# Annotator pipeline sent with every nlp.annotate() request.
annotators = 'tokenize,pos,lemma,ner,parse,natlog'
# CSV of labelled sentences read by the evaluation cell; its first line is skipped as a header.
input_file = "training_sentences.csv"

###################################
# Start CoreNLP server
################################### 
# cd /users/hundman/src/stanford-corenlp-full-2015-12-09
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer 9000

In [288]:
# Sentence numbers whose CoreNLP parse is known to be unusable (see the notes cell below).
# NOTE(review): not referenced by any code visible in this notebook, and the notes
# also mention 21-23 and 27, which are not listed here -- confirm which is current.
incorrect_parse_trees = [14,15,16,17,18,19,20,43,50,51,60,61,62,63,64]

In [289]:
#NOTES
##############################################################
# INCORRECT parse trees (CoreNLP): (14,15,16,17)(18,19)(20,21,22,23)(27)
# 14-17 no dependency tree built (just big circle)
# 18-19 - no dependency tree built (just big circle)
# 20 - incorrect parse (HyMap sentence)  (20-23 are same sentence)
# 27,67,68,69 - image(27)/channels(67-69) is technically correct parse, but we really want to identify resolution
# 43 - blooming onion (brackets)
# 50,51 - blooming onion (brackets)
# 60-64 - blooming onion - unit is connected to another word with "/" -> (PRISM: 2.5 m/AVNIR: 10 m)

# FIXABLE
# 28 - just wrong, need to fix
# 21-22 - compound where unit isn't connected directly (10, 20 & 60 m)
# 31 - gotta find number
# 52 - parses fine, just need to handle (1km-resolution)
# 57 - parses fine, just need to handle (1-km-resolution)
# 58 - this one is hard (not explicitly stated, but inferred from prior mention in parentheses
#      -> ETM+pan-chromatic channel (15 m resolution) and ALOS PRISM data (2.5 m))
# 72 - need to handle 10 to 90m

In [316]:
class Annotations:
    """CoreNLP output for one sentence: its token list and its dependency list.

    The methods below read the keys "index", "word" and "pos" from each token
    and look for "governorGloss" / "dependentGloss" on each dependency.
    """

    def __init__(self, tokens, dependencies):
        self.tokens = tokens
        self.deps = dependencies
        self.pos_lookup = {}

    def build_pos_lookup(self):
        """Fill `self.pos_lookup` with the word and part-of-speech tag of every token.

        Used to look up the POS when iterating through dependencies.
        { <index> : {
                "word" : "",
                "pos" : ""
            }
        }
        """
        # 0 is not included in tokens, but is included in dependencies
        self.pos_lookup[0] = {"pos": "", "word": ""}
        for tok in self.tokens:
            self.pos_lookup[tok["index"]] = {"word": tok["word"], "pos": tok["pos"]}

    def find_unit_and_format(self, G, sentence):
        """Locate the node holding the measurement unit and classify how it is written.

        Args:
            G: graph whose `nodes(data=True)` yields `(idx, {"word": ..., ...})` pairs.
            sentence: object providing `unit`, `num` and `measure_tokens`.

        Returns:
            `(format, node_idx)` where format is "space_between" (`20 m`),
            "hyphenated" (`10-m`) or "attached" (`90m`); `(None, None)` when no
            node matches.
        """
        # Exact membership on the string form of each index. The previous
        # substring test against the list repr let idx "1" match [12, 13].
        # Comparing string forms works whether node ids are ints or strings.
        measure_ids = set(str(tok_idx) for tok_idx in sentence.measure_tokens)

        # The "." is unescaped, so any single character between digit groups is
        # consumed together with the digits ("2.5", "1,000").
        number_pattern = r"\d+(.\d+)*"

        for idx, attrs in G.nodes(data=True):
            word = attrs["word"]

            # 20 m -> unit matches word and word index is one of the measurement tokens
            if word == sentence.unit and str(idx) in measure_ids:
                return ("space_between", idx)

            # 10-m -> number and unit are attached
            # NOTE(review): this compares the *word* with the repr of the index
            # list, so it can practically never be true and "hyphenated" is not
            # reported. Left as-is on purpose: switching it to an index check
            # would start routing sentences into hyphenated_patterns().
            if word == sentence.num + "-" + sentence.unit and word in str(sentence.measure_tokens):
                return ("hyphenated", idx)

            # 90m -> strip the number from the token; what is left must be the unit
            if re.search(number_pattern, word) is not None:
                if sentence.unit == re.sub(number_pattern, "", word):
                    return ("attached", idx)

        return (None, None)

    def check_output(self, sentence, stats):
        """Return True when every dependency carries both gloss fields.

        On the first dependency lacking "dependentGloss" or "governorGloss" the
        sentence is reported through `stats.parse_error` and False is returned.
        """
        for dep in self.deps:
            if "dependentGloss" not in dep or "governorGloss" not in dep:
                stats.parse_error(sentence)
                return False
        return True
                
      
    
class Token:
    """One token of a sentence: its index, surface word, POS tag and span."""

    def __init__(self, index, word, pos, start, end):
        self.idx, self.word, self.pos = index, word, pos
        # begin / end offsets of the token as supplied by the caller
        self.start, self.end = start, end
        
        
class Dependency:
    """A dependency relation between a governor token and a dependent token."""

    def __init__(self, governor, governorGloss, dependent, dependentGloss, dep_type):
        # indices of the two ends of the relation
        self.gov_idx, self.dep_idx = governor, dependent
        # words ("gloss") at the two ends of the relation
        self.gov, self.dep = governorGloss, dependentGloss
        # relation label
        self.dep_type = dep_type
        
        
class Sentence:
    """A labelled input sentence together with its expected extraction values."""

    def __init__(self, text, measurement_and_type, measurement, entity, unit, number, measurement_type, line_num):
        self.text = text
        self.extraction_goal = measurement_and_type
        self.measurement = measurement
        self.entity = entity
        self.unit = unit
        self.num = number
        self.measure_type = measurement_type
        self.line_num = line_num
        # indices of the tokens that fall inside the measurement phrase
        self.measure_tokens = []

    def check_for_measurement(self, token):
        """Record `token.idx` when the token lies inside the measurement phrase.

        The phrase is located as the first occurrence of `self.measurement` in
        `self.text`. A token qualifies when it starts at or after the phrase
        start and ends no more than one character past the phrase end.

        If the measurement text does not occur verbatim in the sentence,
        nothing is recorded. Previously this raised ValueError and aborted the
        whole run.
        """
        measure_start = self.text.find(self.measurement)
        if measure_start < 0:
            return
        measure_end = measure_start + len(self.measurement)
        # -1 for ending period
        if token.start >= measure_start and token.end - 1 <= measure_end:
            self.measure_tokens.append(token.idx)
    
#     def cleanup(self):
        

class Node:
    """Plain record for a graph node: its id, its word and its POS tag."""

    def __init__(self, idx, word, pos):
        self.idx, self.word, self.pos = idx, word, pos
        
class Stats:
    """Running tallies for an evaluation run.

    pattern_cnts[kind][pattern] -> how often a pattern fired
    errors[kind][pattern]       -> line numbers of sentences the pattern got wrong
    errors_cnts[kind][pattern]  -> number of such sentences
    errors["parse"] / errors_cnts["parse"] -> sentences reported via parse_error()
    """

    def __init__(self):
        self.pattern_cnts = {"type": {}, "number": {}}
        self.errors = {"parse": [], "type": {}, "number": {}}
        self.errors_cnts = {"parse": 0, "type": {}, "number": {}}

        self.total_sentences = 0
        self.correct_type_cnt = 0
        self.correct_num_cnt = 0
        self.types_found = []

    def parse_error(self, sentence):
        """Record `sentence.line_num` as a parse failure."""
        self.errors["parse"].append(sentence.line_num)
        self.errors_cnts["parse"] += 1

    def evaluate(self, sentence, pattern="Unknown", prediction="None", prediction_type=None):
        """Count one firing of `pattern` and record an error if the prediction is wrong.

        Args:
            sentence: object providing `measure_type`, `num` and `line_num`.
            pattern: identifier of the pattern that produced the prediction.
            prediction: predicted value; compared with `sentence.measure_type`
                when `prediction_type == "type"`, otherwise with `sentence.num`.
            prediction_type: tally bucket, normally "type" or "number". Any
                other value (including the default None) gets its own bucket
                instead of raising KeyError.
        """
        # Single-argument print(...) emits the same text under Python 2 and 3.
        print(str(prediction_type) + " pattern: " + pattern)

        fired = self.pattern_cnts.setdefault(prediction_type, {})
        fired[pattern] = fired.get(pattern, 0) + 1

        expected = sentence.measure_type if prediction_type == "type" else sentence.num
        if prediction == expected:
            return

        failed_lines = self.errors.setdefault(prediction_type, {})
        failed_cnts = self.errors_cnts.setdefault(prediction_type, {})
        if pattern in failed_lines and pattern in failed_cnts:
            failed_lines[pattern].append(sentence.line_num)
            failed_cnts[pattern] += 1
        else:
            # first failure for this pattern (or the two tallies got out of step)
            failed_lines[pattern] = [sentence.line_num]
            failed_cnts[pattern] = 1
    
                
        

In [323]:
def remove_period(dep, last_token_idx):
    """Strip "." characters from the word at whichever end(s) of `dep` sit on `last_token_idx`.

    `dep` is modified in place and also returned.
    """
    for idx_attr, word_attr in (("gov_idx", "gov"), ("dep_idx", "dep")):
        if getattr(dep, idx_attr) == last_token_idx:
            setattr(dep, word_attr, getattr(dep, word_attr).replace(".", ""))
    return dep


def get_connected_word(edge, unit_idx, annotated, sentence):
    """Return the endpoint of `edge` opposite `unit_idx`, unless its word is the sentence's number.

    Returns None when the edge does not touch `unit_idx` or the opposite
    endpoint's word equals `sentence.num`.
    """
    for here, there in ((edge[0], edge[1]), (edge[1], edge[0])):
        if here == unit_idx and annotated.pos_lookup[int(there)]["word"] != sentence.num:
            return there
    return None
    
def get_connected_num(edge, unit_idx, annotated, sentence):
    """Return the endpoint of `edge` opposite `unit_idx` when its word is the sentence's number.

    Returns None when the edge does not touch `unit_idx` or the opposite
    endpoint's word differs from `sentence.num`.
    """
    for here, there in ((edge[0], edge[1]), (edge[1], edge[0])):
        if here == unit_idx and annotated.pos_lookup[int(there)]["word"] == sentence.num:
            return there
    return None

        
def get_number(G, sentence, annotated):
    """Predict the measurement's number from the graph around the unit node.

    Reads the module-level `unit_idx` and reports each prediction through the
    module-level `stats`. Returns the predicted word, or None when neither
    pass finds one.
    """
    # Pass 1 (pattern "1.1"): a "nummod" edge linking the unit to the number.
    for direct_edge in G.edges(data=True):
        num_idx = get_connected_num(direct_edge, unit_idx, annotated, sentence)
        if direct_edge[2]['dep'] == "nummod" and num_idx is not None:
            p_number = G.node[num_idx]['word']
            stats.evaluate(sentence, "1.1", p_number, "number")
            return p_number

    # Pass 2 (pattern "1.2"): any CD-tagged endpoint on an edge of the node
    # returned by get_connected_num().
    for outer_edge in G.edges(data=True):
        num_idx = get_connected_num(outer_edge, unit_idx, annotated, sentence)
        if not num_idx:
            continue
        for near_edge in G.edges([num_idx]):
            for endpoint in near_edge:
                if annotated.pos_lookup[int(endpoint)]['pos'] == "CD":
                    p_number = G.node[endpoint]['word']
                    stats.evaluate(sentence, "1.2", p_number, "number")
                    return p_number

    return None
        
        
        
def build_graph(annotated, show=False):
    """Build an undirected graph from the sentence's dependency list.

    Each dependency contributes two nodes keyed by the *string* form of the
    token index, carrying `word` and `pos` attributes, and one edge carrying
    the relation label as `dep`.

    Args:
        annotated: object providing `deps`, `tokens` and a populated `pos_lookup`.
        show: accepted for compatibility; drawing is currently disabled, so the
            value is ignored.

    Returns:
        The networkx Graph.
    """
    G = nx.Graph()
    # remove_period() strips "." from words on the token whose index equals the token count
    last_token_idx = len(annotated.tokens)

    for raw in annotated.deps:
        dep = Dependency(raw["governor"], raw["governorGloss"], raw["dependent"], raw["dependentGloss"], raw["dep"])
        dep = remove_period(dep, last_token_idx)

        # nodes, edges
        G.add_node(str(dep.gov_idx), word=dep.gov, pos=annotated.pos_lookup[dep.gov_idx]["pos"])
        G.add_node(str(dep.dep_idx), word=dep.dep, pos=annotated.pos_lookup[dep.dep_idx]["pos"])
        G.add_edge(str(dep.dep_idx), str(dep.gov_idx), dep=dep.dep_type)

    # The spring layout and the node/edge label dicts that used to be computed
    # here were only consumed by the disabled drawing code below, so they are
    # no longer built for every sentence. To draw again, rebuild them first:
    #     pos = nx.spring_layout(G)
    #     nx.draw_networkx(G, pos, labels=node_labels)
    #     nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    #     pylab.show()

    return G
    
    

def space_between_patterns(G, sentence, annotated):
    """Typical patterns when unit and number are separated by a single space (10 m).

    Walks the edges touching the module-level `unit_idx`; the first edge whose
    relation matches a known pattern yields the predicted measurement type,
    which is reported through the module-level `stats` and returned. Returns
    None when no edge matches.
    """
    # Relations whose connected word is taken directly as the type -> pattern id.
    #   nsubj                     : sentence 79
    #   appos / nmod:of / nmod:with : might need to check for noun NN, or NNS
    #   nmod:about                : eventually an indication of an inexact value
    #   nmod:as                   : inspired by sentence on line 31
    direct_patterns = {
        "nsubj": "1.1",
        "appos": "1.2",
        "nmod:of": "1.2",
        "nmod:with": "1.2",
        "nmod:about": "1.3",
        "nmod:as": "1.7",
    }

    for edge in G.edges(data=True):
        neighbour = get_connected_word(edge, unit_idx, annotated, sentence)
        if not neighbour:
            continue

        relation = edge[2]['dep']

        if relation == "nmod:npmod":
            # inspired by sentence on line 26
            p_type = evaluate_target_dep(neighbour, G.edges(data=True), annotated)
            stats.evaluate(sentence, "1.6", p_type, "type")
            return p_type

        if relation == "compound":
            # inspired by sentence on line 9: only noun neighbours count
            if "NN" not in G.node[neighbour]["pos"]:
                continue
            pattern = "1.4"
        elif relation in direct_patterns:
            pattern = direct_patterns[relation]
            if pattern == "1.2":
                print(relation)
        else:
            continue

        p_type = G.node[neighbour]['word']
        stats.evaluate(sentence, pattern, p_type, "type")
        return p_type

    return None
        
    
    
def get_dep_nodes(edges, annotations, connected_idx):
    """Collect the nodes joined to `connected_idx` by an edge labelled "dep".

    `annotations` is not used. A self-loop on `connected_idx` contributes two
    entries.
    """
    neighbours = []
    for link in edges:
        if link[2]["dep"] != "dep":
            continue
        first, second = link[0], link[1]
        if connected_idx == first:
            neighbours.append(second)
        if connected_idx == second:
            neighbours.append(first)
    return neighbours
            
    
    
def evaluate_target_dep(dep_idx, edges, annotations):
    """Resolve a candidate node to the noun that names the measurement type.

    * If the candidate's POS tag contains "NN", its own word is returned.
    * If it contains "JJ", the first neighbour reached through an edge whose
      label contains "mod" and whose POS tag contains "NN" is returned.
    * Otherwise (or when no such neighbour exists) None is returned.

    NOT HANDLING MORE THAN ONE 'NN' COMING FROM 'JJ' RIGHT NOW
    """
    target = annotations.pos_lookup[int(dep_idx)]
    if "NN" in target["pos"]:
        return target["word"]

    if "JJ" in target["pos"]:
        for edge in edges:
            if "mod" not in edge[2]['dep']:
                continue
            if dep_idx == edge[0]:
                other = edge[1]
            elif dep_idx == edge[1]:
                other = edge[0]
            else:
                continue
            candidate = annotations.pos_lookup[int(other)]
            if "NN" in candidate["pos"]:
                return candidate["word"]
        # Reported once, when no noun was found. Previously this was printed
        # for every edge that did not touch the candidate.
        print("UNSEEN 'DEP' COMBO")

    return None
              
            
            
def determine_target_dep(dep_indices, stats, annotations, edges):
    """Choose one candidate node out of several.

    A candidate whose word already appears in `stats.types_found` wins (the
    first such candidate in `dep_indices` order). Otherwise the candidate
    touched by the fewest edges is returned; ties go to the earliest
    candidate. Returns None when `dep_indices` is empty.

    Earlier behaviour returned from inside the loop, so only the first
    candidate was ever examined, and matched node ids by substring.
    """
    if not dep_indices:
        return None

    # check whether any candidate word has been found already in patterns with more certainty
    for dep_idx in dep_indices:
        if annotations.pos_lookup[int(dep_idx)]["word"] in stats.types_found:
            return dep_idx

    # if not, choose the one with fewer edges coming from it
    edge_list = list(edges)

    def degree(idx):
        # exact id comparison: "1" must not be counted on edges of "13"
        return sum(1 for edge in edge_list if idx == edge[0] or idx == edge[1])

    return min(dep_indices, key=degree)
            
    
    
    
def uncertain_parse_patterns(G, sentence, annotations, stats):
    """
    If none of typical patterns exist for measurements with space between (10 m),
    the dependency type coming from the unit will be "dep", and a different set of patterns
    will be evaluated here.

    For each word connected to the module-level `unit_idx`, the nodes linked
    to that word by a "dep" edge are collected. One candidate is used directly
    (pattern "1.5.1"); several are narrowed with determine_target_dep()
    (pattern "1.5.2"). The first neighbour that has candidates decides the
    result, which is reported through `stats` and returned.

    Returns None when no neighbour has candidates or when a lookup fails.
    """
    try:
        for edge in G.edges(data=True):
            connected_idx = get_connected_word(edge, unit_idx, annotations, sentence)
            if not connected_idx:
                continue

            dep_indices = get_dep_nodes(G.edges(data=True), annotations, connected_idx)
            if not dep_indices:
                # Nothing to evaluate for this neighbour; try the next one.
                # This case used to surface as a swallowed TypeError from
                # int(None), which ended the search.
                continue

            if len(dep_indices) == 1:
                target_idx, pattern = dep_indices[0], "1.5.1"
            else:
                target_idx = determine_target_dep(dep_indices, stats, annotations, G.edges(data=True))
                pattern = "1.5.2"

            p_type = evaluate_target_dep(target_idx, G.edges(data=True), annotations)
            stats.evaluate(sentence, pattern, p_type, "type")
            return p_type
    except Exception:
        # Best effort: an unexpected graph shape means "no prediction".
        # Narrowed from a bare except so KeyboardInterrupt / SystemExit propagate.
        return None

    return None
    
    
        
def hyphenated_patterns(g, sentence, annotated):
    """Typical patterns when unit and number are attached with a hyphen (10-m).

    Looks for an "amod" edge between the module-level `unit_idx` and another
    word; that word is the predicted measurement type (pattern "2.1"),
    reported through the module-level `stats`.

    Returns the predicted type, or None when no such edge exists. Earlier
    versions returned True, which the caller stored as the prediction, read
    the global graph instead of the `g` argument, and indexed the graph with
    None for edges not touching the unit.
    """
    for edge in g.edges(data=True):
        connected_idx = get_connected_word(edge, unit_idx, annotated, sentence)

        if connected_idx and edge[2]['dep'] == "amod":
            p_type = g.node[connected_idx]['word']
            stats.evaluate(sentence, "2.1", p_type, "type")
            return p_type

    return None
        
#     

In [324]:
# Evaluation run: parse every labelled sentence, predict number and
# measurement type, print a per-sentence report and then the totals.
# `stats`, `G` and `unit_idx` are deliberately module-level: the pattern
# functions defined above read them as globals.
stats = Stats()

with open(input_file, "rU") as f:
    sentences = csv.reader(f)
    for row in sentences:
        if sentences.line_num <= 1:  # skip header
            continue
        stats.total_sentences += 1

        # load sentence data into class object
        # NOTE(review): the second argument is named measurement_and_type but
        # receives row[4] + row[13] (measurement + number) -- confirm intent.
        sentence = Sentence(row[1], row[4] + row[13], row[4], row[3], row[14], row[13], row[15], sentences.line_num)

        # Parse sentence, load annotations into class object (first parsed sentence only)
        output = nlp.annotate(sentence.text, properties={'annotators': annotators, 'outputFormat': 'json'})
        parsed = output["sentences"][0]
        annotations = Annotations(parsed["tokens"], parsed["collapsed-ccprocessed-dependencies"])

        # Check for legitimate output from coreNLP
        if not annotations.check_output(sentence, stats):
            continue

        # Build part of speech lookup for a given token index
        annotations.build_pos_lookup()

        # Load parse tree into graph
        G = build_graph(annotations, show=True)

        # find measurement tokens (used to find exact unit index later)
        for x in annotations.tokens:
            token = Token(x["index"], x["word"], x["pos"], x["characterOffsetBegin"], x["characterOffsetEnd"])
            sentence.check_for_measurement(token)

        p_number, p_type = None, None

        # One scan gives both the measurement format (space, hyphen, no space)
        # and the node where the unit token sits (even if partial match).
        measurement_format, unit_idx = annotations.find_unit_and_format(G, sentence)

        p_number = get_number(G, sentence, annotations)

        if measurement_format == "space_between":
            p_type = space_between_patterns(G, sentence, annotations)
            if p_type is None:
                p_type = uncertain_parse_patterns(G, sentence, annotations, stats)
        elif measurement_format == "hyphenated":
            p_type = hyphenated_patterns(G, sentence, annotations)
        else:
            stats.evaluate(sentence, "Unknown", "None", "type")

        # Sentence results (make function)
        print(sentence.text)
        print(sentences.line_num)
        print("")
        print("Actual Number: " + sentence.num)
        print("Predicted Number: " + str(p_number))
        print("")

        print("")
        print("Actual Type: " + sentence.measure_type)
        print("Predicted Type: " + str(p_type))
        print("")
        if p_type == sentence.measure_type:
            stats.correct_type_cnt += 1
            print("type: True")
        else:
            print("type: False")
        if p_number == sentence.num:
            stats.correct_num_cnt += 1
            print("number: True")
        else:
            print("number: False")
        print("========================")
        print("")

    # All results (make function)
    print("Total Sentences: " + str(stats.total_sentences))
    print("")
    # The two labels below used to be swapped relative to the values printed.
    print("Parse errors: " + str(stats.errors["parse"]))
    print("Number of parse errors: " + str(stats.errors_cnts["parse"]))
    print("")
    print("Correct numbers: " + str(stats.correct_num_cnt))
    print("Incorrect numbers: ")
    for key, value in stats.errors["number"].items():
        print("   " + key + ": " + str(value))
    print("")
    print("Correct types: " + str(stats.correct_type_cnt))
    print("Incorrect types: ")
    for key, value in stats.errors["type"].items():
        print("   " + key + ": " + str(value))
    
            
# self.pattern_cnts = {}
#         self.pattern_cnts["type"] = {}
#         self.pattern_cnts["number"] = {}
       
#         self.errors = {}
#         self.errors["type"] = {}
#         self.errors["number"] = {}
        
#         self.errors_cnts = {}
#         self.errors_cnts["type"] = {}
#         self.errors_cnts["number"] = {}
        
#         self.total_sentences = 0
#         self.correct_type_cnt = 0
#         self.correct_num_cnt = 0

            

type pattern: 1.4
To measure surface displacements between pairs of Landsat 8 OLI panchromatic images (band 8, 15 m pixel resolution) resulting from ice flow, we find peaks in normalized cross-correlation surfaces calculated at integer pixel offsets between image chips.
2

Actual Number: 15
Predicted Number: None


Actual Type: resolution
Predicted Type: resolution

type: True
number: False

number pattern: 1.1
appos
type pattern: 1.2
The spatial resolution of Landsat 8 (30 m) does not support distinction between single trees and tree clusters nor their defoliation state.
3

Actual Number: 30
Predicted Number: 30


Actual Type: resolution
Predicted Type: resolution

type: True
number: True

type pattern: 1.4
Landsat 7 spatial resolution is similar to Landsat 8, with 15 m ground-equivalent pixels for the panchromatic band, and 30 m pixels for the visible, near infrared, and short-wave infrared bands.
4

Actual Number: 15
Predicted Number: None


Actual Type: pixels
Predicted Type: pixel