In [4]:
include("../src/generalized-chart-parser/chartparser.jl")
using .ChartParser: run_chartparser, MetaRule, CompletionAutomaton, CFGrammar, CFRule, score



In [5]:
using LogProbs

In [37]:
function DiffCFGrammar(
        category_rules_strings::Vector{Vector{C}},
        terminal_rules_strings::Vector{Vector{T}},
        startsymbols::Vector{C},
        Score,
        dependent_components=identity::Function) where {C,T}

    if length(terminal_rules_strings[1]) > 2 #if probability included in strings

        category_rules_with_probs = [
            (MetaRule([CFRule(s[2],[s[3:end]...])]), Score(parse(Float64, s[1]), islog=true))
            for s in category_rules_strings]
        terminal_rules_with_probs = [
            (MetaRule([CFRule(s[2],[s[3:end]...])]), Score(parse(Float64, s[1]), islog=true))
            for s in terminal_rules_strings]

        category_rules = [mr for (mr,p) in category_rules_with_probs]
        terminal_rules = [mr for (mr,p) in terminal_rules_with_probs]

        comp_automtn = CompletionAutomaton(C, Tuple{C, MetaRule{C, C}})
        for mr in category_rules
            add_rule!(comp_automtn, mr)
        end

        terminal_dict = Dict{T, Vector{Tuple{C, MetaRule{C, T}}}}()
        for mr in terminal_rules
            for lhs in lhss(mr)
                t = mr(lhs)[1]
                if haskey(terminal_dict, t)
                    push!(terminal_dict[t], (lhs, mr))
                else
                    terminal_dict[t] = [(lhs, mr)]
                end
            end
        end

        all_rules_with_probs = [category_rules_with_probs;terminal_rules_with_probs]

        cond = SimpleCond(Dict(
            cat => CatDist(Dict(
                mr => p for (mr,p) in all_rules_with_probs if isapplicable(mr, cat)
            ))
            for cat in unique(Base.reduce(append!, [lhss(mr) for (mr,p) in all_rules_with_probs], init=T[])))
        )

    else
        category_rules = [MetaRule([CFRule(s[1],[s[2:end]...])]) for s in category_rules_strings]
        terminal_rules = [MetaRule([CFRule(s[1],[s[2:end]...])]) for s in terminal_rules_strings]

            comp_automtn = CompletionAutomaton(C, Tuple{C, MetaRule{C, C}})
        for mr in category_rules
            add_rule!(comp_automtn, mr)
        end

        terminal_dict = Dict{T, Vector{Tuple{C, MetaRule{C, T}}}}()
        for r in terminal_rules
            for lhs in lhss(r)
            t = r(lhs)[1]
                if haskey(terminal_dict, t)
                    push!(terminal_dict[t], (lhs, r))
                else
                    terminal_dict[t] = [(lhs, r)]
                end
            end
        end

        applicable_rules = Dict{C, Vector{MetaRule}}()
        for r in MetaRule[category_rules; terminal_rules]
            for c in lhss(r)
                if haskey(applicable_rules, c)
                    push!(applicable_rules[c], r)
                else
                    applicable_rules[c] = MetaRule[r]
                end
            end
        end

        cond = SimpleCond(
            Dict(
                dependent_components(c) => let rules = applicable_rules[c]
                    n = length(rules)
                    k = count(isa.(rules, MetaRule{C, T})) # number terminal rules
                    probs = [Score(p) in [fill(1.0, n-k); fill(1/k, k)]]
                    CatDist(rules, probs)
                end
                for c in keys(applicable_rules)
            )
        )                   
    end
    CFGrammar(comp_automtn, startsymbols, terminal_dict, cond, dependent_components)
end

DiffCFGrammar (generic function with 2 methods)

In [40]:
function split_category_from_terminal_rules(rules_string::AbstractString)
    rule_string_lists = map(split, split(rules_string, "\n"))
    terminal_rule_stringlists = Vector{String}[]
    category_rule_stringlists = Vector{String}[]
    for lst in rule_string_lists
        if !isempty(lst)
            if lst[end][1] == '_'
                corrected_lst = [s[1] == '_' ? s[2:end] : s for s in lst]
                push!(terminal_rule_stringlists, corrected_lst)
            else
                push!(category_rule_stringlists, lst)
            end
        end
    end
    category_rule_stringlists, terminal_rule_stringlists
end


function DiffGrammar(rules_string::AbstractString, startsymbols, Score=LogProb)
    category_rules, terminal_rules = split_category_from_terminal_rules(rules_string)
    category_rules = [ s[2:end] for s in category_rules ]
    terminal_rules = [ s[2:end] for s in terminal_rules ]
    end
    DiffCFGrammar(category_rules, terminal_rules, startsymbols, Score)
end

UndefVarError: UndefVarError: startsymbols not defined

In [27]:
function read_treebank_grammar_uniform_diff(number_sentences::Int, Score=LogProb)
    @assert number_sentences in (1, 5, 50, 500, 972)
    grammarfile = joinpath("..","src","WSJ", "WSJ_data", "WSJ.$number_sentences.grammar.txt")
    io = open(grammarfile, "r")
    grammarstring = read(io, String)
    DiffGrammar(grammarstring, ["START"])
end

read_treebank_grammar_uniform (generic function with 2 methods)

In [28]:
grammar = read_treebank_grammar_uniform(5)

CFGrammar{String,String,Main.ChartParser.PCFGrammar.Distributions.SimpleCond{String,Main.ChartParser.PCFGrammar.Distributions.DirCat{Main.ChartParser.PCFGrammar.MetaRule{String,String},Float64},Array{Main.ChartParser.PCFGrammar.MetaRule{String,String},1}},typeof(identity)}(CompletionAutomaton{String,Tuple{String,Main.ChartParser.PCFGrammar.MetaRule{String,String}}}(Dict{String,Int64}[Dict("RB" => 63,"NNS" => 29,"NN" => 21,"PP" => 77,"DT" => 23,"JJ" => 57,"VP" => 61,"EDOLAR" => 6,"SINV" => 73,"TO" => 54…), Dict("PP" => 53,"NNS" => 56,"ECOMMA" => 30,"CC" => 3,"VP" => 14,"NN" => 35), Dict("NP" => 4), Dict("POS" => 5), Dict(), Dict("CD" => 7), Dict("CD" => 8), Dict(), Dict("VP" => 18,"PRN" => 45), Dict()  …  Dict("ECOMMA" => 78), Dict("NP" => 79), Dict("VP" => 80), Dict("EDOT" => 81), Dict(), Dict(), Dict(), Dict("ADJP" => 85), Dict(), Dict()], Array{Tuple{String,Main.ChartParser.PCFGrammar.MetaRule{String,String}},1}[[], [], [], [("NP", MetaRule(rules49))], [("NP", MetaRule(rules1))], [],