In [1]:
include("../src/generalized-chart-parser/chartparser.jl")
using .ChartParser: run_chartparser, MetaRule, CFGrammar, CFRule, score


In [2]:

using LogProbs

In [3]:
## Test 1: toy grammar over the integer categories 1..10

# Three rule families built with CFRule(lhs, rhs); presumably these read as
# i -> i (i+1), i -> i i, and the terminal rule i -> "i" (confirm against CFRule).
ascend = MetaRule([CFRule(cat, [cat, cat + 1]) for cat in 1:10])
double = MetaRule([CFRule(cat, [cat, cat]) for cat in 1:10])
terminate = MetaRule([CFRule(cat, [string(cat)]) for cat in 1:10])

# Arguments are: category rules, terminal rules, start symbols.
test_grammar = CFGrammar([ascend, double], [terminate], [1])

# Parse the three-token input ["1", "1", "1"]; the result prints as a
# ChartParser.ParseForest.
forest = run_chartparser(fill("1", 3), test_grammar)
print(forest)

Main.ChartParser.ParseForest{Main.ChartParser.IntervalRange,Int64,String,MetaRule{Int64,Int64},MetaRule{Int64,String},Int64,LogProb}(Main.ChartParser.Constituent{Main.ChartParser.IntervalRange,Int64,String,MetaRule{Int64,Int64},MetaRule{Int64,String},Int64,LogProb}[Main.ChartParser.Constituent{Main.ChartParser.IntervalRange,Int64,String,MetaRule{Int64,Int64},MetaRule{Int64,String},Int64,LogProb}(Main.ChartParser.ConstituentKey{Main.ChartParser.IntervalRange,Int64}(Main.ChartParser.IntervalRange(0, 3), 1), nothing, LogProb(0.008230452674897129), 2, nothing, Main.ChartParser.EdgeCompletion{Main.ChartParser.Edge{Main.ChartParser.IntervalRange,Int64,LogProb,Main.ChartParser.Constituent{Main.ChartParser.IntervalRange,Int64,String,MetaRule{Int64,Int64},MetaRule{Int64,String},Int64,LogProb}},MetaRule{Int64,Int64},LogProb}[Main.ChartParser.EdgeCompletion{Main.ChartParser.Edge{Main.ChartParser.IntervalRange,Int64,LogProb,Main.ChartParser.Constituent{Main.ChartParser.IntervalRange,Int64,String,M

["1", "1", "1"])

In [4]:
"""
    split_category_from_terminal_rules(rules_string::AbstractString)

Split `rules_string` into lines and each line into whitespace-separated tokens,
then sort the token lists into two groups.

A line whose last token starts with `'_'` is treated as a terminal rule; the
leading `'_'` is stripped from every token of that line that has one. All other
non-empty lines are category rules and are kept unchanged. Lines without any
tokens are skipped.

Return the tuple `(category_rules, terminal_rules)`, each a
`Vector{Vector{String}}`.
"""
function split_category_from_terminal_rules(rules_string::AbstractString)
    category_rules = Vector{String}[]
    terminal_rules = Vector{String}[]
    for line in split(rules_string, "\n")
        tokens = split(line)
        # Blank or whitespace-only lines carry no rule.
        isempty(tokens) && continue
        if first(last(tokens)) == '_'
            # The underscore is only a marker; drop it wherever it leads a token.
            stripped = [first(tok) == '_' ? tok[2:end] : tok for tok in tokens]
            push!(terminal_rules, stripped)
        else
            push!(category_rules, tokens)
        end
    end
    return category_rules, terminal_rules
end

"""
    Grammar(rules_string::AbstractString, startsymbols, Score=LogProb)

Build a `CFGrammar` from a textual rule listing.

The rule text is partitioned by `split_category_from_terminal_rules`; the two
resulting rule lists are handed to `CFGrammar` together with `startsymbols` and
the score type `Score`.
"""
function Grammar(rules_string::AbstractString, startsymbols, Score=LogProb)
    rule_lists = split_category_from_terminal_rules(rules_string)
    # rule_lists is (category rules, terminal rules), in that order.
    return CFGrammar(rule_lists[1], rule_lists[2], startsymbols, Score)
end

Grammar (generic function with 2 methods)

In [5]:
"""
    read_treebank_grammar(number_sentences::Int, Score=LogProb)

Load the grammar file `../src/WSJ/WSJ_data/WSJ.<number_sentences>.grammar.txt`
and build a grammar from it with `Grammar`, using `"START"` as the only start
symbol and `Score` as the score type.

`number_sentences` must be one of `1, 5, 50, 500, 972`; any other value fails
the assertion.
"""
function read_treebank_grammar(number_sentences::Int, Score=LogProb)
    @assert number_sentences in (1, 5, 50, 500, 972)
    grammarfile = joinpath(
        "..", "src", "WSJ", "WSJ_data", "WSJ.$number_sentences.grammar.txt"
    )
    # read(path, String) opens and closes the file itself, so no handle is
    # left open (the previous open(...) call was never closed).
    grammarstring = read(grammarfile, String)
    # Forward Score; it used to be dropped, so callers always got the default.
    return Grammar(grammarstring, ["START"], Score)
end

read_treebank_grammar (generic function with 2 methods)

In [6]:
"""
    compute_treebank_scores(number_sentences::Int, Score=LogProb)

Parse every sentence of the treebank subset with `number_sentences` sentences
and return the vector of parse scores (element type `Score`).

The grammar comes from `read_treebank_grammar`; the sentences are read from
`../src/WSJ/WSJ_data/WSJ.<number_sentences>.sentences.txt`, one sentence per
line, tokenized on whitespace. `number_sentences` must be one of
`1, 5, 50, 500, 972`.
"""
function compute_treebank_scores(number_sentences::Int, Score=LogProb)
    @assert number_sentences in (1, 5, 50, 500, 972)
    grammar = read_treebank_grammar(number_sentences, Score)
    sentencefile = joinpath(
        "..", "src", "WSJ", "WSJ_data", "WSJ.$number_sentences.sentences.txt"
    )
    sentences = map(split, readlines(sentencefile))
    scores = zeros(Score, length(sentences))
    for idx in eachindex(sentences)
        scores[idx] = score(run_chartparser(sentences[idx], grammar))
    end
    return scores
end

compute_treebank_scores (generic function with 2 methods)

In [17]:
"""
    treebank_goldstandard_scores(number_sentences::Int, Score=LogProb)

Read the reference scores for the treebank subset with `number_sentences`
sentences and return them as a vector with element type `Score`.

Each line of `../src/WSJ/WSJ_data/WSJ.<number_sentences>.scores.txt` is parsed
as a `Float64` and wrapped as `Score(value, islog=true)`. `number_sentences`
must be one of `1, 5, 50, 500, 972`.
"""
function treebank_goldstandard_scores(number_sentences::Int, Score=LogProb)
    @assert number_sentences in (1, 5, 50, 500, 972)
    scorefile = joinpath(
        "..", "src", "WSJ", "WSJ_data", "WSJ.$number_sentences.scores.txt"
    )
    lines = readlines(scorefile)
    scores = zeros(Score, length(lines))
    # The file stores one floating-point number per line; islog=true is passed
    # so the score type treats it as a log-domain value.
    map!(ln -> Score(parse(Float64, ln), islog=true), scores, lines)
    return scores
end

treebank_goldstandard_scores (generic function with 2 methods)

In [18]:
# Score the 1-sentence treebank with the chart parser and load the matching
# reference scores. The cell displays the value of the last assignment.
parser_scores1 = compute_treebank_scores(1)
gold_scores1 = treebank_goldstandard_scores(1)

1-element Array{LogProb,1}:
 LogProb(3.0021655020111464e-21)

In [9]:
# Display the parser's score for the 1-sentence treebank.
parser_scores1

1-element Array{LogProb,1}:
 LogProb(3.0021800153636974e-21)

In [19]:
# Display the reference score for the 1-sentence treebank.
gold_scores1

1-element Array{LogProb,1}:
 LogProb(3.0021655020111464e-21)

In [22]:
# Score the 5-sentence treebank with the chart parser.
parser_scores5 = compute_treebank_scores(5)

5-element Array{LogProb,1}:
 LogProb(4.3995116067547213e-35)
  LogProb(2.129870504765186e-49)
 LogProb(2.0056786835357826e-28)
 LogProb(2.8489015121513312e-27)
  LogProb(7.332156520780076e-34)

In [24]:
# Load the reference scores for the 5-sentence treebank.
gold_scores5 = treebank_goldstandard_scores(5)

5-element Array{LogProb,1}:
  LogProb(4.399489151673981e-35)
 LogProb(2.1298578165678774e-49)
 LogProb(2.0056710289225466e-28)
 LogProb(2.8488894256850852e-27)
  LogProb(7.332125679036439e-34)

In [25]:
# Load the reference scores for the 50-sentence treebank.
gold_scores50 = treebank_goldstandard_scores(50)

50-element Array{LogProb,1}:
  LogProb(2.981175848344592e-55)
 LogProb(1.2906245229672493e-75)
  LogProb(8.014680943439216e-44)
 LogProb(2.2485813146848427e-43)
  LogProb(3.740805122635451e-52)
 LogProb(1.2934198552150024e-35)
 LogProb(1.3242799721968899e-80)
  LogProb(8.436181223164396e-83)
 LogProb(7.160955933336326e-117)
 LogProb(6.086310131631381e-104)
 LogProb(2.6380422615540024e-39)
  LogProb(2.957980241832018e-51)
  LogProb(2.774799614436636e-71)
                               ⋮
 LogProb(1.3014971317060214e-65)
  LogProb(3.253693291348668e-22)
   LogProb(4.48291449637698e-28)
   LogProb(3.840068842058428e-7)
  LogProb(3.227616979224225e-65)
  LogProb(9.219846727625642e-12)
   LogProb(1.920034420952307e-7)
   LogProb(5.66712461031712e-49)
   LogProb(2.120564959361192e-9)
   LogProb(4.88013132656555e-53)
  LogProb(8.810970539189991e-27)
  LogProb(5.881924085431823e-34)

In [26]:
# Score the 50-sentence treebank with the chart parser.
# NOTE(review): several of these scores are about twice the corresponding gold
# score (e.g. the first entry, 5.96e-55 vs 2.98e-55) — investigate the cause.
parser_scores50 = compute_treebank_scores(50)

50-element Array{LogProb,1}:
   LogProb(5.963738023306235e-55)
   LogProb(1.288498847271789e-75)
   LogProb(8.009555890977644e-44)
  LogProb(2.2486403376789327e-43)
   LogProb(3.744024060895086e-52)
  LogProb(1.2927355675059104e-35)
   LogProb(1.311204704921159e-80)
   LogProb(8.085078465646936e-83)
 LogProb(1.3923169728982338e-116)
   LogProb(5.76322073666404e-104)
  LogProb(2.6360188669399398e-39)
  LogProb(2.9581108912458104e-51)
  LogProb(2.7417368784126794e-71)
                                ⋮
  LogProb(1.2981649446202324e-65)
   LogProb(3.253701104435802e-22)
    LogProb(4.49062568383613e-28)
    LogProb(3.840070405954712e-7)
   LogProb(9.644623534455353e-65)
   LogProb(9.219853247932983e-12)
    LogProb(1.920035202977356e-7)
   LogProb(5.667071864832517e-49)
   LogProb(2.1205662470245882e-9)
   LogProb(9.765692763692677e-53)
   LogProb(1.762199302016002e-26)
  LogProb(1.1776356346064007e-33)

In [28]:
# Print the log-domain value of each parser score, one per line.
for parser_score in parser_scores50
    println(parser_score.log)
end

-124.85648264507515
-172.44040411807157
-99.23310877651723
-98.20083325955198
-118.41426385057456
-80.33371768674003
-183.9358611028267
-189.02454252043452
-266.76890154090376
-237.7173531972714
-88.83154885274041
-116.34728889052155
-162.47494998577434
-55.598086224955324
-154.19220872706651
-215.76987345856705
-124.7423966261127
-124.85679394522751
-74.055889227019
-96.03706927949685
-113.4045365155372
-207.34840458491627
-153.13264284655662
-241.0471156317203
-107.97855799149832
-122.68013517761777
-152.90787724650454
-132.04252719408942
-217.15926884953382
-181.7112821432861
-140.17869225572318
-90.92240771466666
-147.39979232478905
-69.3913280685186
-88.58991083182386
-175.09887186460008
-162.12290604298516
-81.25439913729073
-149.40707935842593
-49.4770788961049
-62.9703905612672
-14.772604949642727
-147.40163043121453
-25.409661995196807
-15.465752130202672
-111.09199699831785
-19.97158268627361
-119.75813442332964
-59.30064978548148
-75.82179933990338


In [29]:
# Print the log-domain value of each reference score, one per line.
for gold_score in gold_scores50
    println(gold_score.log)
end

-125.549862312
-172.438755747
-99.2324691139
-98.2008595082
-118.415123974
-80.3331884937
-183.925938545
-188.982032974
-267.433812398
-237.662807663
-88.8307815523
-116.347333058
-162.462963069
-55.5859518399
-154.880840481
-215.76746764
-125.435494887
-124.85680553
-74.7484854966
-96.0343958057
-113.404584324
-207.334542005
-153.132681118
-241.740837385
-107.97623458
-122.680165266
-152.901201247
-132.042588739
-217.096934936
-182.404447968
-140.178886353
-90.9210965202
-148.093212012
-69.3911739797
-89.2830958991
-175.048432388
-162.709640581
-81.2542605111
-149.404515803
-49.4770812974
-62.9721092117
-14.7726053569
-148.496286957
-25.4096627024
-15.4657525375
-111.091987691
-19.9715832935
-120.451837798
-59.9937999136
-76.516009228
