-
Notifications
You must be signed in to change notification settings - Fork 474
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
50 additions
and
3,634 deletions.
There are no files selected for viewing
50 changes: 50 additions & 0 deletions
50
neuralcoref/train/conll_processing_script/compile_coref_data.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
|
||
#!/bin/bash | ||
|
||
# Script from the Allen NLP research library (https://github.com/allenai/allennlp): | ||
# https://github.com/allenai/allennlp/blob/master/scripts/compile_coref_data.sh | ||
|
||
# This script downloads and compiles the Ontonotes 2012 data in a helpful format | ||
# for co-reference resolution. It generates 3 files: {train, dev, test}.english.v4_gold_conll, | ||
# as well as a directory 'conll-2012' which contains the raw extracted data. | ||
# The script downloads and runs some python scripts which require python 2.X. | ||
|
||
# Root of the local OntoNotes release, taken from the first argument. The
# conversion step later in this script reads "$ONTONOTES_PATH/data/files/data".
ONTONOTES_PATH=${1:-}

if [[ -z "$ONTONOTES_PATH" ]]; then
  # A usage error is a diagnostic, so it goes to stderr rather than stdout.
  echo "USAGE: ./scripts/compile_coref_data.sh /path/to/ontonotes/data" >&2
  exit 1
fi

# Fail before any download starts if the data directory the conversion step
# needs is not there.
if [[ ! -d "$ONTONOTES_PATH/data/files/data" ]]; then
  echo "ERROR: '$ONTONOTES_PATH/data/files/data' is not a directory" >&2
  exit 1
fi
|
||
#######################################
# Download a gzipped tarball, unpack it into the current directory and
# delete the downloaded archive afterwards.
# Arguments:
#   $1 - base URL (without a trailing slash)
#   $2 - archive file name; appended to the base URL and used as the
#        local file name
# Outputs:
#   Whatever wget and `tar -v` print.
# Returns:
#   0 on success; otherwise the status of the first step that failed.
#######################################
download_and_extract() {
  local base_url=$1
  local archive=$2

  # Stop at the first failure so a failed download is never handed to tar/rm.
  wget "${base_url}/${archive}" || return
  tar -xvzf "${archive}" || return
  rm -- "${archive}"
}
|
||
#######################################
# Concatenate every annotation file of one data partition into a single
# file in the current directory, replacing any previous output.
# Arguments:
#   $1 - partition directory name under conll-2012/<version>/data
#   $2 - prefix of the output file name
#   $3 - data version; also the first part of the file extension
#   $4 - extension suffix appended to the version (e.g. _gold_conll)
#   $5 - language directory name; also part of the output file name
# Outputs:
#   Writes <$2>.<$5>.<$3><$4>; prints an error to stderr when no input
#   file matches.
# Returns:
#   0 on success, 1 when no input file matches, otherwise cat's status.
#######################################
compile_partition() {
  local partition=$1
  local prefix=$2
  local version=$3
  local suffix=$4
  local language=$5
  local output="${prefix}.${language}.${version}${suffix}"
  local -a inputs

  # Always discard a previous result so stale data never survives a rerun.
  rm -f -- "${output}"

  # Only the trailing */*/*/* part is a glob; the quoted pieces stay literal.
  inputs=(conll-2012/"${version}"/data/"${partition}"/data/"${language}"/annotations/*/*/*/*."${version}${suffix}")

  # An unmatched glob stays as the literal pattern (or vanishes under
  # nullglob); either way the first entry is not an existing file.
  if [[ ! -e "${inputs[0]:-}" ]]; then
    echo "compile_partition: no *.${version}${suffix} files for ${partition}/${language}" >&2
    return 1
  fi

  cat -- "${inputs[@]}" > "${output}"
}
|
||
#######################################
# Build the dev, train and test v4 gold CoNLL files for one language by
# calling compile_partition once per partition.
# Arguments:
#   $1 - language name, passed through to compile_partition
# Returns:
#   0 when all three partitions succeed; otherwise the status of the last
#   partition that failed. All three partitions are always attempted.
#######################################
compile_language() {
  local language=$1
  local status=0

  # Remember a failure but keep going, so one bad partition does not hide
  # problems (or block results) in the others.
  compile_partition development dev v4 _gold_conll "${language}" || status=$?
  compile_partition train train v4 _gold_conll "${language}" || status=$?
  compile_partition test test v4 _gold_conll "${language}" || status=$?

  return "${status}"
}
|
||
# Base URL of the CoNLL-2012 shared-task downloads.
conll_url=http://conll.cemantix.org/2012/download

# Fetch the train, development and test archives. Abort on the first failed
# download: continuing would only produce incomplete output files.
download_and_extract "$conll_url" conll-2012-train.v4.tar.gz || exit 1
download_and_extract "$conll_url" conll-2012-development.v4.tar.gz || exit 1
download_and_extract "$conll_url/test" conll-2012-test-key.tar.gz || exit 1
download_and_extract "$conll_url/test" conll-2012-test-official.v9.tar.gz || exit 1

# Helper scripts; the conversion below runs skeleton2conll.sh from this archive.
download_and_extract "$conll_url" conll-2012-scripts.v3.tar.gz || exit 1

# Reference scorer, moved next to the extracted data.
download_and_extract http://conll.cemantix.org/download reference-coreference-scorers.v8.01.tar.gz || exit 1
mv reference-coreference-scorers conll-2012/scorer || exit 1

# Convert the ontonotes data into the CONLL format.
# The path is quoted so an OntoNotes location containing spaces stays a single
# argument. The exit status is left unchecked, as before; a failed conversion
# shows up in the compile step below as missing input files.
bash conll-2012/v3/scripts/skeleton2conll.sh -D "$ONTONOTES_PATH/data/files/data" conll-2012

compile_language english
Oops, something went wrong.