In [1]:
import os
import numpy as np
import glob
from stanfordcorenlp import StanfordCoreNLP
from tqdm import tqdm
import itertools
import corenlp

os.environ["CORENLP_HOME"] = '/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp'

In [None]:
def make_dirs(dirs):
    """Create every path in ``dirs`` at which nothing exists yet.

    Args:
        dirs: iterable of directory paths. Missing intermediate directories
            are created as well (``os.makedirs``).
    """
    # The generator is lazy, so each existence check still runs immediately
    # before the matching ``os.makedirs`` call.
    missing = (path for path in dirs if not os.path.exists(path))
    for path in missing:
        os.makedirs(path)

def dependency_parse(filepath,  client, cp='', tokenize=True):
    """Dependency-parse every line of ``filepath`` and save the result beside it.

    Each line is passed to ``client.dependency_parse``. For every item the
    client returns, elements 1 and 2 are converted to ``int`` and appended to
    a flat per-line list (presumably head index and dependent index of each
    arc -- confirm against the client's documentation).

    The per-line lists are stored with ``np.save`` under
    ``<dir>/<stem>.parents``; ``np.save`` appends ``.npy`` to that name.
    When the lines yield lists of different lengths the data is saved as a
    1-D object array of Python lists, which must be read back with
    ``np.load(..., allow_pickle=True)``.

    Args:
        filepath: text file with one sentence per line.
        client: object exposing ``dependency_parse(text)``.
        cp: unused; kept so existing callers keep working.
        tokenize: unused; kept so existing callers keep working.
    """
    print('\nDependency parsing ' + filepath)
    dirpath = os.path.dirname(filepath)
    filepre = os.path.splitext(os.path.basename(filepath))[0]
    parentpath = os.path.join(dirpath, filepre + '.parents')
    deps = []
    with open(filepath) as file:
        # file.tell() is 0 on a freshly opened file, so it cannot serve as the
        # progress total; count the lines once and rewind instead.
        total = sum(1 for _ in file)
        file.seek(0)
        for line in tqdm(file, total=total):
            arcs = client.dependency_parse(line)
            deps.append([int(idx) for arc in arcs for idx in (arc[1], arc[2])])

    if len({len(row) for row in deps}) > 1:
        # Rows of different length cannot form a rectangular array, and recent
        # NumPy versions refuse to build a ragged one implicitly. Fill an
        # object array element by element so each entry stays a plain list.
        parents = np.empty(len(deps), dtype=object)
        for pos, row in enumerate(deps):
            parents[pos] = row
    else:
        parents = np.array(deps)
    np.save(parentpath, parents)

def split(filepath, dst_dir, client):
    """Split a two-column, tab-separated file into ``a.txt`` and ``b.txt``.

    The first line of ``filepath`` is skipped. Every following line is
    stripped and split on tabs into exactly two fields; the first field is
    written to ``<dst_dir>/a.txt`` and the second to ``<dst_dir>/b.txt``,
    one per line.

    Args:
        filepath: path of the tab-separated input file.
        dst_dir: existing directory that receives ``a.txt`` and ``b.txt``.
        client: object exposing ``annotate(text)`` whose result provides
            ``.sentence[0].token`` items with a ``.word`` attribute.

    Raises:
        ValueError: if a line does not split into exactly two fields.
    """
    left_path = os.path.join(dst_dir, 'a.txt')
    right_path = os.path.join(dst_dir, 'b.txt')
    with open(filepath) as source, \
            open(left_path, 'w') as left_out, \
            open(right_path, 'w') as right_out:
        next(source, None)  # drop the first line of the input
        for row in tqdm(source):
            left, right = row.strip().split('\t')
            for text, sink in ((left, left_out), (right, right_out)):
                annotation = client.annotate(text)
                # NOTE(review): the tokenised form of the first sentence is
                # built here but never written -- the raw field is what goes
                # to the output file. Confirm whether the tokenised text was
                # meant to be written instead.
                tokenised = ' '.join([tok.word for tok in annotation.sentence[0].token])
                sink.write(text + '\n')

def parse(dirpath, client, cp=''):
    """Run ``dependency_parse`` on ``a.txt`` and then ``b.txt`` in ``dirpath``.

    Args:
        dirpath: directory containing ``a.txt`` and ``b.txt``.
        client: parser client forwarded to ``dependency_parse``.
        cp: forwarded unchanged to ``dependency_parse``.
    """
    for filename in ('a.txt', 'b.txt'):
        target = os.path.join(dirpath, filename)
        dependency_parse(target, client, cp=cp, tokenize=True)

if __name__ == '__main__':
    # Driver: split the en-spa pair file into a.txt / b.txt under the train
    # directory, then dependency-parse both files.
    print('=' * 80)
    print('Preprocessing dataset')
    print('=' * 80)

    base_dir = ''
    data_dir = os.path.join(base_dir, 'data')
    all_dir = os.path.join(data_dir, 'translation/all_data')
    lib_dir = os.path.join(base_dir, 'lib')
    train_dir = os.path.join(data_dir, 'translation/train')
    #dev_dir = os.path.join(data_dir, 'translation/dev')
    #test_dir = os.path.join(data_dir, 'translation/test')
    make_dirs([train_dir])

    # java classpath for calling Stanford parser
    # NOTE(review): this value is only forwarded as `cp`, which
    # dependency_parse() accepts but never reads.
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

    # split into separate files
    # Use the client as a context manager so it is shut down when splitting
    # finishes or fails, instead of being left behind when `client` is
    # rebound below.
    with corenlp.CoreNLPClient(annotators="tokenize ssplit".split()) as client:
        print('create client')
        split(os.path.join(all_dir, 'en-spa.txt'), train_dir, client)
    #split(os.path.join(all_dir, 'SICK_trial.txt'), dev_dir)
    #split(os.path.join(all_dir, 'SICK_test_annotated.txt'), test_dir)

    # parse sentences
    client = StanfordCoreNLP(r'data/lib/stanford-corenlp')
    try:
        parse(train_dir, client, cp=classpath)
    finally:
        # Always release the parser backend, even if parsing raises.
        client.close()

In [4]:
client = corenlp.CoreNLPClient(annotators="tokenize ssplit".split())

In [5]:
# Smoke test: annotate one sentence with a CoreNLPClient limited to the
# "tokenize" and "ssplit" annotators.
text = "Chris wrote a simple sentence that he parsed with Stanford CoreNLP."

# We assume that you've downloaded Stanford CoreNLP and defined an environment
# variable $CORENLP_HOME that points to the unzipped directory.
# The code below will launch StanfordCoreNLPServer in the background
# and communicate with the server to annotate the sentence.
# NOTE(review): the saved output of this cell is a PermanentlyFailedException
# ("Timed out waiting for service to come alive"); confirm the endpoint/port
# and the timeout value (units not visible here) before re-running.
with corenlp.CoreNLPClient(annotators="tokenize ssplit".split(), timeout=10000, endpoint='http://localhost:9010') as client:
  ann = client.annotate(text)

# You can access annotations using ann.
# `ann` is read after the `with` block has exited; only the first sentence is kept.
sentence = ann.sentence[0]

PermanentlyFailedException: Timed out waiting for service to come alive.

In [4]:
client

<corenlp.client.CoreNLPClient at 0x7f2630df1e80>

In [3]:
# Scratch cell: dependency-parse a single sentence with the StanfordCoreNLP
# wrapper and keep the result as a NumPy array in `a`.
# (numpy is already imported in the first cell; the import is repeated here.)
import numpy as np
# NOTE(review): `nlp` is not closed in any of the cells shown in this
# notebook -- confirm whether the wrapper needs an explicit close().
nlp = StanfordCoreNLP(r'data/lib/stanford-corenlp')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'

a = np.array(nlp.dependency_parse(sentence))

In [18]:
a = np.loadtxt('f.txt')

In [19]:

a

array([[  0.,   7.],
       [  2.,   1.],
       [  7.,   2.],
       [  5.,   3.],
       [  5.,   4.],
       [  2.,   5.],
       [  7.,   6.],
       [  9.,   8.],
       [  7.,   9.],
       [  7.,  10.]])

In [51]:
list(map(lambda x:[int(x[1]), int(x[2])], a))

[[0, 7],
 [2, 1],
 [7, 2],
 [5, 3],
 [5, 4],
 [2, 5],
 [7, 6],
 [9, 8],
 [7, 9],
 [7, 10]]

In [21]:
"""
Preprocessing script for SICK data.
"""



In [22]:
    # Indented copy of the preprocessing driver (the enclosing statement is
    # not part of this cell as exported).
    # NOTE(review): split() and parse() are called below without a `client`
    # argument, while the definitions shown in this notebook are
    # split(filepath, dst_dir, client) and parse(dirpath, client, cp='').
    # This cell appears to predate those signatures -- confirm before re-running.
    print('=' * 80)
    print('Preprocessing SICK dataset')
    print('=' * 80)

    # All locations are relative to the current working directory (base_dir is '').
    base_dir = ''
    data_dir = os.path.join(base_dir, 'data')
    all_dir = os.path.join(data_dir, 'translation/all_data')
    lib_dir = os.path.join(base_dir, 'lib')
    train_dir = os.path.join(data_dir, 'translation/train')
    #dev_dir = os.path.join(data_dir, 'translation/dev')
    #test_dir = os.path.join(data_dir, 'translation/test')
    make_dirs([train_dir])

    # java classpath for calling Stanford parser
    classpath = ':'.join([
        lib_dir,
        os.path.join(lib_dir, 'stanford-parser/stanford-parser.jar'),
        os.path.join(lib_dir, 'stanford-parser/stanford-parser-3.5.1-models.jar')])

    # split into separate files
    split(os.path.join(all_dir, 'en-spa.txt'), train_dir)
    #split(os.path.join(all_dir, 'SICK_trial.txt'), dev_dir)
    #split(os.path.join(all_dir, 'SICK_test_annotated.txt'), test_dir)

    # parse sentences
    parse(train_dir, cp=classpath)
    #parse(dev_dir, cp=classpath)
    #parse(test_dir, cp=classpath)


0it [00:00, ?it/s]

Preprocessing SICK dataset

Dependency parsing data/translation/train/a.txt


115244it [16:37, 115.58it/s]
25it [00:00, 241.35it/s]


Dependency parsing data/translation/train/b.txt


115244it [27:51, 68.94it/s]


In [33]:
# Scratch cell: tokenise one sentence and join the token words with spaces.
import corenlp
text = "Chris wrote a simple gotta sentence that he parsed with Stanford CoreNLP."

# We assume that you've downloaded Stanford CoreNLP and defined an environment
# variable $CORENLP_HOME that points to the unzipped directory.
# NOTE(review): no client is created in this cell; `client` must already exist
# in the kernel (a separate cell, In [32], assigns it) -- hidden state that
# breaks a top-to-bottom run.
ann = client.annotate(text)
# Only the first sentence of the annotation is used.
s = ' '.join([w.word for w in ann.sentence[0].token])



In [32]:
client =  corenlp.CoreNLPClient(annotators="tokenize ssplit".split())


In [16]:
# You can access annotations using ann.
s

[token {
  word: "Chris"
  value: "Chris"
  before: ""
  after: " "
  originalText: "Chris"
  beginChar: 0
  endChar: 5
  tokenBeginIndex: 0
  tokenEndIndex: 1
  hasXmlContext: false
}
token {
  word: "wrote"
  value: "wrote"
  before: " "
  after: " "
  originalText: "wrote"
  beginChar: 6
  endChar: 11
  tokenBeginIndex: 1
  tokenEndIndex: 2
  hasXmlContext: false
}
token {
  word: "a"
  value: "a"
  before: " "
  after: " "
  originalText: "a"
  beginChar: 12
  endChar: 13
  tokenBeginIndex: 2
  tokenEndIndex: 3
  hasXmlContext: false
}
token {
  word: "simple"
  value: "simple"
  before: " "
  after: " "
  originalText: "simple"
  beginChar: 14
  endChar: 20
  tokenBeginIndex: 3
  tokenEndIndex: 4
  hasXmlContext: false
}
token {
  word: "got"
  value: "got"
  before: " "
  after: ""
  originalText: "got"
  beginChar: 21
  endChar: 24
  tokenBeginIndex: 4
  tokenEndIndex: 5
  hasXmlContext: false
}
token {
  word: "ta"
  value: "ta"
  before: ""
  after: " "
  originalText: "ta"
  

In [4]:
import os
os.environ["CORENLP_HOME"] = '/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp'

In [None]:
# Scratch cell: run a shell command and print its captured standard output.
import subprocess
# NOTE(review): `com` holds a CoreNLP command line but is never executed in
# this cell; the Popen call below runs `ls -la` instead. Confirm the intent.
com = 'java -cp "/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp/stanford-corenlp-3.9.2-models.jar:/home/krivas/projects/neural-wsd/new_experiments/data/lib/stanford-corenlp/stanford-corenlp-3.9.2.jar"  -Xmx2g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize -file input.txt -outputFormat json'
output = subprocess.Popen(['ls', '-la'], stdout=subprocess.PIPE)
# communicate() waits for the process to finish; element 0 is the captured stdout bytes.
response=output.communicate()[0]
print (response.decode())

In [17]:
t.shape

(115244,)

In [19]:
t[0]

[[0, 1], [1, 2]]