In [2]:
import subprocess
import shlex

RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar ../ark-tweet-nlp-0.3.2.jar"

def _split_results(rows):
#Parse the tab-delimited returned lines, modified from: https://github.com/brendano/ark-tweet-nlp/blob/master/scripts/show.py"""
    for line in rows:
        line = line.strip()  # remove '\n'
        if len(line) > 0:
            if line.count('\t') == 2:
                parts = line.split('\t')
                tokens = parts[0]
                tags = parts[1]
                confidence = float(parts[2])
                yield tokens, tags, confidence

def _call_runtagger(tweets, run_tagger_cmd=RUN_TAGGER_CMD):
    #"""Call runTagger.sh using a named input file"""

    # remove carriage returns as they are tweet separators for the stdin
    # interface
    tweets_cleaned = [tw.replace('\n', ' ') for tw in tweets]
    message = "\n".join(tweets_cleaned)
    message = message.encode('utf-8')
    args = shlex.split(RUN_TAGGER_CMD)
    args.append('--output-format')
    args.append('conll')
    po = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(po)
    result = po.communicate(message)
    print(result)
    pos_result = result[0].strip('\n\n')  # get first line, remove final double carriage return
    pos_result = pos_result.split('\n\n')  # split messages by double carriage returns
    pos_results = [pr.split('\n') for pr in pos_result]  # split parts of message by each carriage return
    return pos_results

def runtagger_parse(tweets, run_tagger_cmd=RUN_TAGGER_CMD):
    # """Call runTagger.sh on a list of tweets, parse the result, return lists of tuples of (term, type, confidence)"""
    print (tweets)
    print (run_tagger_cmd)
    pos_raw_results = _call_runtagger(tweets, run_tagger_cmd)
    pos_result = []
    for pos_raw_result in pos_raw_results:
        print ([x for x in _split_results(pos_raw_result)])
        pos_result.append([x for x in _split_results(pos_raw_result)])
    print (pos_result)
    return pos_result

def check_script_is_present(run_tagger_cmd=RUN_TAGGER_CMD):
    # """Simple test to make sure we can see the script"""
    success = False
    try:
        args = shlex.split(run_tagger_cmd)
        args.append("--help")
        po = subprocess.Popen(args, stdout=subprocess.PIPE)
        # old call - made a direct call to runTagger.sh (not Windows friendly)
        #po = subprocess.Popen([run_tagger_cmd, '--help'], stdout=subprocess.PIPE)
        while not po.poll():
            lines = [l for l in po.stdout]
        # we expected the first line of --help to look like the following:
        assert "RunTagger [options]" in lines[0]
        success = 1
        print(success)
    except OSError as err:
     print ("HELLO")
        #  print "Caught an OSError, have you specified the correct path to runTagger.sh? We are using \"%s\". Exception: %r" % (run_tagger_cmd, repr(err))
    return success

if __name__ == "__main__":
    print ("Checking that we can see \"%s\", this will crash if we can't" % (RUN_TAGGER_CMD))
    #success = check_script_is_present()
    success = 1
    if success:
        print ("Success.")
        print ("Now pass in two messages, get a list of tuples back:")
        tweets = ['this is a message', 'and a second message']
        print (runtagger_parse(tweets))
        









Checking that we can see "java -XX:ParallelGCThreads=2 -Xmx500m -jar ../ark-tweet-nlp-0.3.2.jar", this will crash if we can't
Success.
Now pass in two messages, get a list of tuples back:
['this is a message', 'and a second message']
java -XX:ParallelGCThreads=2 -Xmx500m -jar ../ark-tweet-nlp-0.3.2.jar
<subprocess.Popen object at 0x000000CA0CBBE080>
(b'', b'Listening on stdin for input.  (-h for help)\r\njava.io.FileNotFoundException: \\dev\\stdin (The system cannot find the path specified)\r\n\tat java.io.FileInputStream.open0(Native Method)\r\n\tat java.io.FileInputStream.open(Unknown Source)\r\n\tat java.io.FileInputStream.<init>(Unknown Source)\r\n\tat java.io.FileInputStream.<init>(Unknown Source)\r\n\tat cmu.arktweetnlp.util.BasicFileIO.openFileToReadUTF8(BasicFileIO.java:50)\r\n\tat cmu.arktweetnlp.RunTagger.runTagger(RunTagger.java:93)\r\n\tat cmu.arktweetnlp.RunTagger.main(RunTagger.java:364)\r\nOct 23, 2018 1:19:03 PM cmu.arktweetnlp.util.BasicFileIO openFileToReadUTF8\r\nSEV

TypeError: a bytes-like object is required, not 'str'