<a href="https://colab.research.google.com/github/jonnyli1125/jp-srparser/blob/main/visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# jp-srparser
https://github.com/jonnyli1125/jp-srparser  
A shift-reduce dependency parser for Japanese.

### Colab Setup

In [None]:
!pip install pyconll
!pip install --upgrade spacy
!pip install sortedcontainers==2.1.0 sudachipy sudachidict_core
!git clone https://github.com/jonnyli1125/jp-srparser

In [None]:
import sys

# Single place that records where the jp-srparser checkout is expected to live.
REPO_DIR = "/content/jp-srparser"

# Put the checkout at the front of the import path so its modules can be
# imported below; the membership check keeps re-runs of this cell from
# stacking duplicate entries on sys.path.
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

import data

# Point the data module's file locations at absolute paths inside the checkout.
data.WORD2VEC_PATH = f"{REPO_DIR}/embeddings/jawiki_gsd_word2vec.txt"
data.MODEL_LISTS_PATH = f"{REPO_DIR}/model_lists.txt"

### Example Usage

In [81]:
# Import libraries
import torch
import spacy

from model import ParserModel
from parse import parse


# Load pretrained model.
# map_location="cpu" lets the checkpoint deserialize on a runtime without a
# GPU even if its tensors were saved from a CUDA device; load_state_dict then
# copies the values into the freshly constructed model's parameters.
model = ParserModel()
model.load_state_dict(
    torch.load("/content/jp-srparser/model.pth", map_location="cpu")
)
model.eval()  # switch the module to evaluation mode before parsing

# Translation: "Natural language processing is a series of technologies that allow a computer to process the natural language that humans use on a daily basis."
sentence = "自然言語処理とは、人間が日常的に使っている自然言語をコンピュータに処理させる一連の技術のことだ。"

# Split sentence into PoS-tagged tokens: one (surface text, coarse PoS) pair per token
ja = spacy.blank("ja")
tokens = [(word.text, word.pos_) for word in ja(sentence)]

# Parse dependencies; the result is printed as (head index, dependent index, relation) triples
deps = parse(tokens, model)
print(deps)

{(18, 19, 'case'), (9, 10, 'mark'), (16, 15, 'compound'), (12, 9, 'advcl'), (27, 3, 'nsubj'), (23, 20, 'acl'), (3, 6, 'punct'), (12, 14, 'aux'), (20, 16, 'obj'), (20, 22, 'aux'), (27, 25, 'nmod'), (9, 11, 'cop'), (16, 17, 'case'), (16, 12, 'acl'), (3, 2, 'compound'), (20, 18, 'obl'), (20, 21, 'aux'), (3, 5, 'case'), (3, 4, 'case'), (7, 8, 'case'), (3, 1, 'compound'), (0, 27, 'root'), (25, 23, 'nmod'), (27, 29, 'punct'), (23, 24, 'case'), (25, 26, 'case'), (27, 28, 'cop'), (12, 7, 'nsubj'), (12, 13, 'mark')}


### Example Dependency Tree Visualization

In [82]:
from spacy import displacy
from spacy.tokens import Doc

# Per-token annotations, in sentence order.
words = [text for text, _ in tokens]
pos = [tag for _, tag in tokens]
# No whitespace is placed between tokens when the Doc is rebuilt.
spaces = [False] * len(tokens)

# Start with every token as its own head and an empty relation, so a token the
# parser left unattached is not drawn as a dependent of the first word.
heads = list(range(len(tokens)))
deps_ = [""] * len(tokens)
for head_idx, dep_idx, deprel in deps:
    # Parser indices are 1-based, with head index 0 marking the root; Doc wants
    # 0-based absolute head positions, and a root is expressed as its own head.
    token_i = dep_idx - 1
    heads[token_i] = head_idx - 1 if head_idx > 0 else token_i
    deps_[token_i] = deprel

doc = Doc(ja.vocab, words=words, spaces=spaces, pos=pos, heads=heads, deps=deps_)
displacy.render(doc, style="dep", jupyter=True, options={'distance': 140, 'word_spacing': 40})