# Natlang Dataset Toolkit

## 1. Import the toolkit

In [9]:
import natlang as nl

# Load a plain-text sample file. As the output below shows, every sentence
# comes back as a list of tokens (the split appears to be on whitespace,
# with punctuation left attached to the token).
dataset = nl.load("../natlang/test/sampleConalaTxt.txt")
# Bare last expression: the notebook displays the whole loaded dataset.
dataset

|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 6 sents (in: 0:00:00)


[['convert',
  'a',
  'datetime',
  'string',
  'back',
  'to',
  'a',
  'datetime',
  'object',
  'of',
  'format',
  "`'%y-%m-%d",
  "%h:%m:%s.%f'`"],
 ['find',
  'maximum',
  'with',
  '`lookahead`',
  '=',
  '`4`',
  'in',
  'a',
  'list',
  '`arr`'],
 ['in',
  'django,',
  'filter',
  '`task.objects`',
  'based',
  'on',
  'all',
  'entities',
  'in',
  "`['a',",
  "'p',",
  "'f']`"],
 ['join',
  'multiple',
  'dataframes',
  '`d1`,',
  '`d2`,',
  'and',
  '`d3`',
  'on',
  'column',
  "`'name'`"],
 ['using',
  "python's",
  'datetime',
  'module,',
  'get',
  'the',
  'year',
  'that',
  'utc-11',
  'is',
  'currently',
  'in'],
 ['get',
  'the',
  'ascii',
  'value',
  'of',
  'a',
  'character',
  "`u'あ'`",
  'as',
  'an',
  'int']]

## 2. Constituency Tree Loader

The constituency tree loader is called 'tree'; the default loader, 'txtOrTree', also handles tree files.
To select a specific format, pass the `format` option to `nl.load`.

The tree nodes work the same way as those of the 'astTree' format, which is used later in this section to build a tree by hand.

In [10]:
# All three calls below read the same sample file of constituency trees
tree_sample_path = "../natlang/test/sampleTree.txt"

# 1) No `format` argument: the default loader handles txt or tree input automatically
dataset = nl.load(tree_sample_path)
print(len(dataset))

# 2) Explicit format, selected by its name
dataset = nl.load(tree_sample_path, format='tree')
print(len(dataset))

# 3) Explicit format, selected by passing the format module itself
dataset = nl.load(tree_sample_path, format=nl.format.tree)
print(len(dataset))

|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 3 sents (in: 0:00:00)
|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 3 sents (in: 0:00:00)
|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 3 sents (in: 0:00:00)

3
3
3





In [11]:
# Pretty-print the first tree: each node is shown as (id, label[, word]) with its children indented beneath it
dataset[0].onScreen()

(1, 'ROOT'
  (2, 'S'
    (3, 'NP', 'Andrei')
    (4, 'VP'
      (5, 'VP', 'likes')
      (6, 'NP', 'cheese')
    )
  )
)


`tree.columnFormat() = parColumn, sibColumn, valColumn, hasChild, hasSibl`
This method gives you some basic information for the tree in columns.
`parColumn` gives you an array indicating the node's parent node index. Index 0 is reserved for NULL node, and Index 1 is reserved for ROOT node.
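`sibColumn` gives the 'next sibling' links by index. In the example below, node 4 has entry 3 and node 6 has entry 5: each entry is the index of the sibling immediately before that node, and the first child of a parent has entry 0 (the NULL node).
`valColumn` holds each node's value tuple, while `hasChild` and `hasSibl` are 0/1 flags marking whether the node has a child or a following sibling.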

In [12]:
# Returns five parallel columns (parColumn, sibColumn, valColumn, hasChild, hasSibl);
# position i describes the node with index i, position 0 being the reserved NULL node
dataset[0].columnFormat()

([0, 0, 1, 2, 2, 4, 4],
 [0, 0, 0, 0, 3, 0, 5],
 [('NULL',),
  ('ROOT',),
  ('S',),
  ('NP', 'Andrei'),
  ('VP',),
  ('VP', 'likes'),
  ('NP', 'cheese')],
 [0, 1, 1, 0, 1, 0, 0],
 [0, 0, 0, 1, 0, 1, 0])

In [13]:
# Print the first tree again so its node IDs are at hand for the attribute examples that follow
dataset[0].onScreen()

(1, 'ROOT'
  (2, 'S'
    (3, 'NP', 'Andrei')
    (4, 'VP'
      (5, 'VP', 'likes')
      (6, 'NP', 'cheese')
    )
  )
)


In [14]:
# `value` is a tuple of labels; a non-terminal node such as ROOT carries exactly one label
dataset[0].value

('ROOT',)

In [18]:
# Nodes are linked through `child` and `sibling`; each attribute is either another node or None
first_tree = dataset[0]
first_child = first_tree.child
print(first_child, first_tree.sibling)
print(first_child.value)
# Printing from the child shows only the subtree rooted at that node
first_child.onScreen()

<natlang.format.tree.Node object at 0x109628ef0> None
('S',)
(2, 'S'
  (3, 'NP', 'Andrei')
  (4, 'VP'
    (5, 'VP', 'likes')
    (6, 'NP', 'cheese')
  )
)


In [19]:
# Every node carries an `id`; within this sample tree the IDs are unique.
# The root node has ID 1, and here its child has ID 2.
print(dataset[0].id, dataset[0].child.id)

1 2


In [22]:
# IDs are calculated automatically.
# `calcId(n)` gives the node it is called on the ID n, then traverses the tree to number the remaining nodes.
first_tree = dataset[0]
first_tree.calcId(1)
first_tree.onScreen()
# Restart the numbering at 5; the tree keeps these IDs after this cell has run
first_tree.calcId(5)
first_tree.onScreen()

(1, 'ROOT'
  (2, 'S'
    (3, 'NP', 'Andrei')
    (4, 'VP'
      (5, 'VP', 'likes')
      (6, 'NP', 'cheese')
    )
  )
)
(5, 'ROOT'
  (6, 'S'
    (7, 'NP', 'Andrei')
    (8, 'VP'
      (9, 'VP', 'likes')
      (10, 'NP', 'cheese')
    )
  )
)


In [None]:
# Each element of the dataset is the root node of its own, separate tree
dataset[0].onScreen()
dataset[1].onScreen()

In [30]:
# To build a new tree, all you need to do is link nodes through `child` and `sibling`.
# Example: ('Root' ('greater_than' ('block' 'blue') ('block' 'black')) )
AstNode = nl.format.astTree.AstNode

root = AstNode()
root.value = ('Root',)

# 'greater_than' hangs directly below the root
greater_than = AstNode(parent=root)
root.child = greater_than
greater_than.value = ('greater_than',)

# First child of 'greater_than'
block_blue = AstNode(parent=greater_than)
greater_than.child = block_blue
block_blue.value = ('block', 'blue')

# Second child of 'greater_than', attached as the sibling of the first one
block_black = AstNode(parent=greater_than)
block_blue.sibling = block_black
block_black.value = ('block', 'black')

# IDs and columns are not initialised yet at this point (every ID prints as 0)
root.onScreen()
root.columnFormat()

(0, 'Root'
  (0, 'greater_than'
    (0, 'block', 'blue')
    (0, 'block', 'black')
  )
)


([0], [-1], [('block', 'black')], [1], [1])

In [31]:
# `refresh()` initialises the node IDs and the column data; afterwards the tree
# prints with IDs 1..4 and `columnFormat()` returns one entry per node (plus NULL)
root.refresh()
root.onScreen()
root.columnFormat()

(1, 'Root'
  (2, 'greater_than'
    (3, 'block', 'blue')
    (4, 'block', 'black')
  )
)


([0, 0, 1, 2, 2],
 [0, 0, 0, 0, 3],
 [('NULL',),
  ('Root',),
  ('greater_than',),
  ('block', 'blue'),
  ('block', 'black')],
 [0, 1, 1, 0, 0],
 [0, 0, 0, 1, 0])

## 3. CoNLL Tree Loader

The CoNLL dependency tree loader is called 'conll'.
To use it, pass the `format` option to `nl.load`; this format has to be specified manually.

In [6]:
# The CoNLL format has to be requested explicitly
conll_sample_path = "../natlang/test/sampleCoNLLU.conll"

# Either by its name ...
dataset = nl.load(conll_sample_path, format='conll')
print(len(dataset))

# ... or by passing the format module itself
dataset = nl.load(conll_sample_path, format=nl.format.conll)
print(len(dataset))

|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 46 lines (in: 0:00:00)
|>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>| Time: 0:00:00; Total: 46 lines (in: 0:00:00)

3
3





In [7]:
# Displaying a CoNLL node draws its dependency tree, followed by its representation and the leaf-node labels
dataset[0]

ROOT
 │               ┌─case─From
 │               ┌─det─the
 │       ┌─nmod─AP
 └─root─comes
         │       ┌─det─this
         └─nsubj─story
         └─punct─:



Representation: conll.Node("(0, '-ROOT-')")
Leafnode Label: ['From', 'the', 'AP', 'comes', 'this', 'story', ':']