- https://github.com/tree-sitter/py-tree-sitter

# Environment

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# Pinned to the version the recorded outputs below were produced with; a single
# pinned install also replaces an older tree-sitter left by the downgrade cell.
%pip install tree-sitter==0.23.1


Collecting tree-sitter
  Downloading tree_sitter-0.23.1-cp312-cp312-win_amd64.whl.metadata (10 kB)
Downloading tree_sitter-0.23.1-cp312-cp312-win_amd64.whl (117 kB)
Installing collected packages: tree-sitter
Successfully installed tree-sitter-0.23.1


In [2]:
# Python grammar package; %pip targets the kernel's environment and the pin
# matches the version shown in the recorded output.
%pip install tree-sitter-python==0.23.2

Collecting tree-sitter-python
  Downloading tree_sitter_python-0.23.2-cp39-abi3-win_amd64.whl.metadata (2.0 kB)
Downloading tree_sitter_python-0.23.2-cp39-abi3-win_amd64.whl (75 kB)
Installing collected packages: tree-sitter-python
Successfully installed tree-sitter-python-0.23.2


# Setup

In [3]:
import tree_sitter_python as tspython
from tree_sitter import Language, Parser

# Build the tree_sitter Language from what tspython.language() returns;
# the parser and query cells below both reuse this object.
PY_LANGUAGE = Language(tspython.language())

# Basic parsing

In [4]:
# The parser is handed bytes, so the sample source is encoded up front.
src = """
def foo():
    if bar:
        baz()
""".encode("utf8")

parser = Parser(PY_LANGUAGE)
tree = parser.parse(src)

In [5]:
# Printing the root node shows the tree as an S-expression
# (node types and field names only, no source text).
print(tree.root_node)

(module (function_definition name: (identifier) parameters: (parameters) body: (block (if_statement condition: (identifier) consequence: (block (expression_statement (call function: (identifier) arguments: (argument_list))))))))


## print_tree

In [3]:
from tree_sitter import Node

def print_tree(root: Node, indent: int = 0):
    """Print ``root`` and every descendant, one entry per node.

    Nodes are visited depth-first, children in their stored order, and each
    nesting level adds one ``|--`` prefix. The walk uses an explicit stack
    instead of recursion so deeply nested trees cannot hit Python's
    recursion limit.

    Args:
        root: Node to start from; ``None`` prints nothing.
        indent: Nesting depth used for ``root`` itself.
    """
    if root is None:
        return
    pending = [(root, indent)]
    while pending:
        node, depth = pending.pop()
        if node is None:
            continue
        print_node(node, depth)
        # A leaf has no children (empty or missing): nothing more to push.
        children = node.children or ()
        # Push in reverse so the first child is popped, and printed, first.
        pending.extend((child, depth + 1) for child in reversed(children))


def print_node(node: Node, indent: int = 0):
    """Print one node as ``<source text> [<node type>]`` behind ``indent`` prefixes.

    Args:
        node: Node to print; ``None`` prints nothing.
        indent: Number of ``|--`` prefixes to put in front of the line.
    """
    if node is None:
        return
    raw = node.text
    # Guard against a node without text so a missing value prints as empty
    # instead of raising AttributeError on ``None.decode()``.
    text = raw.decode() if raw is not None else ''
    print('|--' * indent + "{text} [{type}]".format(text=text, type=node.type))

In [7]:
# Dump every node under the root of the parsed sample, starting at indent level 0.
print_tree(tree.root_node, 0)

def foo():
    if bar:
        baz()
 [module]
|--def foo():
    if bar:
        baz() [function_definition]
|--|--def [def]
|--|--foo [identifier]
|--|--() [parameters]
|--|--|--( [(]
|--|--|--) [)]
|--|--: [:]
|--|--if bar:
        baz() [block]
|--|--|--if bar:
        baz() [if_statement]
|--|--|--|--if [if]
|--|--|--|--bar [identifier]
|--|--|--|--: [:]
|--|--|--|--baz() [block]
|--|--|--|--|--baz() [expression_statement]
|--|--|--|--|--|--baz() [call]
|--|--|--|--|--|--|--baz [identifier]
|--|--|--|--|--|--|--() [argument_list]
|--|--|--|--|--|--|--|--( [(]
|--|--|--|--|--|--|--|--) [)]


# Pattern-matching


In [8]:
import pprint

# Two patterns: the first captures a function definition's name and body,
# the second captures a call's callee identifier and argument list.
query = PY_LANGUAGE.query(
    """
(function_definition
  name: (identifier) @function.def
  body: (block) @function.block)

(call
  function: (identifier) @function.call
  arguments: (argument_list) @function.args)
"""
)

# captures(): nodes grouped by capture name.
captures = query.captures(tree.root_node)
pprint.pp(captures)

# matches(): one (pattern index, captures) pair per match.
matches = query.matches(tree.root_node)
pprint.pp(matches)

{'function.def': [<Node type=identifier, start_point=(1, 4), end_point=(1, 7)>],
 'function.block': [<Node type=block, start_point=(2, 4), end_point=(3, 13)>],
 'function.call': [<Node type=identifier, start_point=(3, 8), end_point=(3, 11)>],
 'function.args': [<Node type=argument_list, start_point=(3, 11), end_point=(3, 13)>]}
[(0,
  {'function.def': [<Node type=identifier, start_point=(1, 4), end_point=(1, 7)>],
   'function.block': [<Node type=block, start_point=(2, 4), end_point=(3, 13)>]}),
 (1,
  {'function.call': [<Node type=identifier, start_point=(3, 8), end_point=(3, 11)>],
   'function.args': [<Node type=argument_list, start_point=(3, 11), end_point=(3, 13)>]})]


# py-tree-sitter-languages

- https://github.com/grantjenks/py-tree-sitter-languages


The project also includes the following other projects distributed in binary form:

- https://github.com/tree-sitter/tree-sitter
- https://github.com/WhatsApp/tree-sitter-erlang
- https://github.com/Azganoth/tree-sitter-lua
- https://github.com/Wilfred/tree-sitter-elisp
- https://github.com/alemuller/tree-sitter-make
- https://github.com/camdencheek/tree-sitter-dockerfile
- https://github.com/camdencheek/tree-sitter-go-mod
- https://github.com/elixir-lang/tree-sitter-elixir
- https://github.com/elm-tooling/tree-sitter-elm
- https://github.com/fwcd/tree-sitter-kotlin
- https://github.com/ganezdragon/tree-sitter-perl
- https://github.com/ikatyang/tree-sitter-markdown
- https://github.com/ikatyang/tree-sitter-yaml
- https://github.com/jiyee/tree-sitter-objc
- https://github.com/m-novikov/tree-sitter-sql
- https://github.com/r-lib/tree-sitter-r
- https://github.com/rydesun/tree-sitter-dot
- https://github.com/slackhq/tree-sitter-hack
- https://github.com/theHamsta/tree-sitter-commonlisp
- https://github.com/tree-sitter/tree-sitter-bash
- https://github.com/tree-sitter/tree-sitter-c
- https://github.com/tree-sitter/tree-sitter-c-sharp
- https://github.com/tree-sitter/tree-sitter-cpp
- https://github.com/tree-sitter/tree-sitter-css
- https://github.com/tree-sitter/tree-sitter-embedded-template
- https://github.com/tree-sitter/tree-sitter-go
- https://github.com/tree-sitter/tree-sitter-haskell
- https://github.com/tree-sitter/tree-sitter-html
- https://github.com/tree-sitter/tree-sitter-java
- https://github.com/tree-sitter/tree-sitter-javascript
- https://github.com/tree-sitter/tree-sitter-jsdoc
- https://github.com/tree-sitter/tree-sitter-json
- https://github.com/tree-sitter/tree-sitter-julia
- https://github.com/tree-sitter/tree-sitter-ocaml
- https://github.com/tree-sitter/tree-sitter-php
- https://github.com/tree-sitter/tree-sitter-python
- https://github.com/tree-sitter/tree-sitter-ql
- https://github.com/tree-sitter/tree-sitter-regex
- https://github.com/tree-sitter/tree-sitter-ruby
- https://github.com/tree-sitter/tree-sitter-rust
- https://github.com/tree-sitter/tree-sitter-scala
- https://github.com/dhcmrlchtdj/tree-sitter-sqlite
- https://github.com/tree-sitter/tree-sitter-toml
- https://github.com/tree-sitter/tree-sitter-tsq
- https://github.com/tree-sitter/tree-sitter-typescript
- https://github.com/stsewd/tree-sitter-rst
- https://github.com/MichaHoffmann/tree-sitter-hcl
- https://github.com/stadelmanma/tree-sitter-fortran
- https://github.com/stadelmanma/tree-sitter-fixed-form-fortran

In [9]:
# %pip installs into the running kernel's environment; the pin matches the
# version shown in the recorded output.
%pip install tree_sitter_languages==1.10.2
# Downgrade tree_sitter (presumably required by tree_sitter_languages - confirm).
%pip install --force-reinstall tree_sitter==0.20.4

# WARNING: restart the kernel after running this cell so the downgraded
# tree_sitter is the one that gets imported.

Collecting tree_sitter_languages




  Downloading tree_sitter_languages-1.10.2-cp312-cp312-win_amd64.whl.metadata (11 kB)
Downloading tree_sitter_languages-1.10.2-cp312-cp312-win_amd64.whl (8.3 MB)
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.3 MB ? eta -:--:--
   ------

  You can safely remove it manually.


In [1]:
def parse_code(code: str = '', lang: str = 'python'):
    """Parse ``code`` with the grammar for ``lang`` and print the resulting tree.

    Args:
        code: Source text to parse; it is UTF-8 encoded before parsing.
        lang: Language name handed to ``tree_sitter_languages.get_parser``.

    Returns:
        None. The tree is written to stdout through ``print_tree``.
    """
    # Function-scope import: defining this helper does not require
    # tree_sitter_languages to be importable, only calling it does.
    from tree_sitter_languages import get_parser

    parser = get_parser(lang)
    tree = parser.parse(code.encode())
    print_tree(tree.root_node)

In [4]:
# Sample module mixing comments, docstrings and stand-alone string statements,
# used to see how each of them appears in the parse tree.
example = """
#!shebang
# License blah blah (Apache 2.0)
"This is a module docstring."

a = 1

'''This
is
not
a
multiline
comment.'''

b = 2

class Test:
    "This is a class docstring."

    'This is bogus.'

    def test(self):
        "This is a function docstring."

        "Please, no."

        return 1

c = 3
"""

# Parse the sample with the Python grammar and print its tree.
parse_code(example, 'python')

#!shebang
# License blah blah (Apache 2.0)
"This is a module docstring."

a = 1

'''This
is
not
a
multiline
comment.'''

b = 2

class Test:
    "This is a class docstring."

    'This is bogus.'

    def test(self):
        "This is a function docstring."

        "Please, no."

        return 1

c = 3
 [module]
|--#!shebang [comment]
|--# License blah blah (Apache 2.0) [comment]
|--"This is a module docstring." [expression_statement]
|--|--"This is a module docstring." [string]
|--|--|--" [string_start]
|--|--|--This is a module docstring. [string_content]
|--|--|--" [string_end]
|--a = 1 [expression_statement]
|--|--a = 1 [assignment]
|--|--|--a [identifier]
|--|--|--= [=]
|--|--|--1 [integer]
|--'''This
is
not
a
multiline
comment.''' [expression_statement]
|--|--'''This
is
not
a
multiline
comment.''' [string]
|--|--|--''' [string_start]
|--|--|--This
is
not
a
multiline
comment. [string_content]
|--|--|--''' [string_end]
|--b = 2 [expression_statement]
|--|--b = 2 [assignment]
|--|--

In [5]:
# Same helper with a different grammar: parse a single SQL statement.
example = """
SELECT * FROM db.table WHERE id > 100;"""
parse_code(example, 'sql')

SELECT * FROM db.table WHERE id > 100; [source_file]
|--SELECT * FROM db.table WHERE id > 100 [select_statement]
|--|--SELECT * [select_clause]
|--|--|--SELECT [SELECT]
|--|--|--* [select_clause_body]
|--|--|--|--* [asterisk_expression]
|--|--|--|--|--* [*]
|--|--FROM db.table [from_clause]
|--|--|--FROM [FROM]
|--|--|--db.table [dotted_name]
|--|--|--|--db [identifier]
|--|--|--|--. [.]
|--|--|--|--table [identifier]
|--|--WHERE id > 100 [where_clause]
|--|--|--WHERE [WHERE]
|--|--|--id > 100 [binary_expression]
|--|--|--|--id [identifier]
|--|--|--|--> [>]
|--|--|--|--100 [number]
|--; [;]
