# Parse the sql to encode and decode only some parts of it

In [1]:
import sqlparse

# Define the SQL statement
sql_statement = "SELECT name, age, email FROM users"

# Parse the SQL statement using sqlparse
parsed_statement = sqlparse.parse(sql_statement)[0]

print(parsed_statement)

# Extract the field names from the SELECT statement
# fields = [str(token) for token in parsed_statement.tokens if token.ttype is sqlparse.tokens.Name]

# # Print the field names
# print(fields)


SELECT name, age, email FROM users


In [2]:
import glob
dir(sqlparse)

['__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'cli',
 'engine',
 'exceptions',
 'filters',
 'format',
 'formatter',
 'keywords',
 'lexer',
 'parse',
 'parsestream',
 'split',
 'sql',
 'tokens',
 'utils']

In [None]:
raw = 'select * from foo; select * from bar;'
statements = sqlparse.split(raw)
statements


['select * from foo;', 'select * from bar;']

In [5]:
# Format the first statement and print it out:
first = statements[0]
print(sqlparse.format(first, reindent=True, keyword_case='upper'))


SELECT *
FROM foo;


In [6]:
identify = sqlparse.get_identifiers('select order.orderid, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc')[0]
print(identify.tokens)

AttributeError: module 'sqlparse' has no attribute 'get_identifiers'

In [4]:
parsed = sqlparse.parse('select order.orderid, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc')[0]
parsed.tokens

[<DML 'select' at 0x118EAAE60>,
 <Whitespace ' ' at 0x11990D900>,
 <IdentifierList 'order....' at 0x119909F50>,
 <Whitespace ' ' at 0x11990F880>,
 <Keyword 'from' at 0x11990F8E0>,
 <Whitespace ' ' at 0x11990F940>,
 <Identifier 'orders...' at 0x119909CB0>,
 <Whitespace ' ' at 0x11990FAC0>,
 <Keyword 'join' at 0x11990FB20>,
 <Whitespace ' ' at 0x11990FB80>,
 <Identifier 'revenu...' at 0x119909D90>,
 <Whitespace ' ' at 0x11990FD00>,
 <Keyword 'on' at 0x11990FD60>,
 <Whitespace ' ' at 0x11990FDC0>,
 <Comparison 'r.orde...' at 0x119909EE0>,
 <Whitespace ' ' at 0x119934220>,
 <Keyword 'order ...' at 0x119934280>,
 <Whitespace ' ' at 0x1199342E0>,
 <Identifier 'order....' at 0x119909E70>]

In [23]:
# Parsing a SQL statement:
parsed = sqlparse.parse('select order.orderid, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc')[0]
parsed_tokens = parsed.tokens
for i, token in enumerate(parsed_tokens):
    print(f"Token {i}: {token}")

Token 0: select
Token 1:  
Token 2: order.orderid, order.order_name, revenue.order_id, r.product_name, revenue.cost
Token 3:  
Token 4: from
Token 5:  
Token 6: orders o
Token 7:  
Token 8: join
Token 9:  
Token 10: revenue r
Token 11:  
Token 12: on
Token 13:  
Token 14: r.order_id  = r.order_id
Token 15:  
Token 16: order by
Token 17:  
Token 18: order.orderid desc


In [24]:
parsed_tokens = parsed.tokens
token_counts = {}

for i, token in enumerate(parsed_tokens):
    tok_type = type(token).__name__
    if tok_type in token_counts:
        token_counts[tok_type] += 1
    else:
        token_counts[tok_type] = 1

for tok_type, count in token_counts.items():
    print(f"{tok_type}: {count}")

Token: 14
IdentifierList: 1
Identifier: 3
Comparison: 1


In [17]:
#rewrite above function but return a list of fields per type in a data table
def get_fields(parsed):
    parsed_tokens = parsed.tokens
    token_counts = {}
    for i, token in enumerate(parsed_tokens):
        tok_type = type(token).__name__
        if tok_type in token_counts:
            token_counts[tok_type].append(token)
        else:
            token_counts[tok_type] = [token]
    #return a dictionary of types and their corresponding tokens in a data table
    return token_counts


In [19]:
def get_fields(parsed):
    parsed_tokens = parsed.tokens
    token_counts = {}
    for i, token in enumerate(parsed_tokens):
        tok_type = type(token).__name__
        if tok_type == 'Whitespace' or tok_type == 'Keyword':
            continue
        if tok_type in token_counts:
            token_counts[tok_type].append(str(token))
        else:
            token_counts[tok_type] = [str(token)]
    #return a list of fields per type in a data table
    return token_counts


In [61]:
from sqlparse import parse

# Example SQL query
#query = "select order.order_id, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc"
query = "SELECT first_name, last_name FROM employees WHERE salary > 50000"

# Parse the query into tokens
parsed = parse(query)[0]

# Get the fields in the parsed query
fields = get_fields(parsed)

# Print out the types and tokens in the data table
for tok_type, tokens in fields.items():
    print(f"{tok_type}: {tokens}")


Token: ['SELECT', ' ', ' ', 'FROM', ' ', ' ']
IdentifierList: ['first_name, last_name']
Identifier: ['employees']
Where: ['WHERE salary > 50000']


In [None]:
import sqlparse

def get_identifiers(parsed):
    parsed_tokens = parsed.tokens
    identifiers = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type == 'Whitespace':
            continue
        elif tok_type == 'Keyword':
            continue
        elif tok_type == 'Identifier':
            identifier = str(token)
            if '.' in identifier:
                identifiers.extend(identifier.split('.'))
            else:
                identifiers.append(identifier)
        elif tok_type == 'Name':
            identifiers.append(str(token))
    return identifiers

#sql = "SELECT first_name, last_name FROM employees WHERE department='Sales'"
sql = "select order.order_id, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc"

parsed = sqlparse.parse(sql)[0]
identifiers = get_identifiers(parsed)
print(identifiers)



['orders o', 'revenue r', 'order', 'orderid desc']


In [41]:
def get_identifiers(parsed):
    parsed_tokens = parsed.tokens
    identifier_list = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type == 'IdentifierList':
            for identifier in token.get_identifiers():
                if not isinstance(identifier, sqlparse.sql.Identifier):
                    continue
                identifier_name = identifier.get_name()
                if '.' in identifier_name:
                    identifier_name = identifier_name.split('.')[1]
                identifier_list.append(identifier_name)
        elif tok_type == 'Identifier':
            identifier_name = token.get_name()
            if '.' in identifier_name:
                identifier_name = identifier_name.split('.')[1]
            identifier_list.append(identifier_name)
    return identifier_list


In [None]:
import sqlparse

def get_identifiers(parsed):
    parsed_tokens = parsed.tokens
    identifiers = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type == 'Whitespace':
            continue
        elif tok_type == 'Keyword':
            continue
        elif tok_type == 'Identifier':
            identifier = str(token)
            if '.' in identifier:
                identifiers.extend(identifier.split('.'))
            else:
                identifiers.append(identifier)
        elif tok_type == 'Name':
            identifiers.append(str(token))
    return identifiers

#sql = "SELECT first_name, last_name FROM employees WHERE department='Sales'"
sql = "select order.order_id, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc"

parsed = sqlparse.parse(sql)[0]
identifiers = get_identifiers(parsed)
print(identifiers)



['orders o', 'revenue r', 'order', 'orderid desc']


In [None]:
import sqlparse

def get_identifiers(parsed):
    parsed_tokens = parsed.tokens
    identifiers = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type == 'Whitespace':
            continue
        elif tok_type == 'Keyword':
            continue
        elif tok_type == 'Identifier':
            identifier = str(token)
            if '.' in identifier:
                identifiers.extend(identifier.split('.'))
            else:
                identifiers.append(identifier)
        elif tok_type == 'Name':
            identifiers.append(str(token))
    return identifiers

#sql = "SELECT first_name, last_name FROM employees WHERE department='Sales'"
sql = "select order.order_id, order.order_name, revenue.order_id, r.product_name, revenue.cost from orders o join revenue r on r.order_id  = r.order_id order by order.orderid desc"

parsed = sqlparse.parse(sql)[0]
identifiers = get_identifiers(parsed)
print(identifiers)



['orders o', 'revenue r', 'order', 'orderid desc']


In [42]:
import sqlparse

sql = "SELECT column1, column2 FROM table1 WHERE column3 = 'value'"
parsed = sqlparse.parse(sql)[0]

identifiers = get_identifiers(parsed)
print(identifiers)


['column1', 'column2', 'table1']


In [50]:
def get_identifier_list(parsed):
    parsed_tokens = parsed.tokens
    identifier_list = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type == 'IdentifierList':
            for identifier in token.get_identifiers():
                if not isinstance(identifier, sqlparse.sql.Identifier):
                    continue
                identifier_name = identifier.get_name()
                if '.' in identifier_name:
                    identifier_name = identifier_name.split('.')[1]
                identifier_list.append(identifier_name)
    return identifier_list


In [52]:
import sqlparse

sql = "SELECT column1, column2 FROM table1 WHERE column3 = 'value'"
parsed = sqlparse.parse(sql)[0]

identifier_list = get_identifier_list(parsed)
print(identifier_list)


['column1', 'column2']


In [72]:
def get_tok_types(parsed):
    parsed_tokens = parsed.tokens
    tok_types = []
    for token in parsed_tokens:
        tok_type = type(token).__name__
        if tok_type in ['Identifier', 'Comparison', 'IdentifierList', 'Where']:
            tok_types.append(tok_type)
    return tok_types


In [73]:
import sqlparse

query = "SELECT first_name, last_name FROM employees WHERE salary > 50000"
parsed_query = sqlparse.parse(query)[0]
tok_types = get_tok_types(parsed_query)
print(tok_types)


['IdentifierList', 'Identifier', 'Where']


In [85]:
import sqlparse

query = "SELECT first_name, last_name FROM employees WHERE salary > 50000"
parsed_query = sqlparse.parse(query)[0]
print(parsed_query.tokens)
# tok_types = get_identifiers(query)
# print(tok_types)

[<DML 'SELECT' at 0x119F13FA0>, <Whitespace ' ' at 0x119F46E60>, <IdentifierList 'first_...' at 0x119F158C0>, <Whitespace ' ' at 0x119F47580>, <Keyword 'FROM' at 0x119F475E0>, <Whitespace ' ' at 0x119F47640>, <Identifier 'employ...' at 0x119F142E0>, <Whitespace ' ' at 0x119F47700>, <Where 'WHERE ...' at 0x119F05770>]


In [194]:
#Alles zusammen - Working for parsing fields from a sql statement. Decide to do it modularly, not sure why but this works. That is why
#works for where!!!
import sqlparse

def get_where_fields(query):
    parsed_query = sqlparse.parse(query)[0]
    where_clause = None
    for token in parsed_query.tokens:
        if isinstance(token, sqlparse.sql.Where):
            where_clause = token
            break
    if not where_clause:
        return []
    fields = []
    for token in where_clause.tokens:
        if isinstance(token, sqlparse.sql.Comparison):
            left = token.left
            if isinstance(left, sqlparse.sql.Identifier):
                fields.append(left.get_name())
            elif isinstance(left, sqlparse.sql.Function):
                fields.append(left.tokens[0].get_name())
    return fields

# Works for identifiers
def get_identifiers(query):
    parsed_tokens = sqlparse.parse(query)[0]
    identifier_set = set()
    for token in parsed_tokens:
        if isinstance(token, sqlparse.sql.IdentifierList):
            for identifier in token.get_identifiers():
                identifier_name = identifier.get_name()
                if '.' in identifier_name:
                    identifier_name = identifier_name.split('.')[1]
                identifier_set.add(identifier_name)
        elif isinstance(token, sqlparse.sql.Identifier):
            identifier_name = token.get_name()
            if '.' in identifier_name:
                identifier_name = identifier_name.split('.')[1]
            identifier_set.add(identifier_name)
    return list(identifier_set)

import sqlparse
query = "SELECT first_name, last_name FROM employees WHERE salary > 50000 AND department = 'Sales'"
identifiers = get_identifiers(query)
where_fields = get_where_fields(query)
print(identifiers + where_fields)

['first_name', 'employees', 'last_name', 'salary', 'department']
