In [1]:
import matplotlib.pyplot as plt
import re
import numpy as np
import sqlparse
from sqlparse.sql import IdentifierList, Identifier, Token
from sqlparse.tokens import Keyword, DML, Whitespace, Newline, Name, Comparison, Operator, Punctuation, CTE
from collections import OrderedDict

# Upper-cased keyword values that are kept when a query is reduced to its
# template; a keyword token whose value is not listed here is dropped.
# NOTE(review): some sqlparse versions lex multi-word keywords such as
# "LEFT JOIN", "GROUP BY" or "UNION ALL" as a single token, whose value would
# not match any entry below -- confirm against the installed sqlparse version.
IMPORTANT_KEYWORDS = [
    # query skeleton
    "SELECT", "WHERE", "FROM",
    # joins
    "INNER", "JOIN", "OUTER", "STRAIGHT_JOIN", "LEFT", "RIGHT", "FULL",
    # common table expressions
    "WITH",
    # grouping, ordering and set operations
    "GROUP", "BY", "ORDER", "UNION", "DISTINCT", "HAVING",
    # aliases and conditional expressions
    "AS", "CASE", "WHEN", "THEN", "ELSE",
]




def is_subselect(parsed):
    '''
    Tell whether a parsed token directly contains a SELECT keyword.

    @param parsed: a parsed SQL token or token group
    @return: True if `parsed` is a group and one of its direct child tokens
             is the DML keyword SELECT (compared case-insensitively),
             False otherwise
    '''
    # A token that is not a group has no children to inspect.
    if not parsed.is_group:
        return False
    # Only the direct children are examined; nested groups are not descended.
    return any(
        child.ttype is DML and child.value.upper() == 'SELECT'
        for child in parsed.tokens
    )


def get_template(parsed):
    '''
    Generate the structural template of a parsed SQL query.

    Walks the direct child tokens of `parsed` and yields, in order:
      * every Keyword / DML / CTE token whose upper-cased value is listed in
        IMPORTANT_KEYWORDS;
      * for every group token, its leading WHERE token (if the group starts
        with one), followed by each directly contained sub-query wrapped in
        the strings "(" and ")".

    @param parsed: a parsed SQL query
    @return: generator of tokens and the bracket strings "(" / ")"
    '''
    for token in parsed.tokens:
        if (token.ttype is Keyword or token.ttype is DML or token.ttype is CTE) and (token.value.upper() in IMPORTANT_KEYWORDS):
            yield token
        if token.is_group:
            # token_first() may return None when the group holds no
            # significant token, so guard before reading `.value`.
            first = token.token_first()
            # Emit WHERE before the sub-queries of its clause so the template
            # follows the order in which the keywords appear in the query.
            if first is not None and first.value.upper() == "WHERE":
                yield first
            for child in token:
                if is_subselect(child):
                    yield "("
                    # Recurse so nested sub-queries get their own brackets.
                    yield from get_template(child)
                    yield ")"
            

def to_string(parsed):
    '''
    Render the template of a parsed query as a single string.

    Each item produced by get_template() is followed by one space: Token
    items contribute their upper-cased value, any other item (the bracket
    strings) is used as is.

    @param parsed: a parsed SQL query
    @return: template in string format
    '''
    pieces = []
    for item in get_template(parsed):
        text = item.value.upper() if isinstance(item, Token) else item
        pieces.append(text + " ")
    return "".join(pieces)
    
def get_queries(file_name):
    '''
    Read SQL queries from a text file.

    Leading whitespace is stripped from every line. Blank lines and lines
    starting with "--" (comments) are skipped. A line starting with "_" is a
    separator that ends the current query; every other line is appended to
    the current query. A query still pending at the end of the file is
    returned as well, so a trailing separator is not required.

    @param file_name: name of the file to be processed
    @return: list of string of queries
    '''
    queries = []
    current_lines = []

    # `with` guarantees the file is closed even if reading raises.
    with open(file_name, "r") as f:
        for raw_line in f:
            line = raw_line.lstrip()
            # ignore blank lines and comments
            if not line or line.startswith("--"):
                continue
            if line.startswith("_"):
                # separator: close the query collected so far, if any
                if current_lines:
                    queries.append("".join(current_lines))
                    current_lines = []
            else:
                current_lines.append(line)

    # Keep the last query when the file does not end with a separator line.
    if current_lines:
        queries.append("".join(current_lines))

    return queries



# Reduce every query read from the input file to its template string.
list_to_process = []
queries = get_queries("queries.txt")
list_to_process.extend(
    # only the first statement parsed from each query text is used
    to_string(sqlparse.parse(sql_text)[0]) for sql_text in queries
)

# Count how many queries share each template.
dict_template = {}
for template in list_to_process:
    dict_template[template] = dict_template.get(template, 0) + 1

# Print each template with its count, least frequent first.
ord_dict = OrderedDict(sorted(dict_template.items(), key=lambda pair: pair[1]))
for key, val in ord_dict.items():
    print(key, val)

      


SELECT FROM ( SELECT DISTINCT FROM ( SELECT FROM ) )  1
SELECT FROM ( SELECT FROM ( SELECT FROM JOIN ) )  1
SELECT DISTINCT FROM ( SELECT FROM ( SELECT FROM JOIN ) )  1
SELECT FROM ( SELECT DISTINCT FROM ( SELECT FROM ( SELECT FROM JOIN ) ) WHERE )  1
SELECT FROM ( SELECT FROM ( SELECT FROM JOIN ) GROUP BY )  1
SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM SELECT FROM  1
SELECT DISTINCT FROM ( SELECT FROM ) GROUP BY  1
SELECT FROM SELECT FROM ORDER BY  1
SELECT FROM ( SELECT SE