In [7]:
import re
from typing import List

def split_with_comments_and_quotes(line: str) -> List[str]:
    pattern = re.compile(r'/\*.*?\*/|\"[^\"]*\"')
    captures = pattern.findall(line)
    
    placeholder_format = "PLACEHOLDER_{}"
    placeholders = {}
    for i, capture in enumerate(captures):
        placeholder = placeholder_format.format(i)
        line = line.replace(capture, placeholder, 1)
        placeholders[placeholder] = capture
    
    tokens = re.split(r'(\s+|[;,])', line)
    tokens = [token for token in tokens if token.strip()]

    final_tokens = []
    for token in tokens:
        if token in placeholders:
            final_tokens.append(placeholders[token])
        else:
            final_tokens.append(token)
    
    return final_tokens


line = "SELECT 1 /* QUERY_GROUP_ID:main_dashboard_top_query */ \"Palo Alto\";"
tokens = split_with_comments_and_quotes(line)
assert tokens == ['SELECT', '1', '/* QUERY_GROUP_ID:main_dashboard_top_query */', '"Palo Alto"', ';']

line = "SELECT 1 /* QUERY_GROUP_ID:main_dashboard_top_query */ \"/ * LOL COMMENT IN QUOTES * /\";"
tokens = split_with_comments_and_quotes(line)
assert tokens == ['SELECT', '1', '/* QUERY_GROUP_ID:main_dashboard_top_query */', '"/ * LOL COMMENT IN QUOTES * /"', ';']

line = "SELECT 1 /* QUERY_GROUP_ID:main_dashboard_top_query */ \"\"/ * LOL COMMENT IN DOUBLE QUOTES * /\"\";"
tokens = split_with_comments_and_quotes(line)
print(tokens)
assert tokens == ['SELECT', '1', '/* QUERY_GROUP_ID:main_dashboard_top_query */', '""/ * LOL COMMENT IN QUOTES * /""', ';']



['SELECT', '1', '/* QUERY_GROUP_ID:main_dashboard_top_query */', 'PLACEHOLDER_1/', '*', 'LOL', 'COMMENT', 'IN', 'DOUBLE', 'QUOTES', '*', '/PLACEHOLDER_2', ';']


AssertionError: 

In [None]:
keywords = set([
    "ACCESS",
    "ACTION",
    "ADD",
    "ADMIN",
    "AFTER",
    "ALGORITHM",
    "ALIAS",
    "ALL",
    "ALLOWED_LATENESS",
    "ALTER",
    "AND",
    "ANTI",
    "ANY",
    "APPLY",
    "ARRAY",
    "AS",
    "ASC",
    "ASCENDING",
    "ASOF",
    "ASSUME",
    "AST",
    "ASYNC",
    "ATTACH",
    "AUTO_INCREMENT",
    "BACKUP",
    "BASE_BACKUP",
    "BEGIN",
    "BETWEEN",
    "BIDIRECTIONAL",
    "BOTH",
    "BY",
    "CACHE",
    "CACHES",
    "CASCADE",
    "CASE",
    "CASEWITHEXPRESSION",
    "CAST",
    "CHANGE",
    "CHANGEABLE_IN_READONLY",
    "CHANGED",
    "CHAR",
    "CHARACTER",
    "CHECK",
    "CLEANUP",
    "CLEAR",
    "CLUSTER",
    "CLUSTER_HOST_IDS",
    "CLUSTERS",
    "CN",
    "CODEC",
    "COLLATE",
    "COLLECTION",
    "COLUMN",
    "COLUMNS",
    "COMMENT",
    "COMMIT",
    "COMPRESSION",
    "CONCAT",
    "CONSTRAINT",
    "CREATE",
    "CROSS",
    "CUBE",
    "CURRENT",
    "CURRENT_USER",
    "DATABASE",
    "DATABASES",
    "DATE",
    "DATE_ADD",
    "DATEADD",
    "DATE_DIFF",
    "DATEDIFF",
    "DATE_SUB",
    "DATESUB",
    "DAY",
    "DD",
    "DDL",
    "DEDUPLICATE",
    "DEFAULT",
    "DELAY",
    "DELETE",
    "DESC",
    "DESCENDING",
    "DESCRIBE",
    "DETACH",
    "DETACHED",
    "DICTIONARIES",
    "DICTIONARY",
    "DISK",
    "DISTINCT",
    "DIV",
    "DOUBLE_SHA1_HASH",
    "DROP",
    "ELSE",
    "EMPTY",
    "ENABLED",
    "END",
    "ENFORCED",
    "ENGINE",
    "EPHEMERAL",
    "EQUALS",
    "ESTIMATE",
    "EVENT",
    "EVENTS",
    "EXCEPT",
    "EXCHANGE",
    "EXISTS",
    "EXPLAIN",
    "EXPRESSION",
    "EXTERNAL",
    "EXTRACT",
    "FALSE",
    "FETCH",
    "FILE",
    "FILESYSTEM",
    "FILL",
    "FILTER",
    "FINAL",
    "FIRST",
    "FOLLOWING",
    "FOR",
    "FOREIGN",
    "FORMAT",
    "FREEZE",
    "FROM",
    "FULL",
    "FULLTEXT",
    "FUNCTION",
    "GLOBAL",
    "GRANT",
    "GRANTEES",
    "GRANTS",
    "GRANULARITY",
    "GREATER",
    "GREATEROREQUALS",
    "GROUP",
    "GROUPING",
    "GROUPS",
    "HASH",
    "HAVING",
    "HDFS",
    "HH",
    "HIERARCHICAL",
    "HOST",
    "HOUR",
    "ID",
    "IDENTIFIED",
    "IF",
    "ILIKE",
    "IN",
    "INDEX",
    "INFILE",
    "INHERIT",
    "INJECTIVE",
    "INNER",
    "INSERT",
    "INTERPOLATE",
    "INTERSECT",
    "INTERVAL",
    "INTO",
    "INVISIBLE",
    "IP",
    "IS",
    "IS_OBJECT_ID",
    "JOIN",
    "KEY",
    "KEYED",
    "KILL",
    "LAMBDA",
    "LARGE",
    "LAST",
    "LAYOUT",
    "LEADING",
    "LEFT",
    "LESS",
    "LESSOREQUALS",
    "LEVEL",
    "LIFETIME",
    "LIKE",
    "LIMIT",
    "LIMITS",
    "LINEAR",
    "LIST",
    "LITERAL",
    "LIVE",
    "LOCAL",
    "LTRIM",
    "MATCH",
    "MATERIALIZE",
    "MATERIALIZED",
    "MAX",
    "MCS",
    "MEMORY",
    "MI",
    "MICROSECOND",
    "MILLISECOND",
    "MIN",
    "MINUS",
    "MINUTE",
    "MM",
    "MOD",
    "MODIFY",
    "MONTH",
    "MOVE",
    "MS",
    "MULTIIF",
    "MUTATION",
    "NAME",
    "NAMED",
    "NANOSECOND",
    "NEXT",
    "NO",
    "NONE",
    "NOT",
    "NOTEQUALS",
    "NOTIN",
    "NS",
    "NULL",
    "NULLS",
    "OBJECT",
    "OFFSET",
    "ON",
    "ONLY",
    "OPTIMIZE",
    "OPTION",
    "OR",
    "ORDER",
    "OUTER",
    "OUTFILE",
    "OVER",
    "OVERRIDE",
    "PART",
    "PARTIAL",
    "PARTITION",
    "PARTITIONS",
    "PART_MOVE_TO_SHARD",
    "PERMANENTLY",
    "PERMISSIVE",
    "PIPELINE",
    "PLAN",
    "PLUS",
    "POLICY",
    "POPULATE",
    "POSITION",
    "PRECEDING",
    "PRECISION",
    "PREWHERE",
    "PRIMARY",
    "PRIVILEGES",
    "PROCESSLIST",
    "PROFILE",
    "PROJECTION",
    "QQ",
    "QUARTER",
    "QUERY",
    "QUOTA",
    "RANDOMIZED",
    "RANGE",
    "READONLY",
    "REALM",
    "RECOMPRESS",
    "REFERENCES",
    "REFRESH",
    "REGEXP",
    "REGEXPQUOTEMETA",
    "REMOVE",
    "RENAME",
    "REPLACE",
    "REPLACEREGEXPALL",
    "REPLACEREGEXPONE",
    "RESET",
    "RESTORE",
    "RESTRICT",
    "RESTRICTIVE",
    "RESUME",
    "REVOKE",
    "RIGHT",
    "ROLE",
    "ROLES",
    "ROLLBACK",
    "ROLLUP",
    "ROW",
    "ROWS",
    "RTRIM",
    "S3",
    "SALT",
    "SAMPLE",
    "SECOND",
    "SELECT",
    "SEMI",
    "SERVER",
    "SET",
    "SETS",
    "SETTING",
    "SETTINGS",
    "SHA256_HASH",
    "SHARD",
    "SHOW",
    "SIGNED",
    "SIMPLE",
    "SINGLEVALUEORNULL",
    "SNAPSHOT",
    "SOURCE",
    "SPATIAL",
    "SS",
    "STDOUT",
    "STEP",
    "STORAGE",
    "STRICT",
    "STRICTLY_ASCENDING",
    "SUBPARTITION",
    "SUBPARTITIONS",
    "SUBSTRING",
    "SUSPEND",
    "SYNC",
    "SYNTAX",
    "SYSTEM",
    "TABLE",
    "TABLES",
    "TEMPORARY",
    "TEST",
    "THAN",
    "THEN",
    "TIES",
    "TIMESTAMP",
    "TIMESTAMP_ADD",
    "TIMESTAMPADD",
    "TIMESTAMP_DIFF",
    "TIMESTAMPDIFF",
    "TIMESTAMP_SUB",
    "TIMESTAMPSUB",
    "TO",
    "TODATE",
    "TODATETIME",
    "TOP",
    "TOTALS",
    "TRACKING",
    "TRAILING",
    "TRANSACTION",
    "TREE",
    "TRIGGER",
    "TRIM",
    "TRIMBOTH",
    "TRIMLEFT",
    "TRIMRIGHT",
    "TRUE",
    "TRUNCATE",
    "TTL",
    "TUPLE",
    "TYPE",
    "UNBOUNDED",
    "UNFREEZE",
    "UNION",
    "UNIQUE",
    "UNSIGNED",
    "UNTUPLE",
    "UPDATE",
    "URL",
    "USE",
    "USER",
    "USING",
    "UUID",
    "VALUES",
    "VARYING",
    "VIEW",
    "VIEWIFPERMITTED",
    "VISIBLE",
    "VOLUME",
    "WATCH",
    "WATERMARK",
    "WEEK",
    "WHEN",
    "WHERE",
    "WINDOW",
    "WITH",
    "WK",
    "WRITABLE",
    "YEAR",
    "YYYY",
    "ZKPATH"
])

In [None]:
from typing import Dict, List, Tuple

def split_with_comments_and_quotes(line: str) -> List[str]:
    comment_pattern = re.compile(r'/\*.*?\*/')
    comments = comment_pattern.findall(line)
    
    placeholder_format = "COMMENT_PLACEHOLDER_{}"
    for i, comment in enumerate(comments):
        line = line.replace(comment, placeholder_format.format(i), 1)
    
    tokens = line.split()
    
    for i, comment in enumerate(comments):
        tokens = [token if token != placeholder_format.format(i) else comment for token in tokens]
    
    return tokens

def process_lexed_data(
        tokens: List[str],
        tokens_types: List[str],
        keywords: List[str],
    ) -> Tuple[List[str], List[str], List[str]]:

    result_types_whitespaces = []
    result_types_no_whitespaces = []
    result_tokens = []
    
    cnt_whitespaces = 0
    cur_token_idx = 0

    for token_type in tokens_types:
        if token_type == "Comment": # we will remove comments
            cur_token_idx += 1
            continue
        if token_type == "Whitespace":
            cnt_whitespaces += 1
            result_types_whitespaces.append(token_type)
            continue

        new_token_type = ""
        if token_type == "BareWord" and tokens[cur_token_idx].upper() in keywords:
            

def process_lexed_files(
        tokens_filename: str,
        tokens_types_filename: str,
        tokens_filename_processed: str,
        tokens_types_filename_processed: str,
        keywords: Dict[str]
    ) -> None:
    """
    This function filters lines that were lexed with errors,
    For each BareWord checks if it is in KeyWords and if yes leaves it as it is, if no change it to Identifier (token type)
    """
    with open(tokens_filename, "r", errors="ignore") as tokens_f,\
         open(tokens_types_filename, "r") as tokens_types_f,\
         open(tokens_filename_processed, "w+") as tokens_processed,\