# PYTHON TOKENIZER

In [12]:
import tokenize
import keyword
import builtins




def is_valid_identifier(identifier):
    """
    Return True when `identifier` is a usable (non-reserved) Python name.

    The string must pass ``str.isidentifier`` and must not be one of the
    reserved keywords known to the ``keyword`` module.
    """
    if not identifier.isidentifier():
        return False
    return not keyword.iskeyword(identifier)

# OP lexemes that are reported as operators; every other OP lexeme (brackets,
# commas, colons, ...) is reported as a punctuator.  The word operators
# 'and', 'or' and 'not' are intentionally not listed: tokenize emits them as
# NAME tokens, so they can never reach the OP check below.
_OPERATOR_LEXEMES = frozenset({
    '+', '-', '*', '/', '//', '%', '**',
    '==', '!=', '>', '<', '>=', '<=',
    '&', '|', '^', '~', '<<', '>>',
    '=', '+=', '-=', '*=', '/=', '%=', '//=', '**=',
    '&=', '|=', '^=', '<<=', '>>=',
})


def _categorize_token(token_type, lexeme):
    """Return the category label printed for a token of the given type/text."""
    if token_type == 'NAME':
        # Anything exposed by the builtins module takes precedence over the
        # identifier/keyword split.
        if lexeme in builtins.__dict__:
            return 'Built-in Type or Function'
        if is_valid_identifier(lexeme):
            return 'Identifier'
        # A NAME token that is not a valid identifier is a reserved word.
        return 'Keyword'
    if token_type == 'NUMBER':
        return 'Number'
    if token_type == 'STRING':
        return 'String'
    if token_type == 'OP':
        # The leading space in ' punctuator' is part of the established
        # output format and is kept as-is.
        return 'Operator' if lexeme in _OPERATOR_LEXEMES else ' punctuator'
    return 'Other'


def tokenize_source_code(file_path):
    """
    Tokenize a Python source file and print one table row per token.

    Each row shows the lexeme, the tokenize type name, the category assigned
    by this tool and the source line on which the token starts.  NEWLINE
    tokens (the end of a logical line) are not printed.

    Args:
        file_path: Path of the Python source file to read.  The file is
            opened in binary mode, as required by ``tokenize.tokenize``.

    Raises:
        OSError: If the file cannot be opened.
        tokenize.TokenError / SyntaxError: If the tokenizer rejects the
            source (propagated unchanged from ``tokenize``).
    """
    with open(file_path, 'rb') as f:
        for token in tokenize.tokenize(f.readline):
            if token.type == tokenize.NEWLINE:
                continue

            lexeme = token.string
            token_type = tokenize.tok_name[token.type]
            # Take the line from the token itself rather than counting
            # NEWLINE tokens: blank lines, comment-only lines and
            # continuation lines do not produce NEWLINE, so a manual counter
            # drifts.  The synthetic ENCODING token starts at line 0; it is
            # reported on line 1.
            line_number = max(token.start[0], 1)
            token_category = _categorize_token(token_type, lexeme)

            # Print token information along with its category
            print(f"| {lexeme:<30} |  {token_type:<10} | {token_category:<40} | {line_number:<10} |")

# Demo run: print the token table for a sample source file.
file_path = "TEST.py"  # change this to the path of the file you want to inspect
tokenize_source_code(file_path=file_path)


| utf-8                          |  ENCODING   | Other                                    | 1          |
| def                            |  NAME       | Keyword                                  | 1          |
| factorial                      |  NAME       | Identifier                               | 1          |
| (                              |  OP         |  punctuator                              | 1          |
| n                              |  NAME       | Identifier                               | 1          |
| )                              |  OP         |  punctuator                              | 1          |
| :                              |  OP         |  punctuator                              | 1          |
|                                |  INDENT     | Other                                    | 2          |
| if                             |  NAME       | Keyword                                  | 2          |
| n                              |  NAME       | Identi