In [16]:
import string
import re
import pprint
os.getcwd()

'/mnt/shared-dev/Development/UniProjects/CompilerConstruction/A2'

In [54]:
class Token(object):
    KEYWORD, ID, SYM, STRCONST, INTCONST, READCONST, ASSIGN, COLON, COMMA, SEMICOLON, DOT, EQ, NQ, LT, LTE, GT, GTE, QUOTE, INTEGER, PLUS, MINUS, MUL, DIV, LPAREN, RPAREN, EOF, TERM, AND, OR, NOT = (
    'KEYWORD', 'ID', 'SYM', 'STRCONST', 'INTCONST', 'READCONST', "ASSIGN",
    "COLON", "COMMA", "SEMICOLON", "DOT", "EQ", "NQ", "LT", "LTE", "GT", "GTE","QUOTE", 'INTEGER', 'PLUS', 'MINUS', 'MUL',
    'DIV', '(', ')', 'EOF', 'TERM', 'AND', 'OR', 'NOT')

    KEYWORDS = ("PROGRAM", "VAR", "DIV", "INTEGER", "REAL", "BEGIN", "END",
            "PROCEDURE")
    
    def __init__(self, type, value, line_no, pos):
        self.type = type
        self.value = value
        self.line_no = line_no
        self.position = pos
        self.inverse = False
        
        self.row = []
    

    def __str__(self):
        """String representation of the class instance.

        Examples:
            Token(TERM, Hello)
            Token(AND, '&')
            Token(NOT, '!')
        """
        return 'Token({type}, {value}, {line_no}, {position})'.format(type=self.type,
                                               value=repr(self.value), line_no=self.line_no, position=self.position)

    def __repr__(self):
        return self.__str__()


class Lexer(object):
    def __init__(self, text):
        # client string input, e.g. "hello | world & (why | are | you)"
        self.text = text
        # self.pos is an index into self.text
        self.pos = 0
        self.current_char = self.text[self.pos]
        self.line_no = 0
        self.symbol_table = {}

    def error(self):
        raise Exception('Invalid character')

    def peek(self):
        if self.pos + 1 < len(self.text):
            return self.text[self.pos + 1]
        else:
            return None

    def advance(self):
        """Advance the `pos` pointer and set the `current_char` variable."""
        self.pos += 1
        if '\n'in [self.current_char]:
            self.line_no += 1
            
        if self.pos > len(self.text) - 1:
            self.current_char = None  # Indicates end of input
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self):
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def integer(self):
        """Return a (multidigit) integer consumed from the input."""
        result = ''
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        return int(result)

    def word(self):
        """Return a (multidigit) integer consumed from the input."""
        result = ''
        while self.current_char is not None and (self.current_char.isalpha()
                                                 or self.current_char == '_'):
            result += self.current_char
            self.advance()
        if result.upper() in Token.KEYWORDS:
            return Token(Token.KEYWORD, str(result), self.line_no, self.pos)
        else:
            self.symbol_table[str(result)] = self.pos
            return Token(Token.ID, str(result), self.line_no, self.pos)

    def get_next_token(self):
        """Lexical analyzer (also known as scanner or tokenizer)

        This method is responsible for breaking a sentence
        apart into tokens. One token at a time.
        """
        while self.current_char is not None:
      
            if self.current_char.isspace():
                self.skip_whitespace()
                continue

            if self.current_char.isdigit():
                return Token(Token.INTCONST, self.integer(), self.line_no, self.pos)

            if self.current_char.isalpha():
                #                 print('Got Identifier  ' + self.current_char)
                return self.word()

            if self.current_char == ':' and self.peek() == '=':
                self.advance()
                self.advance()
                return Token(Token.ASSIGN, ":=", self.line_no, self.pos)

            if self.current_char == ':':
                self.advance()
                return Token(Token.COLON, ":", self.line_no, self.pos)

            if self.current_char == ',':
                self.advance()
                return Token(Token.COMMA, ",", self.line_no, self.pos)

            if self.current_char == ';':
                self.advance()
                return Token(Token.SEMICOLON, ";", self.line_no, self.pos)

            if self.current_char == '.':
                self.advance()
                return Token(Token.DOT, ".", self.line_no, self.pos)
            
            if self.current_char == '!':

                self.advance()
                return Token(Token.NOT, 'NOT', self.line_no, self.pos)

            if self.current_char == '&':

                self.advance()
                return Token(Token.AND, 'AND', self.line_no, self.pos)

            if self.current_char == '|':

                self.advance()
                return Token(Token.OR, 'OR', self.line_no, self.pos)

            if self.current_char == '+':
                self.advance()
                return Token(Token.PLUS, '+', self.line_no, self.pos)

            if self.current_char == '-':
                self.advance()
                return Token(Token.MINUS, '-', self.line_no, self.pos)
            
            if self.current_char == '"':
                self.advance()
                return Token(Token.QUOTE, '"', self.line_no, self.pos)

            if self.current_char == '*':
                self.advance()
                return Token(Token.MUL, '*', self.line_no, self.pos)

            if self.current_char == '/':
                self.advance()
                return Token(Token.DIV, '/', self.line_no, self.pos)

            if self.current_char == '=':
                self.advance()
                return Token(Token.EQ, "=", self.line_no, self.pos)
            
            if self.current_char == '<' and self.peek() == '>':
                self.advance()
                self.advance()
                return Token(Token.NEQ, "<=", self.line_no, self.pos)
            
            if self.current_char == '<':
                self.advance()
                return Token(Token.LT, "<", self.line_no, self.pos)
            
            if self.current_char == '<' and self.peek() == '=':
                self.advance()
                self.advance()
                return Token(Token.LTE, "<=", self.line_no, self.pos)
            
            if self.current_char == '>' :
                self.advance()
                return Token(Token.GT, ">", self.line_no, self.pos)
            
            if self.current_char == '>' and self.peek() == '=':
                self.advance()
                self.advance()
                return Token(Token.GTE, ">=", self.line_no, self.pos)
            
            if self.current_char == '(':
                self.advance()
                return Token(Token.LPAREN, '(', self.line_no, self.pos)

            if self.current_char == ')':
                self.advance()
                return Token(Token.RPAREN, ')', self.line_no, self.pos)
            print("before error ",self.current_char)
            self.error()

        return Token(Token.EOF, None, self.line_no, self.pos)

In [38]:
text = """


"""

lexer = Lexer(text)
token = lexer.get_next_token()
while token.type != EOF:
    print(token)
    token = lexer.get_next_token()

Token(KEYWORD, 'program', 1, 8)
Token(ID, 'checkMyAbility', 1, 23)
Token(SEMICOLON, ';', 1, 24)
Token(KEYWORD, 'var', 2, 28)
Token(ID, 'counter', 3, 36)
Token(COLON, ':', 3, 37)
Token(KEYWORD, 'integer', 3, 45)
Token(SEMICOLON, ';', 3, 46)
Token(ID, 'number', 4, 53)
Token(COLON, ':', 4, 54)
Token(KEYWORD, 'integer', 4, 62)
Token(SEMICOLON, ';', 4, 63)
Token(ID, 'factorial', 5, 73)
Token(COLON, ':', 5, 74)
Token(KEYWORD, 'integer', 5, 82)
Token(SEMICOLON, ';', 5, 83)
Token(ID, 'height', 6, 90)
Token(COLON, ':', 6, 92)
Token(KEYWORD, 'real', 6, 97)
Token(SEMICOLON, ';', 6, 98)
Token(ID, 'width', 7, 104)
Token(COLON, ':', 7, 106)
Token(KEYWORD, 'real', 7, 111)
Token(SEMICOLON, ';', 7, 112)
Token(ID, 'breadth', 8, 120)
Token(COLON, ':', 8, 122)
Token(KEYWORD, 'real', 8, 127)
Token(SEMICOLON, ';', 8, 128)
Token(ID, 'volume', 9, 135)
Token(COLON, ':', 9, 137)
Token(KEYWORD, 'real', 9, 142)
Token(SEMICOLON, ';', 9, 143)
Token(KEYWORD, 'begin', 10, 149)
Token(ID, 'number', 11, 156)
Token(ASSIG

In [55]:
text = ""
with open("./source_code.pas", 'r') as pascal_file:
    text = "".join(pascal_file.readlines())


In [56]:
lexer = Lexer(text)
token = lexer.get_next_token()

with open("pascal_tokens.csv", 'w') as token_file:
    while token.type != EOF:
#         print(token)
        token = lexer.get_next_token()
        token_file.write(f"{token.type}, {token.value}, {token.line_no}, {token.position}\n")

26

20

20

19

16

24

20

18

16

24

20

21

16

24

20

18

16

21

20

18

17

22

21

20

17

22

21

19

17

22

21

23

20

20

21

22

21

20

20

22

23

20

21

19

21

15

21

16

24

20

20

20

16

21

22

21

20

21

18

21

22

22

22

20

20

21

16

21

22

19

20

21

16

21

22

21

20

21

16

22

22

20

20

20

16

19

16

21

22

16

20

15

15

23

17

20

15

21

18

24

20

20

20

16

21

22

22

18

24

16

20

15

15

22

16

20

15

22

18

24

20

20

20

16

21

22

22

22

22

19

14

18

23

16

18

14

22

19

14

20

14

22

19

14

18

16

18

14

22

21

14

23

14

22

19

14

18

18

17

19

16

18

18

14

22

21

14

20

14

22

22

16

19

In [57]:
lexer.symbol_table

{'checkMyAbility': 22,
 'counter': 269,
 'number': 551,
 'factorial': 586,
 'height': 342,
 'width': 350,
 'breadth': 360,
 'volume': 633,
 'while': 201,
 'do': 216,
 'if': 444,
 'and': 382,
 'then': 477,
 'else': 435,
 'or': 460,
 'write': 594,
 'Factorial': 531,
 'of': 534,
 'is': 613,
 'writeln': 626,
 'Some': 600,
 'odd': 604,
 'value': 610}