Trying to emulate what lich.rb::XMLParser does to get a handle on how it works

Reference to main xml parsing method

```
   if str = @buffer.slice!(/^[^<]+/)
      text(str.gsub(/&(lt|gt|quot|apos|amp)/) { @unescape[$1] })
   elsif str = @buffer.slice!(/^<\/[^<]+>/)
      element = /^<\/([^\s>\/]+)/.match(str).captures.first
      tag_end(element)
   elsif str = @buffer.slice!(/^<[^<]+>/)
      element = /^<([^\s>\/]+)/.match(str).captures.first
      attributes = Hash.new
      str.scan(/([A-z][A-z0-9_\-]*)=(["'])(.*?)\2/).each { |attr| attributes[attr[0]] = attr[2] }
      tag_start(element, attributes)
      tag_end(element) if str =~ /\/>$/
   else
      break
   end
```

In [60]:
import pathlib
import html
import re

# Read the captured game log from the current working directory, keeping
# each line's trailing newline, then preview the first ten entries.
log = pathlib.Path.cwd().joinpath("tags-sample.log")
with log.open() as infile:
    lines = list(infile)
lines[:10]

['Please wait for connection to game server.\n',
 "<playerID id='440984'/>\n",
 '<mode id="GAME"/><settingsInfo  client="1.0.1.26" major="652" crc=\'1714388481\' instance=\'DR\'/>\n',
 '\n',
 'Welcome to DragonRealms (R) v2.00\n',
 'Copyright 2021 Simutronics Corp.\n',
 'All Rights Reserved\n',
 '<mode id="GAME"/>\n',
 '<app char="Crannach" game="DR" title="[DR: Crannach] StormFront"/>\n',
 '<streamWindow id="main" title="Story" location="center" target="drop" resident="true"/>\n']

In [61]:
# TODO: Visitor pattern?

class XMLParser:
    """Incremental parser for DragonRealms stream tags, translated from lich.rb.

    Feed raw chunks to :meth:`parse`; the parser consumes its internal buffer
    token by token (plain text, end tag, start tag) and updates the tracked
    state (``player_id``, ``game``, ``bold``, ``prompt``, tag/id stacks).
    :meth:`strip` is independent of :meth:`parse` and removes markup from a
    line for display.
    """

    # Patterns used by strip(). The Ruby reference uses the /m flag, which
    # makes "." match newlines; the Python equivalent is re.DOTALL (not
    # re.MULTILINE, which only changes the meaning of ^ and $).
    _PUSH_STREAM_RE = re.compile(r"<pushStream[^>]*\/>")
    _POP_STREAM_RE = re.compile(r"<popStream[^>]*\/>")
    _HIDDEN_STREAM_RE = re.compile(
        r"<pushStream id=[\"'](?:spellfront|inv|bounty|society|speech|talk)[\"'][^>]*\/>.*?<popStream[^>]*>",
        re.DOTALL,
    )
    _SPELLS_STREAM_RE = re.compile(r'<stream id="Spells">.*?<\/stream>', re.DOTALL)
    _HIDDEN_ELEMENT_RE = re.compile(
        r"<(compDef|inv|component|right|left|spell|prompt)[^>]*>.*?<\/\1>",
        re.DOTALL,
    )
    _ANY_TAG_RE = re.compile(r"<[^>]+>")

    def __init__(self):
        self.buffer = ""
        self.active_tags = []
        self.last_tag = None
        self.active_ids = []
        self.last_id = None
        self.bold = False
        self.player_id = None
        self.game = None
        self.current_stream = ""
        self.current_style = ""
        self.prompt = ""

        # TODO: Rename unesc_re; it actually matches any leading run of
        # non-tag text. Unescaping is just an extra step applied to it.
        self.unesc_re = re.compile(r"^[^<]+")
        self.end_re = re.compile(r"^<\/[^<]+>")
        self.end_info_re = re.compile(r"^<\/([^\s>\/]+)")
        self.start_re = re.compile(r"^<[^<]+>")
        self.start_info_re = re.compile(r"^<([^\s>\/]+)")
        # The reference pattern uses [A-z], which also matches "[\]^`";
        # spell the intended character set out explicitly.
        self.attr_re = re.compile(r"([A-Za-z_][A-Za-z0-9_\-]*)=([\"'])(.*?)\2")

        # Accumulates lines for strip() while a pushStream is still open.
        self._strip_xml_multiline = ""

    def text(self, text_string):
        """Handle a run of (already unescaped) text between tags.

        When the innermost open tag is ``prompt`` the text is stored as the
        current prompt. The text is returned unchanged.
        """
        if self.active_tags and self.active_tags[-1] == 'prompt':
            self.prompt = text_string
        return text_string

    def parse(self, line):
        """Append ``line`` to the buffer and consume every complete token.

        Tokens are processed in order until nothing at the head of the
        buffer matches: leading text goes to :meth:`text`, ``</name>`` calls
        :meth:`tag_end`, and ``<name ...>`` calls :meth:`tag_start` (followed
        by :meth:`tag_end` only when the tag is self-closing). An incomplete
        tag stays in the buffer until a later call completes it.

        Returns the ``line`` argument unchanged.
        """
        self.buffer += line
        while True:
            text_match = self.unesc_re.match(self.buffer)
            if text_match:
                self.buffer = self.buffer[text_match.end():]
                self.text(html.unescape(text_match.group(0)))
                continue

            # End tags must be tested first: start_re would match them too.
            end_match = self.end_re.match(self.buffer)
            if end_match:
                self.buffer = self.buffer[end_match.end():]
                info = self.end_info_re.match(end_match.group(0))
                if info:
                    self.tag_end(info.group(1))
                continue

            start_match = self.start_re.match(self.buffer)
            if start_match:
                tag = start_match.group(0)
                self.buffer = self.buffer[start_match.end():]
                info = self.start_info_re.match(tag)
                if info:
                    element = info.group(1)
                    # Scan only this tag so attributes of neighbouring tags
                    # on the same line are not attributed to it.
                    attributes = {
                        attr.group(1): attr.group(3)
                        for attr in self.attr_re.finditer(tag)
                    }
                    self.tag_start(element, attributes)
                    if tag.endswith("/>"):
                        self.tag_end(element)
                continue

            # Nothing complete left at the head of the buffer.
            break
        return line

    def tag_start(self, name, attributes):
        """Record an opening tag and update state derived from it.

        ``name`` is the element name and ``attributes`` a dict of its
        attribute strings.
        """
        self.active_tags.append(name)
        # Push an entry for every tag (None when it has no id) so the id
        # stack stays aligned with the tag stack when tag_end pops both.
        self.active_ids.append(attributes.get('id'))

        if name == "pushBold":
            self.bold = True
        elif name == "popBold":
            self.bold = False
        elif name == 'playerID':
            # Keep the previous value if the tag unexpectedly lacks an id.
            self.player_id = attributes.get('id', self.player_id)
        elif name == "settingsInfo":
            if 'instance' in attributes:
                self.game = attributes['instance']

    def tag_end(self, name):
        """Close the innermost open tag, remembering it and its id.

        ``name`` is accepted for symmetry with :meth:`tag_start`; the stacks
        are simply popped regardless of its value.
        """
        if self.active_tags:
            self.last_tag = self.active_tags.pop()
        if self.active_ids:
            self.last_id = self.active_ids.pop()

    def strip(self, line: str) -> str:
        """Return ``line`` with markup removed, or ``""`` if nothing remains.

        While a ``<pushStream .../>`` has no matching ``<popStream .../>``
        the input is accumulated and ``""`` is returned; once balanced, the
        accumulated text is processed as a whole. A bare ``"\\r\\n"`` is
        returned unchanged.
        """
        if line == "\r\n":
            return line

        if self._strip_xml_multiline:
            line = self._strip_xml_multiline + line
        pushes = len(self._PUSH_STREAM_RE.findall(line))
        pops = len(self._POP_STREAM_RE.findall(line))
        if pushes > pops:
            # Still inside an open stream: hold the text back for now.
            self._strip_xml_multiline = line
            return ""
        self._strip_xml_multiline = ""

        line = self._HIDDEN_STREAM_RE.sub("", line)
        line = self._SPELLS_STREAM_RE.sub("", line)
        line = self._HIDDEN_ELEMENT_RE.sub("", line)
        line = self._ANY_TAG_RE.sub("", line)
        line = html.unescape(line)
        if not line.strip():
            return ""
        return line

    def reset(self):
        """Clear the current stream/style and the open tag and id stacks."""
        self.current_stream = ""
        self.current_style = ""
        self.active_tags = []
        self.active_ids = []

In [62]:
# Run every captured log line through a fresh parser so that its tracked
# state reflects the whole sample.
parser = XMLParser()
for line in lines:
    parser.parse(line)

# Sanity check against the <playerID .../> tag present in the sample log.
assert parser.player_id == "440984"


Strip xml from output

Reference:

```
def strip_xml(line)
   return line if line == "\r\n"

   if $strip_xml_multiline
      $strip_xml_multiline = $strip_xml_multiline + line
      line = $strip_xml_multiline
   end
   if (line.scan(/<pushStream[^>]*\/>/).length > line.scan(/<popStream[^>]*\/>/).length)
      $strip_xml_multiline = line
      return nil
   end
   $strip_xml_multiline = nil

   line = line.gsub(/<pushStream id=["'](?:spellfront|inv|bounty|society|speech|talk)["'][^>]*\/>.*?<popStream[^>]*>/m, '')
   line = line.gsub(/<stream id="Spells">.*?<\/stream>/m, '')
   line = line.gsub(/<(compDef|inv|component|right|left|spell|prompt)[^>]*>.*?<\/\1>/m, '')
   line = line.gsub(/<[^>]+>/, '')
   line = line.gsub('&gt;', '>')
   line = line.gsub('&lt;', '<')

   return nil if line.gsub("\n", '').gsub("\r", '').gsub(' ', '').length < 1
   return line
end
```

In [63]:
# Collect the display text of every log line, skipping lines for which
# strip() yields nothing.
processed_lines = []
for line in lines:
    res = parser.strip(line)
    if not res:
        continue
    processed_lines.append(res)

In [65]:
# Preview the first ten stripped entries (shown as the notebook cell output).
processed_lines[:10]

['Please wait for connection to game server.\n',
 'Welcome to DragonRealms (R) v2.00\n',
 'Copyright 2021 Simutronics Corp.\n',
 'All Rights Reserved\n',
 "Your worn items are:\n  some polished thin-edged zills with silvered esoteric filigree\n  a floppy boar-hide hat\n  a lumium ring helm\n  a kyanite gwethdesuan\n  a dull serpent earcuff\n  a dull serpent earcuff\n  a lumium ring mask\n  a gaethzen sphere\n  a dark watersilk bag bearing a detailed cambrinth medallion\n  a soft white thigh quiver of snow leopard fur artfully wrapped with silver thread\n  a narrow gold bracer engraved with the seal of the Lunar Accord\n  a crystal-inset oaken staff surmounted with a lumpy spleen\n  a heavy silk duffel bag\n  a light cherry forester's crossbow with a slender stock\n  a large hunting pack crafted from wyvern hide\n  an elegant charcoal black linen shirt embellished with monogrammed black ruby intaglio cufflinks\n  a double-breasted grey rabbit's hide greatcoat trimmed with soft leather\n