In [3]:
import re

In [None]:
# Column names per log dataset, keyed by dataset name.
# Every schema has the same frame: it opens with "LineId" and closes with
# "Content", "EventId", "EventTemplate". Only the header columns in between
# differ, so just those are spelled out per dataset below.
# The header columns plus "Content" are exactly the named groups of the
# matching REGEX_PATTERNS entry; "LineId", "EventId" and "EventTemplate"
# have no regex group.
LOG_SCHEMAS = {
    dataset: ["LineId", *header_columns, "Content", "EventId", "EventTemplate"]
    for dataset, header_columns in (
        ("Android", ("Date", "Time", "Pid", "Tid", "Level", "Component")),
        ("Apache", ("Time", "Level")),
        ("BGL", ("Label", "Timestamp", "Date", "Node", "Time", "NodeRepeat", "Type", "Component", "Level")),
        ("Hadoop", ("Date", "Time", "Level", "Process", "Component")),
        ("HDFS", ("Date", "Time", "Pid", "Level", "Component")),
        ("HealthApp", ("Time", "Component", "Pid")),
        ("HPC", ("LogId", "Node", "Component", "State", "Time", "Flag")),
        ("Linux", ("Month", "Date", "Time", "Level", "Component", "PID")),
        ("Mac", ("Month", "Date", "Time", "User", "Component", "PID", "Address")),
        ("OpenSSH", ("Date", "Day", "Time", "Component", "Pid")),
        ("OpenStack", ("Logrecord", "Date", "Time", "Pid", "Level", "Component", "ADDR")),
        ("Proxifier", ("Time", "Program")),
        ("Spark", ("Date", "Time", "Level", "Component")),
        ("Thunderbird", ("Label", "Timestamp", "Date", "User", "Month", "Day", "Time", "Location", "Component", "PID")),
        ("Windows", ("Date", "Time", "Level", "Component")),
        ("Zookeeper", ("Date", "Time", "Level", "Node", "Component", "Id")),
    )
}

In [8]:
# Compiled line-level regex per log type. Each pattern exposes its fields as
# named groups, so `match.groupdict()` yields a {field: text} mapping.
# Android and Apache split fields on whitespace; every other pattern expects
# comma-separated fields (Hadoop/Zookeeper quote Time, OpenStack/Spark quote
# Content).
# NOTE(review): the Apache `Time` group is a lazy `.+?`, so it ends at the
# first whitespace that is followed by a word token and more whitespace; a
# multi-word timestamp would spill into Level/Content. Confirm the expected
# Apache line format.
REGEX_PATTERNS = {
    log_type: re.compile(expression)
    for log_type, expression in {
        "Android": r"(?P<Date>\d{2}-\d{2})\s+(?P<Time>\d{2}:\d{2}:\d{2}\.\d+)\s+(?P<Pid>\d+)\s+(?P<Tid>\d+)\s+(?P<Level>[VDIWEF])\s+(?P<Component>\S+)\s+(?P<Content>.+)",
        "Apache": r"(?P<Time>.+?)\s+(?P<Level>\w+)\s+(?P<Content>.+)",
        "BGL": r"(?P<Label>[^,]+),(?P<Timestamp>\d+),(?P<Date>[\d.]+),(?P<Node>[^,]+),(?P<Time>[^,]+),(?P<NodeRepeat>[^,]+),(?P<Type>[^,]+),(?P<Component>[^,]+),(?P<Level>[^,]+),(?P<Content>.+)",
        "Hadoop": r"(?P<Date>\d{4}-\d{2}-\d{2}),\"(?P<Time>[\d:,]+)\",(?P<Level>\w+),(?P<Process>[^,]+),(?P<Component>[^,]+),(?P<Content>.+)",
        "HDFS": r"(?P<Date>\d+),(?P<Time>\d+),(?P<Pid>\d+),(?P<Level>\w+),(?P<Component>[^,]+),(?P<Content>.+)",
        "HealthApp": r"(?P<Time>\d{8}-\d{2}:\d{2}:\d{2}:\d+),(?P<Component>[^,]+),(?P<Pid>\d+),(?P<Content>.+)",
        "HPC": r"(?P<LogId>\d+),(?P<Node>[^,]+),(?P<Component>[^,]+),(?P<State>[^,]+),(?P<Time>\d+),(?P<Flag>\d+),(?P<Content>.+)",
        "Linux": r"(?P<Month>\w+),(?P<Date>\d+),(?P<Time>[\d:]+),(?P<Level>\w+),(?P<Component>[^(]+)\((?P<PID>[^)]+)\),(?P<Content>.+)",
        "Mac": r"(?P<Month>\w+),(?P<Date>\d+),(?P<Time>[\d:]+),(?P<User>[^,]+),(?P<Component>[^,]+),(?P<PID>\d+),(?P<Address>[^,]*),(?P<Content>.+)",
        "OpenSSH": r"(?P<Date>\w+),(?P<Day>\d+),(?P<Time>[\d:]+),(?P<Component>[^,]+),(?P<Pid>\d+),(?P<Content>.+)",
        "OpenStack": r"(?P<Logrecord>[^,]+),(?P<Date>\d{4}-\d{2}-\d{2}),(?P<Time>[\d:.]+),(?P<Pid>\d+),(?P<Level>\w+),(?P<Component>[^,]+),(?P<ADDR>[^,]+),\"(?P<Content>.+)\"",
        "Proxifier": r"(?P<Time>\d{2}\.\d{2} \d{2}:\d{2}:\d{2}),(?P<Program>[^,]+),(?P<Content>.+)",
        "Spark": r"(?P<Date>\d{2}/\d{2}/\d{2}),(?P<Time>[\d:]+),(?P<Level>\w+),(?P<Component>[^,]+),\"(?P<Content>.+)\"",
        "Thunderbird": r"(?P<Label>[^,]+),(?P<Timestamp>\d+),(?P<Date>[\d.]+),(?P<User>[^,]+),(?P<Month>\w+),(?P<Day>\d+),(?P<Time>[\d:]+),(?P<Location>[^,]+),(?P<Component>[^(]+)\((?P<PID>[^\)]+)\),(?P<Content>.+)",
        "Windows": r"(?P<Date>\d{4}-\d{2}-\d{2}),(?P<Time>\d{2}:\d{2}:\d{2}),(?P<Level>\w+),(?P<Component>[^,]+),(?P<Content>.+)",
        "Zookeeper": r"(?P<Date>\d{4}-\d{2}-\d{2}),\"(?P<Time>[\d:,]+)\",(?P<Level>\w+),(?P<Node>[^,]+),(?P<Component>[^,]+),(?P<Id>\d+),(?P<Content>.+)",
    }.items()
}

In [9]:
class SchemaSpecificParser:
    """Parse single log lines using the regex registered for one log type.

    Both lookups happen once, at construction time:
      pattern -- the REGEX_PATTERNS entry for ``log_type`` (None if absent).
      schema  -- the LOG_SCHEMAS entry for ``log_type`` (None if absent);
                 stored on the instance but not consulted by ``parse``.
    """

    def __init__(self, log_type):
        self.log_type = log_type
        self.schema = LOG_SCHEMAS.get(log_type)
        self.pattern = REGEX_PATTERNS.get(log_type)

    def parse(self, log_line):
        """Match ``log_line`` from its start and return the named groups.

        Returns the match's ``groupdict()`` on success. Failures are reported
        as a one-key dict rather than an exception:
        ``{"error": "No pattern defined"}`` when no pattern is set, and
        ``{"error": "Pattern did not match"}`` when the line does not match.
        """
        compiled = self.pattern
        if compiled:
            # `match` anchors at the start of the line only; text after the
            # matched portion is ignored.
            hit = compiled.match(log_line)
            if hit:
                return hit.groupdict()
            return {"error": "Pattern did not match"}
        return {"error": "No pattern defined"}

In [10]:
# Smoke test: run one whitespace-delimited Android-style line through the
# parser built for the "Android" log type.
log = "03-17 16:13:38.811 1702 2395 D WindowManager Some message here"
parser = SchemaSpecificParser("Android")
parsed_output = parser.parse(log)

# Prints the named-group dict on a match, or an {"error": ...} dict otherwise.
print(parsed_output)

{'Date': '03-17', 'Time': '16:13:38.811', 'Pid': '1702', 'Tid': '2395', 'Level': 'D', 'Component': 'WindowManager', 'Content': 'Some message here'}
