In [1]:
import re
from functools import reduce
import pandas as pd

#global
INPUT_LOCATION = 'day4.txt'
SEPARATOR = '\n'

In [2]:
clean_data = list(map(lambda x: x.rstrip() if len(x) > 1 else x,open(INPUT_LOCATION, 'r').readlines()))  

In [3]:
#passport iterator
class Passporter:
    
    
    keys = [ 'byr', 'iyr', 'eyr', 'hgt', 'hcl', 'ecl', 'pid', 
    # 'cid' is ignored
    ]
    data_format = re.compile(r'(\w{3}):(\S*)')
    
    def __init__(self, p):
        self.p = p
        self.index = 0
        self.current = []
    
    def __iter__(self):
        self.index = 0
        return self
    
    def __next__(self):
        if self.index == len(self.p):
            raise StopIteration 
        self.current = []
        
        while (self.p[self.index] != SEPARATOR):
            self.current.append(self.p[self.index])
            self.index += 1
            if self.index == len(self.p):
                raise StopIteration 
        
        if len(self.current) > 0:
            list_tuples = self.data_format.findall(" ".join(self.current))
            list_dict = list( map(lambda x: {x[0]: x[1]}, list_tuples))
            pas = reduce(lambda x, y: {**x, **y}, list_dict)
            return pas
        self.index += 1
        return self.__next__()


In [4]:
p = pd.DataFrame(iter(Passporter(clean_data)))
p.shape

(279, 8)

In [5]:

# iteraror which returns passport with valid fields only
class ValidFieldIterator(Passporter):
   def __next__(self):
        pas = super().__next__()
        if len(pas) > 6 : 
            if all (k in pas.keys() for k in self.keys):
                return  pas
        return self.__next__()

In [6]:
p = pd.DataFrame(iter(ValidFieldIterator(clean_data)))
p.shape

(213, 8)

In [50]:
# iteraror which returns passport with valid values only
class ValidValuesIterator(ValidFieldIterator):
    HCL_PATTERN = r'#[\w\d]{6}'
    PID_PATTERN = r'\d{9}'
    EYECOLOR_PATTERN = ['amb', 'blu', 'brn', 'gry', 'grn', 'hzl', 'oth']
    LOG = []

    def __apply_rules__(self, passport):
        rules = [
                self.chk_pattern(self.HCL_PATTERN, passport['hcl'],'hcl') != None,
                self.chk_pattern(self.PID_PATTERN,passport['pid'],'pid') != None,
                passport['ecl'] in self.EYECOLOR_PATTERN,
                self.is_beetween(passport['byr'], 1920, 2002, 'byr'),
                self.is_beetween(passport['iyr'], 2010, 2020, 'iyr'),
                self.is_beetween(passport['eyr'], 2020, 2030, 'eyr'),
                self.chk_hgt(passport['hgt']),   
        ]

        if all(value == True for value in rules):
            return True
        return False

    def is_beetween(self,data,min,max,um=''):
            data=int(data)
            if data >= min and data <= max : return True
            self.log(um ,'out of bound', data )
            return False    

    def chk_pattern(self, pattern, data, dataname = ''):
        match = re.search(pattern, data)
        if (match == None):
            self.log(dataname ,'invalid format', data )
            return False
        return True
    
    def log(self, dataname, message, data='', verbose = False):
        self.LOG.append({'file index': self.index,'error': message, 'dataname':dataname,'data':data})
        if(verbose):
            print('[', self.index,  dataname ,']\t::',message,'\t::', data )



    def chk_hgt(self,hgt):
        try:
            height = re.search(r'([0-9]+)(cm|in)',hgt)
            if height.group(2) == 'cm': 
                return self.is_beetween(int(height.group(1)),150,193,'cm')
            if height.group(2) == 'in': 
                return self.is_beetween(int(height.group(1)),59,76,'in')
        except:
            self.log('hgt','invalid format',hgt)
        return False

    def __next__(self):
        pas = super().__next__()
        if self.__apply_rules__(pas) : 
            return  pas
        return self.__next__()

In [57]:
p = iter(ValidValuesIterator(clean_data))

In [58]:
# log parsed passports problems...
count =0
valid = pd.DataFrame(p)
log = pd.DataFrame(p.LOG)
log

Unnamed: 0,file index,error,dataname,data
0,18,out of bound,eyr,1924
1,36,out of bound,cm,64
2,62,invalid format,pid,#430c70
3,62,out of bound,cm,75
4,68,invalid format,hcl,z
...,...,...,...,...
616,1034,out of bound,iyr,2028
617,1034,out of bound,eyr,1960
618,1043,invalid format,pid,15714997
619,1043,out of bound,eyr,1993


In [60]:
# all valid passports...
valid

Unnamed: 0,pid,eyr,ecl,hgt,iyr,byr,hcl,cid
0,937877382,2029,amb,187cm,2019,1933,#888785,
1,138000309,2024,oth,183cm,2013,1992,#7d3b0c,135
2,897123249,2030,amb,165cm,2011,1948,#18171d,99
3,827609097,2029,gry,72in,2017,1963,#cfa07d,
4,432183209,2028,hzl,152cm,2016,1984,#6b5442,
...,...,...,...,...,...,...,...,...
148,802041641,2028,brn,184cm,2013,1969,#c0946f,
149,866047540,2030,amb,64in,2014,1951,#733820,59
150,671492881,2023,brn,69in,2015,1962,#623a2f,
151,924256973,2028,oth,154cm,2020,1950,#b6652a,
