Parsing Wikipedia
- Vyparsovanie automobilových vozidiel z Wikipédie a vytvorenie služby, ktorá by podľa zadaného automobilu (podľa jeho kategórií: automobilka, rok výroby, produkcia, trieda, predchodca, typ karosérie) umožňovala vyhľadať podobné automobilové vozidlá danej kategórie.

In [None]:
import json
import bz2
import re

# Module-level accumulator: the main script appends one dict per
# '{{Infobox automobile' block it encounters, and the parsing helpers fill in
# the fields of the most recently appended dict.
vehicles = []

In [None]:
class Vehicle:
    """Simple container for the attributes of a single vehicle.

    The constructor only stores the values it is given; no validation or
    conversion is performed.

    Args:
        name: Vehicle name.
        manufacturer: List of manufacturers. When omitted (or ``None``), each
            instance receives its own new empty list.
        production: Production information.
        vehicle_class: Vehicle class.
        layout: Vehicle layout.
    """

    def __init__(self, name='', manufacturer=None, production='', vehicle_class='', layout=''):
        self.name = name
        # A literal ``[]`` default is created once at definition time and would
        # be shared (and mutated) across every Vehicle built without an
        # explicit manufacturer, so the empty list is created per instance.
        self.manufacturer = [] if manufacturer is None else manufacturer
        self.production = production
        self.vehicle_class = vehicle_class
        self.layout = layout

In [None]:
def parse_parameter_value(parameter: str, text: str) -> str:
    """Return the cleaned value of an infobox parameter found in ``text``.

    The ``| <parameter> =`` prefix (with optional whitespace around the name
    and the equals sign) is removed, everything from the first ``&`` onwards
    is discarded, square brackets and newlines are dropped, and surrounding
    whitespace is stripped.

    Args:
        parameter: Parameter name. It is interpolated into the regular
            expression as-is, so regex metacharacters keep their meaning.
        text: One line of wikitext.

    Returns:
        The cleaned value. If the prefix does not occur in ``text``, the whole
        text goes through the same clean-up.
    """
    # Raw f-string so that \| and \s reach the regex engine unchanged instead
    # of being (invalid) string escapes; doubled braces are the literal {1}.
    pattern = rf"(\|){{1}}(\s)*({parameter}){{1}}(\s)*(=){{1}}(\s)*"
    value = re.sub(pattern, '', text)
    # Keep only the part before the first '&' -- presumably this cuts off
    # XML-escaped markup such as '&lt;ref&gt;' (verify against the dump).
    value = re.split('[&]', value)[0]
    value = value.replace('[', '').replace(']', '').replace('\n', '')
    # Strip both sides: an indented '| name = x' line would otherwise keep the
    # indentation in front of the value.
    return value.strip()


def find_parameter(parameter: str, text: str):
    """Search ``text`` for an infobox assignment of the form ``| <parameter> =``.

    Whitespace is allowed between the pipe, the parameter name and the equals
    sign; trailing whitespace after the equals sign is included in the match.

    Args:
        parameter: Parameter name. It is interpolated into the regular
            expression as-is, so regex metacharacters keep their meaning.
        text: Text to search (typically one line of wikitext).

    Returns:
        The ``re.Match`` for the first occurrence, or ``None`` if there is none.
    """
    # Raw f-string so that \| and \s reach the regex engine unchanged instead
    # of being (invalid) string escapes; doubled braces are the literal {1}.
    pattern = rf"(\|){{1}}(\s)*({parameter}){{1}}(\s)*(=){{1}}(\s)*"
    return re.search(pattern, text)


def get_vehicle_parameters(vehicles: list, text: str):
    """Copy any recognised infobox parameters from ``text`` into ``vehicles[-1]``.

    The text is lower-cased first. For every parameter that is present, the
    corresponding key of the last dict in ``vehicles`` is overwritten:

    * ``name``            -> cleaned string
    * ``manufacturer``    -> list, split on ``|``
    * ``class``           -> list, split on ``|`` or ``/``
    * ``production``      -> ``production_year``: first 4-digit number starting
      with 1 or 2, as ``int`` (left untouched when no such number is found)
    * ``layout``          -> list, split on ``|``, ``/``, ``,`` or ``#``

    Args:
        vehicles: List of vehicle dicts; only the last element is modified.
        text: One line of infobox wikitext.
    """
    lowered = text.lower()

    def split_and_trim(raw, separators):
        # Break the raw value on the given separator class and trim each piece.
        return [str(piece).strip() for piece in re.split(separators, raw)]

    if find_parameter('name', lowered):
        vehicles[-1]['name'] = parse_parameter_value('name', lowered)

    if find_parameter('manufacturer', lowered):
        raw_manufacturers = parse_parameter_value('manufacturer', lowered)
        vehicles[-1]['manufacturer'] = split_and_trim(raw_manufacturers, '[|]')

    if find_parameter('class', lowered):
        raw_classes = parse_parameter_value('class', lowered)
        vehicles[-1]['class'] = split_and_trim(raw_classes, '[|/]')

    if find_parameter('production', lowered):
        raw_production = parse_parameter_value('production', lowered)
        found_years = re.findall(r'([1-2][0-9]{3})', raw_production)
        # Only the first year mentioned is kept.
        if found_years:
            vehicles[-1]['production_year'] = int(found_years[0])

    if find_parameter('layout', lowered):
        raw_layouts = parse_parameter_value('layout', lowered)
        vehicles[-1]['layout'] = split_and_trim(raw_layouts, '[|/,#]')

In [9]:
if __name__ == "__main__":
    # Scan a bz2-compressed Wikipedia XML dump line by line, collect one dict
    # per '{{Infobox automobile' block into the module-level `vehicles` list,
    # and write the result to vehicles.json.
    #
    # Dump paths that have been used as input:
    # dataset/enwiki-20220920-pages-meta-current10.xml-p4045403p5399366.bz2
    # dataset/enwiki-20220920-pages-meta-current.xml.bz2
    #
    # No `global` statement is needed here: this code already runs at module
    # level, so `vehicles` and `json_file` are module globals anyway.
    with bz2.BZ2File('dataset/enwiki-20220920-pages-meta-current.xml.bz2') as f:
        # True while the current line lies inside an automobile infobox.
        flag = False
        for row in f:
            row = row.decode("utf-8")
            stripped = row.strip()
            if stripped == '{{Infobox automobile':
                # Start of a new infobox: add an empty record that the
                # following parameter lines will fill in.
                flag = True
                vehicles.append({
                    'name': '',
                    'manufacturer': [],
                    'class': [],
                    'layout': [],
                    'production_year': ''
                    })
                continue
            elif flag and stripped == '}}':
                # A line consisting only of '}}' ends the infobox.
                flag = False

            if flag:
                get_vehicle_parameters(vehicles=vehicles, text=row)

    # The `with` blocks close both files, so no explicit close() is required.
    with open('vehicles.json', 'w') as json_file:
        json.dump(vehicles, json_file, indent=2)

In [10]:
# Report how many infobox records were collected into `vehicles`.
print(len(vehicles))

8905
