In [27]:
import re
import time
import csv   

In [23]:
class Category:
    """Bundle of the English, Hungarian and Slovak variants of one category.

    Attributes:
        key: identifier of the category bundle.
        englishCategory / hungarianCategory / slovakCategory: per-language
            records. Any object without a ``label`` attribute is treated as
            "missing" by ``toString``.
    """

    def __init__(self, key, englishCategory, hungarianCategory, slovakCategory):
        # `key` used to be accepted but never stored, which made toString()
        # fail with AttributeError on `self.key`.
        self.key = key
        self.englishCategory = englishCategory
        self.hungarianCategory = hungarianCategory
        self.slovakCategory = slovakCategory

    def toString(self):
        """Return a multi-line, human-readable description of the bundle."""

        def describe(category):
            # A language variant counts as present only if it exposes `label`.
            return category.toString() if hasattr(category, 'label') else "Missing"

        return ("Key: " + str(self.key)
                + "\nEnglishCategory: " + describe(self.englishCategory)
                + "\nHungarianCategory: " + describe(self.hungarianCategory)
                + "\nSlovakCategory: " + describe(self.slovakCategory))
        
class Category_Language:
    """One category in a single language.

    Holds the DBpedia URI, the label, the Wikipedia URI and the list of
    broader categories exactly as passed to the constructor.
    """

    def __init__(self, dbpediaUri, label, wikipediaUri, broaderCategories):
        self.dbpediaUri, self.label = dbpediaUri, label
        self.wikipediaUri, self.broaderCategories = wikipediaUri, broaderCategories

    def toString(self):
        """Return the four fields as an indented, newline-separated string."""
        pieces = [
            "\n\tDbPediaUri: ", self.dbpediaUri,
            "\n\tLabel: ", self.label,
            "\n\tWikipediaUri: ", self.wikipediaUri,
            "\n\tBroaderCategories: ", str(self.broaderCategories),
        ]
        return "".join(pieces)
    
def replaceUnicode(string):
    """Replace literal ``\\uXXXX`` escape sequences in *string* with characters.

    Only the escapes listed in the table below are handled (Hungarian and
    Slovak letters plus the en dash, which is turned into a plain hyphen).
    Matching is case-sensitive on the hex digits; anything else is left as is.
    """
    # escape sequence as it appears in the text -> replacement text
    replacements = {
        "\\u00C1": "Á", "\\u00C9": "É", "\\u00CD": "Í", "\\u00D3": "Ó",
        "\\u00D6": "Ö", "\\u0150": "Ő", "\\u00DA": "Ú", "\\u00DC": "Ü",
        "\\u0170": "Ű", "\\u00E1": "á", "\\u00E9": "é", "\\u00ED": "í",
        "\\u00F3": "ó", "\\u00F6": "ö", "\\u0151": "ő", "\\u00FA": "ú",
        "\\u00FC": "ü", "\\u0171": "ű", "\\u00C4": "Ä", "\\u00D4": "Ô",
        "\\u00DD": "Ý", "\\u010C": "Č", "\\u010E": "Ď", "\\u0139": "Ĺ",
        "\\u013D": "Ľ", "\\u0147": "Ň", "\\u0154": "Ŕ", "\\u0160": "Š",
        "\\u0164": "Ť", "\\u017D": "Ž", "\\u01F1": "DZ", "\\u01C4": "DŽ",
        "\\u00E4": "ä", "\\u00F4": "ô", "\\u00FD": "ý", "\\u010D": "č",
        "\\u010F": "ď", "\\u013A": "ĺ", "\\u013E": "ľ", "\\u0148": "ň",
        "\\u0155": "ŕ", "\\u0161": "š", "\\u0165": "ť", "\\u017E": "ž",
        "\\u01F3": "dz", "\\u01C6": "dž", "\\u2013": "-",
    }
    result = string
    for escaped, plain in replacements.items():
        result = result.replace(escaped, plain)
    return result

In [6]:
def file_len(fname):
    """Return the number of lines in the text file *fname*.

    An empty file yields 0. The previous enumerate-based loop left its
    counter unbound for an empty file and raised UnboundLocalError.
    """
    with open(fname) as f:
        return sum(1 for _ in f)

In [7]:
# Line counts of every .nq input file, computed once up front.
# The targets and the paths below are listed in matching order.
(
    category_labels_en_len,
    category_labels_hu_len,
    category_labels_en_uris_hu_len,
    category_labels_en_uris_sk_len,
    category_labels_sk_len,
    skos_categories_en_len,
    skos_categories_hu_len,
    skos_categories_sk_len,
) = map(file_len, (
    "Data/category_labels_en.nq",
    "Data/category_labels_hu.nq",
    "Data/category_labels_en_uris_hu.nq",
    "Data/category_labels_en_uris_sk.nq",
    "Data/category_labels_sk.nq",
    "Data/skos_categories_en.nq",
    "Data/skos_categories_hu.nq",
    "Data/skos_categories_sk.nq",
))

In [8]:
# Every <...> term on a line; group 1 is the text between the angle brackets.
# Raw strings avoid the invalid "\<" / "\>" string escapes of the old pattern.
urls = re.compile(r"<(.*?)>")
# Every double-quoted literal on a line; group 1 is its content. A backslash
# escape such as \" inside the literal no longer ends the match early, so
# labels that contain quotes are captured whole (escapes are left as written).
label = re.compile(r'"((?:[^"\\]|\\.)*)"')

In [9]:
# Module-level list, initialised empty. None of the cells visible here reads
# or appends to it -- NOTE(review): possibly a leftover; confirm before removing.
category_labels = []

In [38]:
# Output CSV, opened in append mode so rows from earlier runs are kept.
# UTF-8 is set explicitly because the Hungarian/Slovak fields contain
# non-ASCII characters once replaceUnicode() has been applied; the platform
# default encoding may not be able to represent them.
parsed_data = open('Data/parsed_categories.csv', 'a', newline='', encoding='utf-8')
fieldnames = ['en_db', 'en_label', 'en_wiki', 'en_broader',
              'hu_db', 'hu_label', 'hu_wiki', 'hu_broader',
              'sk_db', 'sk_label', 'sk_wiki', 'sk_broader']
writer = csv.DictWriter(parsed_data, fieldnames=fieldnames)
# A file opened for appending is positioned at its end, so position 0 means
# the file is new or empty. Only then write the header; re-running this cell
# used to add a duplicate header row in the middle of the data.
if parsed_data.tell() == 0:
    writer.writeheader()

In [39]:
# NOTE: this cell calls the Find* helpers, which are defined in cells further
# down the notebook; those cells must have been executed before this one.

# How many leading lines of the English label file this cell reads.
LINES_TO_PROCESS = 2


def _language_columns(prefix, category):
    """Return the four CSV columns of one language, keyed '<prefix>_db' etc.

    Attributes missing on *category* (for example when it is None because no
    counterpart was found) are written as empty strings.
    """
    return {
        prefix + '_db': getattr(category, 'dbpediaUri', ""),
        prefix + '_label': getattr(category, 'label', ""),
        prefix + '_wiki': getattr(category, 'wikipediaUri', ""),
        prefix + '_broader': getattr(category, 'broaderCategories', ""),
    }


start_time = time.time()
try:
    # `with` closes the input file; it used to stay open after the cell ran.
    with open("Data/category_labels_en.nq", "r") as category_en:
        for i in range(LINES_TO_PROCESS):
            line = category_en.readline()
            if not line:
                # End of file reached before LINES_TO_PROCESS lines were read.
                break
            if line.startswith('#') or not line.strip():
                # Comment or blank line: nothing to parse.
                continue

            split_urls = urls.findall(line)
            split_label = label.findall(line)
            englishCategory = Category_Language(split_urls[0], split_label[0], split_urls[2], [])
            hungarianWikipediaUri = FindWikipediaUriInHungarianUrls(split_urls[0])
            slovakWikipediaUri = FindWikipediaUriInSlovakUrls(split_urls[0])

            # Each lookup may return None even when a Wikipedia URI was found,
            # so the category object itself is checked below, not the URI.
            hungarianCategory = FindHungarianCategory(hungarianWikipediaUri) if hungarianWikipediaUri else None
            slovakCategory = FindSlovakCategory(slovakWikipediaUri) if slovakWikipediaUri else None

            englishCategory.broaderCategories = FindEnglishBroaderCategories(englishCategory.label)
            if hungarianCategory is not None:
                hungarianCategory.broaderCategories = FindHungarianBroaderCategories(hungarianCategory.label)
            if slovakCategory is not None:
                slovakCategory.broaderCategories = FindSlovakBroaderCategories(slovakCategory.label)

            row = _language_columns('en', englishCategory)
            row.update(_language_columns('hu', hungarianCategory))
            row.update(_language_columns('sk', slovakCategory))
            writer.writerow(row)
            print("Done: ", i)
finally:
    # Close (and thereby flush) the CSV even if a lookup above raised.
    parsed_data.close()
print("--- %s seconds ---" % (time.time() - start_time))

Done:  1
--- 1.3870279788970947 seconds ---


In [12]:
def FindWikipediaUriInHungarianUrls(uri):
    """Look up *uri* in Data/category_labels_en_uris_hu.nq.

    Scans the file line by line, skipping lines that start with '#'. For the
    first line whose first <...> term equals *uri*, the third <...> term of
    that line is returned.

    Returns:
        The matching line's third <...> term, or None when no line matches.
    """
    # `with` closes the handle on every exit path; the file used to be left
    # open on each call. Iterating the file directly also removes the
    # dependency on a precomputed line count.
    with open("Data/category_labels_en_uris_hu.nq", "r") as category_uris_hu:
        for line in category_uris_hu:
            if line.startswith('#'):
                continue
            split_urls = urls.findall(line)
            if len(split_urls) < 3:
                # Blank or incomplete line: nothing to compare or return.
                continue
            if split_urls[0] == uri:
                return split_urls[2]
    return None

In [13]:
def FindWikipediaUriInSlovakUrls(uri):
    """Look up *uri* in Data/category_labels_en_uris_sk.nq.

    Scans the file line by line, skipping lines that start with '#'. For the
    first line whose first <...> term equals *uri*, the third <...> term of
    that line is returned.

    Returns:
        The matching line's third <...> term, or None when no line matches.
    """
    # `with` closes the handle on every exit path; the file used to be left
    # open on each call. Iterating the file directly also removes the
    # dependency on a precomputed line count.
    with open("Data/category_labels_en_uris_sk.nq", "r") as category_uris_sk:
        for line in category_uris_sk:
            if line.startswith('#'):
                continue
            split_urls = urls.findall(line)
            if len(split_urls) < 3:
                # Blank or incomplete line: nothing to compare or return.
                continue
            if split_urls[0] == uri:
                return split_urls[2]
    return None

In [14]:
def FindHungarianCategory(uri):
    """Build the Hungarian Category_Language whose third <...> term is *uri*.

    Scans Data/category_labels_hu.nq, skipping lines that start with '#'.
    For the first line whose third <...> term equals *uri*, a
    Category_Language is built from the line's first <...> term, its first
    quoted label and that third term, each passed through replaceUnicode().
    The broader-category list starts out empty.

    Returns:
        The Category_Language, or None when no line matches.
    """
    # `with` closes the handle on every exit path; it used to be leaked.
    with open("Data/category_labels_hu.nq", "r") as category_hu:
        for line in category_hu:
            if line.startswith('#'):
                continue
            split_urls = urls.findall(line)
            if len(split_urls) < 3:
                # Blank or incomplete line: there is no third term to compare.
                continue
            wikiPediaUri = split_urls[2]
            if wikiPediaUri != uri:
                continue
            # The label is only extracted for the line that actually matches.
            split_label = label.findall(line)
            return Category_Language(replaceUnicode(split_urls[0]),
                                     replaceUnicode(split_label[0]),
                                     replaceUnicode(wikiPediaUri),
                                     [])
    return None

In [15]:
def FindSlovakCategory(uri):
    """Build the Slovak Category_Language whose third <...> term is *uri*.

    Scans Data/category_labels_sk.nq, skipping lines that start with '#'.
    For the first line whose third <...> term equals *uri*, a
    Category_Language is built from the line's first <...> term, its first
    quoted label and that third term, each passed through replaceUnicode().
    The broader-category list starts out empty.

    Returns:
        The Category_Language, or None when no line matches.
    """
    # `with` closes the handle on every exit path; it used to be leaked.
    with open("Data/category_labels_sk.nq", "r") as category_sk:
        for line in category_sk:
            if line.startswith('#'):
                continue
            split_urls = urls.findall(line)
            if len(split_urls) < 3:
                # Blank or incomplete line: there is no third term to compare.
                continue
            wikiPediaUri = split_urls[2]
            if wikiPediaUri != uri:
                continue
            # The label is only extracted for the line that actually matches.
            split_label = label.findall(line)
            return Category_Language(replaceUnicode(split_urls[0]),
                                     replaceUnicode(split_label[0]),
                                     replaceUnicode(wikiPediaUri),
                                     [])
    return None

In [16]:
def FindEnglishBroaderCategories(english_label):
    """Collect the broader/related categories listed for *english_label*.

    Scans Data/skos_categories_en.nq for the first "#prefLabel" line whose
    quoted label equals *english_label*. From the lines that follow, up to the
    next "#prefLabel" line (or the end of the file), the first
    "resource/Category:<name>" occurrence that is not the category itself is
    taken from every line containing "#broader" or "#related".

    Returns:
        List of category names (possibly empty), or None when the label is
        not found in the file.
    """
    # Name of the category as it appears in URIs (spaces become underscores).
    # re.escape covers every regex metacharacter; the old hand-written
    # escaping handled only ( ) * +.
    own_name = re.escape(english_label.replace(" ", "_"))
    # The negative lookahead skips the category's own URI so that only the
    # other category on the line is captured. Compiled once, not per line.
    broader = re.compile(r"(?!resource/Category:" + own_name + r">)resource/Category:(.*?)>")

    # `with` closes the handle on every exit path; it used to be leaked.
    with open("Data/skos_categories_en.nq", "r") as skos_category_en:
        for line in skos_category_en:
            if line.startswith('#') or "#prefLabel" not in line:
                continue
            split_label = label.findall(line)
            if not split_label or split_label[0] != english_label:
                continue

            broader_categories = []
            # Keep reading from the same file position. The loop ends at the
            # next "#prefLabel" line or at end of file; the old `while(1)`
            # never terminated when the category was the last one in the file.
            for line in skos_category_en:
                if "#broader" in line or "#related" in line:
                    broader_category = broader.findall(line)
                    if broader_category:
                        broader_categories.append(broader_category[0])
                if "#prefLabel" in line:
                    break
            return broader_categories
    return None

In [17]:
def FindHungarianBroaderCategories(hungarian_label):
    """Collect the broader/related categories listed for *hungarian_label*.

    Scans Data/skos_categories_hu.nq for the first "#prefLabel" line whose
    quoted label, after replaceUnicode(), equals *hungarian_label*. From the
    lines that follow, up to the next "#prefLabel" line (or the end of the
    file), the first "resource/Kategória:<name>" occurrence that is not the
    category itself is taken from every line containing "#broader" or
    "#related". Those lines are passed through replaceUnicode() first.

    Returns:
        List of category names (possibly empty), or None when the label is
        not found in the file.
    """
    # Name of the category as it appears in URIs (spaces become underscores).
    # re.escape covers every regex metacharacter; the old hand-written
    # escaping handled only ( ) * +.
    own_name = re.escape(hungarian_label.replace(" ", "_"))
    # The negative lookahead skips the category's own URI so that only the
    # other category on the line is captured. Compiled once, not per line.
    broader = re.compile(r"(?!resource/Kategória:" + own_name + r">)resource/Kategória:(.*?)>")

    # `with` closes the handle on every exit path; it used to be leaked.
    with open("Data/skos_categories_hu.nq", "r") as skos_category_hu:
        for line in skos_category_hu:
            if line.startswith('#') or "#prefLabel" not in line:
                continue
            split_label = label.findall(line)
            if not split_label or replaceUnicode(split_label[0]) != hungarian_label:
                continue

            broader_categories = []
            # Keep reading from the same file position. The loop ends at the
            # next "#prefLabel" line or at end of file; the old `while(1)`
            # never terminated when the category was the last one in the file.
            for line in skos_category_hu:
                if "#broader" in line or "#related" in line:
                    # Decode the \uXXXX escapes so the "Kategória:" pattern
                    # and the decoded label can match the URI text.
                    broader_category = broader.findall(replaceUnicode(line))
                    if broader_category:
                        broader_categories.append(broader_category[0])
                if "#prefLabel" in line:
                    break
            return broader_categories
    return None

In [18]:
def FindSlovakBroaderCategories(slovak_label):
    """Collect the broader/related categories listed for *slovak_label*.

    Scans Data/skos_categories_sk.nq for the first "#prefLabel" line whose
    quoted label, after replaceUnicode(), equals *slovak_label*. From the
    lines that follow, up to the next "#prefLabel" line (or the end of the
    file), the first "resource/Kategória:<name>" occurrence that is not the
    category itself is taken from every line containing "#broader" or
    "#related". Those lines are passed through replaceUnicode() first.

    Returns:
        List of category names (possibly empty), or None when the label is
        not found in the file.
    """
    # Name of the category as it appears in URIs (spaces become underscores).
    # re.escape covers every regex metacharacter; the old hand-written
    # escaping handled only ( ) * +.
    own_name = re.escape(slovak_label.replace(" ", "_"))
    # The negative lookahead skips the category's own URI so that only the
    # other category on the line is captured. Compiled once, not per line.
    broader = re.compile(r"(?!resource/Kategória:" + own_name + r">)resource/Kategória:(.*?)>")

    # `with` closes the handle on every exit path; it used to be leaked.
    with open("Data/skos_categories_sk.nq", "r") as skos_category_sk:
        for line in skos_category_sk:
            if line.startswith('#') or "#prefLabel" not in line:
                continue
            split_label = label.findall(line)
            if not split_label or replaceUnicode(split_label[0]) != slovak_label:
                continue

            broader_categories = []
            # Keep reading from the same file position. The loop ends at the
            # next "#prefLabel" line or at end of file; the old `while(1)`
            # never terminated when the category was the last one in the file.
            for line in skos_category_sk:
                if "#broader" in line or "#related" in line:
                    # Decode the \uXXXX escapes so the "Kategória:" pattern
                    # and the decoded label can match the URI text.
                    broader_category = broader.findall(replaceUnicode(line))
                    if broader_category:
                        broader_categories.append(broader_category[0])
                if "#prefLabel" in line:
                    break
            return broader_categories
    return None