In [26]:
import time, os, glob,re

import pyspark.sql.functions as sparkfunctions
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkContext

# Locations of the raw dump files.
# NOTE(review): none of the three *_DUMPFILE_PATH constants is referenced in the
# visible cells of this notebook - confirm whether they are still needed.
MEDIAWIKI_ARTICLES_DUMPFILE_PATH = 'data/enwiki-latest-pages-articles1.xml'
MEDIAWIKI_ABSTRACTS_DUMPFILE_PATH = 'data/enwiki-latest-abstract.xml'
# NOTE(review): unlike the two paths above, this one starts with '/' (absolute) -
# confirm that this is intended and not a typo for 'data/...'.
DBPEDIA_ABSTRACTS_DUMPFILE_PATH = '/data/long-abstracts_lang=en.ttl'

# Locations of small sample inputs (not referenced in the visible cells).
WIKI_ABSTRACTS_SAMPLE_PATH = 'data/raw_sample_data/wiki_abstracts_sample.xml'
DBPEDIA_ABSTRACTS_SAMPLE_PATH = 'data/raw_sample_data/dbpedia_abstracts'

# Output directory used by the "Parse Single dump" cell. It is a relative path,
# so where it lands depends on the process working directory at write time.
PARSED_ABSTRACTS_PATH = '../../data/document_base/parsed_abstracts'

# Shared Spark session for the whole notebook, plus its SparkContext.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [24]:
def check_object(obj):
    """Normalise a possibly-missing text value to a string.

    Args:
        obj: Any value; typically the text content of a parsed XML element.

    Returns:
        ``obj`` itself when it is a non-empty ``str``; the empty string ``''``
        for everything else (``None``, non-string values, or ``''``).
    """
    is_usable_text = isinstance(obj, str) and obj != ''
    return obj if is_usable_text else ''

def parse_wiki_articles_file(spark, path):
    """Extract a (title, abstract) DataFrame from a MediaWiki pages-articles dump.

    Each ``<page>`` element is read with the spark-xml data source, keeping only
    its title and the revision text. The abstract is the regex match that starts
    at the first ``'''`` marker, continues over characters that are not ``=``,
    and stops right before a ``= ... =`` heading line. The matched text still
    includes the leading ``'''`` and has its newlines replaced by spaces.

    Args:
        spark: Active SparkSession used to read the file.
        path: Path of the XML dump file to parse.

    Returns:
        DataFrame with string columns ``title`` and ``abstract``. Rows with a
        null value (for example, no abstract match) are dropped, as are rows
        whose title contains "Wikipedia:Articles for deletion".
    """
    abstract_pattern = r"'''([^\=]*)(?=(={1,6})([^\n]+?)(={1,6})[ \t]*(\n|\Z))"

    # Build the nested schema inside-out: page -> revision -> text -> _VALUE.
    text_type = StructType([StructField('_VALUE', StringType())])
    revision_type = StructType([StructField('text', text_type)])
    customSchema = StructType([
        StructField('title', StringType()),
        StructField('revision', revision_type),
    ])

    def title_and_abstract(page):
        """Map one page row to ``(title, abstract-or-None)``."""
        page_title = page['title']
        # check_object turns a missing/non-string text value into '' so that
        # re.search always receives a string.
        wikitext = check_object(page['revision']['text']['_VALUE'])
        found = re.search(pattern=abstract_pattern,
                          string=wikitext,
                          flags=re.MULTILINE | re.DOTALL)
        if found is None:
            return (page_title, None)
        return (page_title, found.group(0).replace('\n', ' '))

    reader = spark.read.format('com.databricks.spark.xml').options(rowTag='page')
    pages = reader.load(path, schema=customSchema)

    extracted = pages.rdd.map(title_and_abstract).toDF(['title', 'abstract'])
    with_abstract = extracted.dropna()
    is_deletion_page = with_abstract.title.contains("Wikipedia:Articles for deletion")
    return with_abstract.filter(~is_deletion_page)

In [None]:
def parse_wiki_abstracts_file(spark, path):
    """Extract a (title, abstract) DataFrame from a Wikipedia abstracts dump.

    Each ``<doc>`` element is read with the spark-xml data source. The
    ``Wikipedia: `` prefix is stripped from the title; the abstract is passed
    through unchanged.

    A title that is missing or does not start with ``Wikipedia: `` is mapped to
    ``None`` and the row is removed by ``dropna()``. This mirrors the other
    parsers in this notebook and prevents one malformed record from raising on
    ``.group(1)`` and aborting the whole job.

    Args:
        spark: Active SparkSession used to read the file.
        path: Path of the XML abstracts dump to parse.

    Returns:
        DataFrame with columns ``title`` and ``abstract`` and no null values.
    """
    title_pattern = r"^Wikipedia: ([^\n]*)"
    customSchema = StructType([
        StructField('title', StringType(), False),
        StructField('abstract', StringType(), False)
    ])
    df = spark.read.format("com.databricks.spark.xml")\
        .options(rowTag='doc').load(path, schema=customSchema)

    def to_title_and_abstract(doc):
        """Map one doc row to ``(clean-title-or-None, abstract)``."""
        raw_title = doc['title']
        # Guard both failure modes: a null title and a title without the prefix.
        matched = (re.search(pattern=title_pattern, string=raw_title)
                   if isinstance(raw_title, str) else None)
        # The capture group excludes '\n' by construction, so no newline
        # clean-up is needed on the extracted title.
        clean_title = matched.group(1) if matched is not None else None
        return (clean_title, doc['abstract'])

    all_wiki_abstracts = df.rdd.map(to_title_and_abstract).toDF(['title', 'abstract'])
    valid_abstracts = all_wiki_abstracts.dropna()
    return valid_abstracts

def parse_dbpedia_abstract_file(spark, path):
    """Extract a (title, abstract) DataFrame from a DBpedia long-abstracts file.

    The file is read line by line. For each line the title is taken from the
    ``<http://dbpedia.org/resource/...>`` subject IRI, with underscores replaced
    by spaces, and the abstract is the first double-quoted string literal.

    The literal pattern accepts backslash escapes, so an abstract containing an
    escaped quote (``\\"``) is captured in full instead of being cut off at that
    quote. Escape sequences are kept as they appear in the file; they are not
    decoded.

    Args:
        spark: Active SparkSession; its SparkContext is used to read the file.
        path: Path of the text file to parse.

    Returns:
        DataFrame with columns ``title`` and ``abstract``. Lines where either
        part cannot be extracted are dropped.
    """
    title_pattern = re.compile(r"^<http:\/\/dbpedia\.org\/resource\/([^\>]*)> <")
    # Body of the literal: any char except '"' and '\', or a backslash escape pair.
    abstract_pattern = re.compile(r'"((?:[^"\\]|\\.)*)"')

    def to_title_and_abstract(line):
        """Map one input line to ``(title-or-None, abstract-or-None)``."""
        title = title_pattern.search(line)
        abstract = abstract_pattern.search(line)
        return (
            title.group(1).replace('_', ' ') if title is not None else None,
            abstract.group(1) if abstract is not None else None,
        )

    lines = spark.sparkContext.textFile(path)
    abstracts = lines.map(to_title_and_abstract).toDF(['title', 'abstract'])
    return abstracts.dropna()

def get_xml_filenames(file_dir):
    """List the uncompressed XML dump files found directly inside ``file_dir``.

    A file qualifies when its name matches the glob ``*.xml*`` and does not
    contain ``.bz2`` (compressed dumps are skipped).

    The process working directory is left untouched: the directory is globbed
    by path instead of being entered with ``os.chdir``, so relative paths used
    elsewhere in the notebook keep resolving against the original directory.

    Args:
        file_dir: Directory to scan.

    Returns:
        List of bare file names (no directory part), in the order reported by
        ``glob``.

    Raises:
        FileNotFoundError: If ``file_dir`` is not an existing directory.
    """
    directory = os.fspath(file_dir)
    if not os.path.isdir(directory):
        raise FileNotFoundError(f"directory not found: {directory}")

    # glob.escape keeps characters such as '[' in the directory name literal.
    pattern = os.path.join(glob.escape(directory), "*.xml*")
    return [
        file_name
        for file_name in map(os.path.basename, glob.glob(pattern))
        if ".bz2" not in file_name
    ]

### PARSE ABSTRACTS AND WRITE TO CSV

### Define input-output paths

In [3]:
# Raw string literals keep the Windows backslashes literal. The values are
# unchanged, but the paths no longer depend on unrecognized escape sequences
# (such as "\S" or "\w"), which newer Python versions warn about.
# NOTE(review): machine-specific absolute path - consider making it configurable.
default_dir = r"d:\Study\STU FIIT\ZS 21-22\Information Retrival\wikidata"

#input paths
input_abstracts_data_dir = default_dir + r"\wiki_articles"
input_default_abstracts_file = default_dir + r"\enwiki-latest-abstract.xml"
input_dbpedia_abstracts_file = default_dir + r"\long-abstracts_lang=en.ttl"

#output paths
parsed_abstracts_dir = default_dir + r"\parsed_wiki_abstracts"
wikipedia_default_abstracts_dir = default_dir + r"\default_wiki_abstracts"
dbpedia_abstracts_dir = default_dir + r"\dbpedia_long_abstracts"


parsed_abstracts_dir_v2 = default_dir + r"\parsed_wiki_abstracts_v2"

# get all xml files from directory
xml_files = get_xml_filenames(input_abstracts_data_dir)
print(xml_files)

['enwiki-20211101-pages-articles1.xml-p1p41242', 'enwiki-20211101-pages-articles10.xml-p4045403p5399366', 'enwiki-20211101-pages-articles2.xml-p41243p151573', 'enwiki-20211101-pages-articles3.xml-p151574p311329', 'enwiki-20211101-pages-articles4.xml-p311330p558391', 'enwiki-20211101-pages-articles6.xml-p958046p1483661', 'enwiki-20211101-pages-articles7.xml-p1483662p2134111', 'enwiki-20211101-pages-articles8.xml-p2134112p2936260', 'enwiki-20211101-pages-articles9.xml-p2936261p4045402']


## PARSE ABSTRACTS from articles

### Parse Single dump

In [27]:
# Parse one dump file (the second entry of xml_files) and write it as
# tab-separated CSV to PARSED_ABSTRACTS_PATH, replacing any previous output.
absolute_path = input_abstracts_data_dir + '/' + xml_files[1]
if os.path.exists(absolute_path):
    start_time_single = time.time()
    parsed_wikipedia_abstracts_1 = parse_wiki_articles_file(spark, path=absolute_path)
    # Covers only the parser call; the recorded run shows ~2 s here versus
    # ~93 s once the write below has finished.
    print(f"parisng time: {time.time() - start_time_single}")
    parsed_wikipedia_abstracts_1.printSchema()
    try:
        parsed_wikipedia_abstracts_1.write.csv(path=PARSED_ABSTRACTS_PATH, sep='\t', mode='overwrite')
    except Exception as error:
        # Catch Exception rather than everything, so Ctrl-C still interrupts the
        # cell, and report the cause instead of discarding it.
        print(f'document {absolute_path} - Error, cannot write this file to csv: {error!r}')
    else:
        print(f"document{absolute_path} elapsed: {time.time() - start_time_single}")

parisng time: 2.1592888832092285
root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)

documentd:\Study\STU FIIT\ZS 21-22\Information Retrival\wikidata\wiki_articles/enwiki-20211101-pages-articles10.xml-p4045403p5399366 elapsed: 93.13075613975525


### Multiple articles dump

In [None]:
# Parse every listed dump file and collect the results in parsed_abstracts_dir
# as tab-separated CSV.
start_time = time.time()
# The first successful write replaces any previous output; every later write
# appends. The mode switches on success rather than on the loop index, so a
# missing or failing first file cannot leave stale output to be appended to.
mode = 'overwrite'
for file in xml_files:
    absolute_path = input_abstracts_data_dir + "/" + file
    if not os.path.exists(absolute_path):
        continue
    start_time_single = time.time()
    parsed_wikipedia_abstracts = parse_wiki_articles_file(spark, path=absolute_path)
    print(f"parisng time: {time.time() - start_time_single}")
    try:
        parsed_wikipedia_abstracts.write.csv(path=parsed_abstracts_dir, sep='\t', mode=mode)
    except Exception as error:
        # Keep going with the remaining files, but report why this one failed.
        # Catching Exception (not a bare except) leaves Ctrl-C working.
        print(f'document {file} - Error, cannot write this file to csv: {error!r}')
    else:
        mode = 'append'
        print(f"document {file} elapsed: {time.time() - start_time_single}")

print(f"elapsed: {time.time() - start_time}")


In [26]:
# Re-run the article parser on the second listed dump file (index 1) and keep
# the resulting DataFrame for inspection; nothing is written in this cell.
absolute_path = input_abstracts_data_dir + '/' + xml_files[1]
start_time_single = time.time()
erroring_parsed_wikipedia_abstracts = parse_wiki_articles_file(spark, path=absolute_path)
# Measures only the parser call above; no count/show/write is invoked here.
print(f"parisng time: {time.time() - start_time_single}")


parisng time: 1.886566400527954


#### Parse default abstracts from wikipedia-abstracts-latest

In [8]:
# Parse the Wikipedia abstracts dump and write it as headerless, tab-separated
# CSV. If the input file is missing the cell does nothing, so `wiki_abstracts`
# stays undefined for later cells.
if os.path.exists(input_default_abstracts_file):
    start_time = time.time()
    wiki_abstracts = parse_wiki_abstracts_file(spark, path=input_default_abstracts_file)
    # Elapsed time of the parser call only.
    print(f"elapsed: {time.time() - start_time}")
    # coalesce(1) reduces the data to one partition before writing -
    # presumably to get a single output part file; TODO confirm.
    wiki_abstracts.coalesce(1).write.csv(path=wikipedia_default_abstracts_dir, sep='\t', header=False, mode='overwrite')
    # Cumulative time since start_time (parse + write), not the write alone.
    print(f"writing to csv elapsed: {time.time() - start_time}")
    # Preview the first 20 rows.
    wiki_abstracts.show(20)

elapsed: 2.5000174045562744
writing to csv elapsed: 622.2322602272034
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|           Anarchism|Anarchism is a po...|
|              Autism|| duration     =L...|
|              Albedo|Albedo (; ) is th...|
|                   A|           A-sharp}}|
|             Alabama|(We dare defend o...|
|            Achilles|In Greek mytholog...|
|     Abraham Lincoln|| alt            ...|
|           Aristotle|                  }}|
|An American in Paris|An American in Pa...|
|Academy Award for...|The Academy Award...|
|      Academy Awards|             Oscar}}|
|             Actrius|  | starring       =|
|     Animalia (book)|Animalia is an il...|
|International Ato...|International Ato...|
|            Altruism|Altruism is the p...|
|            Ayn Rand|| birth_place = S...|
|        Alain Connes|| birth_place = D...|
|          Allan Dwan|| birth_place  = ...|
|     

#### Parse dbpedia abstracts from long_abstracts dbpedia

In [11]:
# Parse the DBpedia long-abstracts file and write it as headerless,
# tab-separated CSV. If the input file is missing the cell does nothing, so
# `dbpedia_abstracts` stays undefined for later cells.
if os.path.exists(input_dbpedia_abstracts_file):
    start_time = time.time()
    dbpedia_abstracts = parse_dbpedia_abstract_file(spark, path=input_dbpedia_abstracts_file)
    # Elapsed time of the parser call only.
    print(f"elapsed: {time.time() - start_time}")
    # The parser already returns a DataFrame with columns title/abstract, so no
    # further conversion is needed before writing.
    # coalesce(1) reduces the data to one partition before writing -
    # presumably to get a single output part file; TODO confirm.
    dbpedia_abstracts.coalesce(1).write.csv(path=dbpedia_abstracts_dir, sep='\t', header=False, mode='overwrite')
    # Cumulative time since start_time (parse + write), not the write alone.
    print(f"writing to csv elapsed: {time.time() - start_time}")
    # Preview the first 20 rows.
    dbpedia_abstracts.show(20)

elapsed: 2.267723798751831
writing to csv elapsed: 196.98528575897217
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|     Animalia (book)|Animalia is an il...|
|Agricultural science|Agricultural scie...|
|              Albedo|Albedo () (Latin:...|
|        Alain Connes|Alain Connes (Fre...|
|International Ato...|International Ato...|
|                   A|A or a is the fir...|
|An American in Paris|An American in Pa...|
|List of Atlas Shr...|This is a list of...|
|          Allan Dwan|Allan Dwan (born ...|
|          Astronomer|An astronomer is ...|
|            Achilles|In Greek mytholog...|
|           Anarchism|Anarchism is a po...|
|        Anthropology|Anthropology is t...|
|              Autism|Autism is a devel...|
|      Academy Awards|The Academy Award...|
|             Actrius|Actresses (Catala...|
|        Answer (law)|In law, an Answer...|
|Academy Award for...|The Academy Award...|
|Appel

In [16]:
# Row counts of the parsed datasets. Both DataFrames are only defined if the
# guarded cells above found their input files.
# NOTE(review): the count of parsed wiki abstracts is disabled below, yet the
# recorded output still shows a "count parsed wiki abstracts" line - that output
# appears to come from an earlier run of this cell.
# print(f"count parsed wiki abstracts: {parsed_wikipedia_abstracts.count()}")
print(f"count dbpedia {dbpedia_abstracts.count()}")
print(f"count wiki abstracts {wiki_abstracts.count()}")



count parsed wiki abstracts: 10523
count dbpedia 5732950
count wiki abstracts 6386761
