In [1]:
import re, csv, datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark import SparkContext

MEDIAWIKI_ARTICLES_DUMPFILE_PATH = 'data/enwiki-latest-pages-articles1.xml'
MEDIAWIKI_ABSTRACTS_DUMPFILE_PATH = 'data/enwiki-latest-abstract.xml'
DBPEDIA_ABSTRACTS_DUMPFILE_PATH = '/data/long-abstracts_lang=en.ttl'

WIKI_ABSTRACTS_SAMPLE_PATH = 'data/raw_sample_data/wiki_abstracts_sample.xml'
DBPEDIA_ABSTRACTS_SAMPLE_PATH = 'data/raw_sample_data/dbpedia_abstracts'

PARSED_ABSTRACTS_PATH = '../../data/document_base/parsed_abstracts_spark.csv'

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
def parse_wiki_articles_file(spark, path):
    abstract_pattern = r"'''([^\=]*)(?=(={1,6})([^\n]+?)(={1,6})[ \t]*(\n|\Z))"
    customSchema = StructType([
        StructField('title', StringType(), nullable=False),
        StructField('revision', StructType([
            StructField('text', StructType([
                StructField('_VALUE', StringType(), nullable=True)
            ]), nullable=False)
        ]), nullable=False)
    ])
    df = spark.read.format('com.databricks.spark.xml')\
        .options(rowTag='page').load(path, schema=customSchema)
    df.printSchema()
    all_abstracts_rdd = df.rdd.map(
        lambda loop: (
            loop['title'],
            abstract_text.group(0).replace('\n', ' ')
            if (abstract_text := re.search(pattern=abstract_pattern,
                                           string=loop['revision']['text']['_VALUE'],
                                           flags=re.MULTILINE | re.DOTALL)
                ) is not None else None
        ))
    all_abstracts = all_abstracts_rdd.toDF(['title', 'abstract'])
    valid_abstracts = all_abstracts.dropna()
    return valid_abstracts

def parse_wiki_abstracts_file(spark, path):
    title_pattern = r"^Wikipedia: ([^\n]*)"
    customSchema = StructType([
        StructField('title', StringType(), False),
        StructField('abstract', StringType(), False)
    ])
    df = spark.read.format("com.databricks.spark.xml")\
        .options(rowTag='doc').load(path, schema=customSchema)
    wiki_abstract_rdd = df.rdd.map(
        lambda loop: (
            re.search(pattern=title_pattern, string=loop['title']).group(1).replace('\n', ' '),
            loop['abstract']
        ))
    all_wiki_abstracts = wiki_abstract_rdd.toDF(['title', 'abstract'])
    valid_abstracts = all_wiki_abstracts.dropna()
    return valid_abstracts

def parse_dbpedia_abstract_file(spark, path):
    title_pattern = r"^<http:\/\/dbpedia\.org\/resource\/([^\>]*)> <"
    abstract_pattern = re.compile(r'"([^"]*)"')
    sc = spark.sparkContext
    rdd = sc.textFile(path)
    rdd = rdd.map(
        lambda loop: (
            title.group(1)
            if (title := re.search(title_pattern, loop)) is not None else None,
            abstract.group(1)
            if (abstract := re.search(abstract_pattern, loop)) is not None else None
        ))
    rdd = rdd.filter(lambda row: filter(None, row))
    return rdd

In [12]:
# Parse abstracts from articles
import time, os, glob
data_dir = "d:\Study\STU FIIT\ZS 21-22\Information Retrival\wikidata\wiki_articles"
extension = 'xml'
os.chdir(data_dir)
xml_files = glob.glob(f"*.{extension}")
print(xml_files)

start_time = time.time()
parsed_wikipedia_abstracts_0 = parse_wiki_articles_file(spark,
                                                        path= data_dir + xml_files[0])
parsed_wikipedia_abstracts_1 = parse_wiki_articles_file(spark,
                                                        path= data_dir + xml_files[1])
print(f"parsing elapsed: {time.time() - start_time}")


start_time = time.time()
all_parsed_wikipedia_abstracts = parsed_wikipedia_abstracts_0.union(parsed_wikipedia_abstracts_1)
print(f"union elapsed: {time.time() - start_time}")
# for file in xml_files:
#     absolute_path = data_dir + file
#     if os.path.exists(absolute_path):
#         start_time = time.time()
#         parsed_wikipedia_abstracts = parse_wiki_articles_file(spark, path=absolute_path)
#         print(f"elapsed: {time.time() - start_time}")

['enwiki-20211101-pages-articles1.xml', 'enwiki-20211101-pages-articles2.xml', 'enwiki-20211101-pages-articles3.xml']


In [4]:
# Parse abstracts from abstracts dump
abstracts_absolute_path = data_dir + '\enwiki-latest-abstract.xml'
if os.path.exists(abstracts_absolute_path):
    start_time = time.time()
    wiki_abstracts = parse_wiki_abstracts_file(spark, path=abstracts_absolute_path)
    print(f"elapsed: {time.time() - start_time}")
    wiki_abstracts.show(20)

elapsed: 2.1997337341308594
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|           Anarchism|Anarchism is a po...|
|              Autism|| duration     =L...|
|              Albedo|Albedo (; ) is th...|
|                   A|           A-sharp}}|
|             Alabama|(We dare defend o...|
|            Achilles|In Greek mytholog...|
|     Abraham Lincoln|| alt            ...|
|           Aristotle|                  }}|
|An American in Paris|An American in Pa...|
|Academy Award for...|The Academy Award...|
|      Academy Awards|             Oscar}}|
|             Actrius|  | starring       =|
|     Animalia (book)|Animalia is an il...|
|International Ato...|International Ato...|
|            Altruism|Altruism is the p...|
|            Ayn Rand|| birth_place = S...|
|        Alain Connes|| birth_place = D...|
|          Allan Dwan|| birth_place  = ...|
|             Algeria|| common_name = A...|
|Lis

In [5]:
# Parse abstracts from dbpedia dump
dbpedia_absolute_path = data_dir + '\long-abstracts_lang=en.ttl'
if os.path.exists(dbpedia_absolute_path):
    start_time = time.time()
    dbpedia_abstracts = parse_dbpedia_abstract_file(spark, path='../../' + DBPEDIA_ABSTRACTS_DUMPFILE_PATH)
    print(f"elapsed: {time.time() - start_time}")
    dbpedia_abstracts = dbpedia_abstracts.toDF(['title', 'abstract'])
    dbpedia_abstracts.show(20)

elapsed: 0.04381728172302246
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|                null|                null|
|     Animalia_(book)|Animalia is an il...|
|Agricultural_science|Agricultural scie...|
|              Albedo|Albedo () (Latin:...|
|        Alain_Connes|Alain Connes (Fre...|
|International_Ato...|International Ato...|
|                   A|A or a is the fir...|
|An_American_in_Paris|An American in Pa...|
|List_of_Atlas_Shr...|This is a list of...|
|          Allan_Dwan|Allan Dwan (born ...|
|          Astronomer|An astronomer is ...|
|            Achilles|In Greek mytholog...|
|           Anarchism|Anarchism is a po...|
|        Anthropology|Anthropology is t...|
|              Autism|Autism is a devel...|
|      Academy_Awards|The Academy Award...|
|             Actrius|Actresses (Catala...|
|        Answer_(law)|In law, an Answer...|
|Academy_Award_for...|The Academy Award...|
|Ap

In [8]:
# print(f"count parsed wiki abstracts: {parsed_wikipedia_abstracts.count()}")
# print(f"count dbpedia {dbpedia_abstracts.count()}")
# print(f"count wiki abstracts {wiki_abstracts.count()}")



In [6]:
def write_df_to_csv(dataframe, path_to_csv):
    # dataframe = dataframe.coalesce(1)
    dataframe.write.csv(path_to_csv, sep='\t', header=False, mode='overwrite')

#### PARSED ABSTRACTS TO CSV

In [7]:
start_time = time.time()

write_df_to_csv(parsed_wikipedia_abstracts,
                path_to_csv='../../data/document_base/parsed_abstracts_spark')
print(f"elapsed: {time.time() - start_time}")

elapsed: 43.86532926559448


In [8]:
start_time = time.time()

write_df_to_csv(wiki_abstracts,
                path_to_csv='../../data/document_base/wiki_abstracts_spark')
print(f"elapsed: {time.time() - start_time}")

elapsed: 219.0451045036316


In [9]:

start_time = time.time()

write_df_to_csv(dbpedia_abstracts,
                path_to_csv='../../data/document_base/dbpedia_abstracts_spark')
print(f"elapsed: {time.time() - start_time}")

elapsed: 87.75686526298523


In [14]:
spark.stop()
