In [1]:
import time, os, glob,re
import pyspark.sql.functions as sparkfunctions
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import *
from pyspark import SparkContext, SparkConf
from functools import reduce

# Root folder that holds the downloaded Wikipedia / DBpedia dumps.
# Set the WIKIDATA_DIR environment variable to point the notebook at another location;
# the original local path is kept as the fallback so existing setups keep working.
default_dir = os.environ.get(
    "WIKIDATA_DIR",
    "d:/Study/STU FIIT/ZS 21-22/Information Retrival/wikidata",
)

# Full English Wikipedia articles dump (single compressed file).
link_to_full_wiki = default_dir + "/whole_wiki/enwiki-20211101-pages-articles.xml.bz2"

# input paths
input_abstracts_data_dir = default_dir + "/wiki_articles"
input_default_abstracts_file = default_dir + "/enwiki-latest-abstract.xml"
input_dbpedia_abstracts_file = default_dir + "/long-abstracts_lang=en.ttl"

# output paths (relative to the current working directory)
project_dir = "../../data/document_base"
parsed_abstracts_dir = project_dir + "/parsed_abstracts"
wikipedia_default_abstracts_dir = project_dir + "/default_abstracts"
dbpedia_abstracts_dir = project_dir + "/dbpedia"

# Optional resource tuning. To use it, build the conf and add `.config(conf=conf)`
# to the builder chain *before* `getOrCreate()`:
# conf = SparkConf().setAll([('spark.executor.memory', '8g'), ('spark.executor.cores', '3'), ('spark.cores.max', '3'), ('spark.driver.memory', '8g')])
spark = SparkSession.builder.appName('SparkParser').getOrCreate()


In [2]:
def check_object(obj):
    """Return ``obj`` unchanged when it is a non-empty string, otherwise ``''``.

    Any non-string value (including ``None``) is mapped to the empty string, so the
    result can always be handed to a string function such as ``re.search``.
    """
    is_usable_text = isinstance(obj, str) and obj != ''
    return obj if is_usable_text else ''

def parse_wiki_articles_file(spark, path):
    """Extract a lead-section "abstract" for every page of a Wikipedia articles dump.

    The file is read through the ``com.databricks.spark.xml`` data source with one row
    per ``<page>`` element. For each page the wikitext is searched for a span that
    starts at a bold marker (``'''``) and runs up to the first ``=`` character, which
    must begin a heading-like ``=...=`` run that closes its line. The matched text
    (including the leading ``'''``) therefore never contains ``=``; newlines in it are
    replaced by spaces.

    :param spark: active ``SparkSession`` used for reading
    :param path: path of the XML dump (plain or compressed, as accepted by the reader)
    :return: DataFrame with string columns ``title`` and ``abstract``; pages without a
        match, rows with a null value, and pages whose title contains
        ``"Wikipedia:Articles for deletion"`` are excluded
    """
    # Compiled once on the driver and shipped to the workers inside the closure.
    abstract_pattern = re.compile(
        r"'''([^\=]*)(?=(={1,6})([^\n]+?)(={1,6})[ \t]*(\n|\Z))",
        flags=re.MULTILINE | re.DOTALL,
    )
    customSchema = StructType([
        StructField('title', StringType()),
        StructField('revision', StructType([
            StructField('text', StructType([
                StructField('_VALUE', StringType())
            ]))
        ]))
    ])
    pages = spark.read.format('com.databricks.spark.xml')\
        .options(rowTag='page').load(path, schema=customSchema)

    def extract_abstract(page):
        """Map one page row to a ``(title, abstract-or-None)`` tuple."""
        # Any level of the nested struct may be null for a page without a revision
        # or without text; walk it defensively instead of indexing into None.
        revision = page['revision']
        text_node = revision['text'] if revision is not None else None
        wikitext = text_node['_VALUE'] if text_node is not None else None
        found = abstract_pattern.search(check_object(wikitext))
        abstract = found.group(0).replace('\n', ' ') if found is not None else None
        return (page['title'], abstract)

    # An explicit schema avoids type inference from the first rows, which needs an
    # extra job and cannot succeed when those rows only hold null abstracts.
    result_schema = StructType([
        StructField('title', StringType()),
        StructField('abstract', StringType()),
    ])
    all_abstracts = pages.rdd.map(extract_abstract).toDF(result_schema)
    valid_abstracts = all_abstracts.dropna()
    return valid_abstracts.filter(
        ~valid_abstracts.title.contains("Wikipedia:Articles for deletion"))

def parse_wiki_abstracts_file(spark, path):
    """Read the stock Wikipedia abstracts dump into a ``(title, abstract)`` DataFrame.

    The file is read through the ``com.databricks.spark.xml`` data source with one row
    per ``<doc>`` element. Titles are expected to carry a ``"Wikipedia: "`` prefix,
    which is stripped; the abstract is taken over unchanged.

    :param spark: active ``SparkSession`` used for reading
    :param path: path of the abstracts XML file
    :return: DataFrame with string columns ``title`` and ``abstract``; rows whose title
        is missing or lacks the prefix, and rows with a null abstract, are excluded
    """
    title_pattern = re.compile(r"^Wikipedia: ([^\n]*)")
    customSchema = StructType([
        StructField('title', StringType(), False),
        StructField('abstract', StringType(), False)
    ])
    docs = spark.read.format("com.databricks.spark.xml")\
        .options(rowTag='doc').load(path, schema=customSchema)

    def strip_title_prefix(doc):
        """Map one doc row to ``(title-without-prefix-or-None, abstract)``."""
        raw_title = doc['title']
        # Guard both a missing title and a title without the expected prefix so a
        # single odd record yields a null (dropped below) instead of failing the job.
        found = title_pattern.search(raw_title) if isinstance(raw_title, str) else None
        # group(1) is matched by [^\n]*, so it cannot contain a newline to replace.
        title = found.group(1) if found is not None else None
        return (title, doc['abstract'])

    # Explicit schema: no type inference from the first rows is needed.
    result_schema = StructType([
        StructField('title', StringType()),
        StructField('abstract', StringType()),
    ])
    all_wiki_abstracts = docs.rdd.map(strip_title_prefix).toDF(result_schema)
    return all_wiki_abstracts.dropna()

def parse_dbpedia_abstract_file(spark, path):
    """Parse a DBpedia abstracts file (one triple per line) into a DataFrame.

    From every line the resource name is taken from the leading
    ``<http://dbpedia.org/resource/...>`` IRI (underscores become spaces) and the
    abstract from the first double-quoted literal.

    :param spark: active ``SparkSession``; its ``sparkContext`` reads the text file
    :param path: path of the ``.ttl`` file
    :return: DataFrame with string columns ``title`` and ``abstract``; lines on which
        either part cannot be found (for example comment lines) are excluded
    """
    title_pattern = re.compile(r"^<http:\/\/dbpedia\.org\/resource\/([^\>]*)> <")
    # A backslash escapes the following character inside the literal, so an escaped
    # quote (\") must not terminate the match; the previous "[^"]*" pattern cut such
    # abstracts off at the first escaped quote. Escape sequences themselves are kept
    # exactly as written in the file.
    abstract_pattern = re.compile(r'"((?:[^"\\]|\\.)*)"')

    def parse_line(line):
        """Map one text line to ``(title-or-None, abstract-or-None)``."""
        title_match = title_pattern.search(line)
        abstract_match = abstract_pattern.search(line)
        return (
            title_match.group(1).replace('_', ' ') if title_match is not None else None,
            abstract_match.group(1) if abstract_match is not None else None,
        )

    # Explicit schema: type inference would have to look past leading lines that
    # parse to (None, None).
    result_schema = StructType([
        StructField('title', StringType()),
        StructField('abstract', StringType()),
    ])
    lines = spark.sparkContext.textFile(path)
    return lines.map(parse_line).toDF(result_schema).dropna()

def get_xml_filenames(file_dir):
    """Return the names of the uncompressed XML dump files found in ``file_dir``.

    A file qualifies when its name matches the glob ``*.xml*`` and does not contain
    ``.bz2``. Only bare file names (no directory part) are returned, in the order
    produced by ``glob``.

    The process working directory is left untouched, so relative paths used elsewhere
    keep resolving against the same location.

    :param file_dir: directory to scan
    :return: list of matching file names
    :raises FileNotFoundError: if ``file_dir`` is not an existing directory
    """
    if not os.path.isdir(file_dir):
        raise FileNotFoundError(f"XML input directory not found: {file_dir}")
    # glob.escape keeps characters such as '[' in the directory name literal.
    candidates = glob.glob(os.path.join(glob.escape(file_dir), "*.xml*"))
    return [
        name
        for name in map(os.path.basename, candidates)
        if ".bz2" not in name
    ]

## Parse abstracts from articles


## Parse the whole wiki from a single bz2 file

The dump is a single .bz2 file of about 18 GB.

In [None]:
# Build the (title, abstract) frame for the complete articles dump.
all_wiki = parse_wiki_articles_file(spark,link_to_full_wiki)
# Mark the frame for caching; `all_wiki_persisted` is the left side of the joins further down.
all_wiki_persisted = all_wiki.cache()
# count() is an action, so this line is what actually runs the parse over the dump.
all_wiki_persisted.count()

### Multiple article dumps

In [5]:
# Collect every uncompressed XML dump file in the input directory.
xml_files = get_xml_filenames(input_abstracts_data_dir)
print(xml_files)
start_time = time.time()
all_dfs = []
for file in xml_files:
    absolute_path = input_abstracts_data_dir + "/" + file
    if os.path.exists(absolute_path):
        start_time_single = time.time()
        parsed_wikipedia_abstracts = parse_wiki_articles_file(spark, path=absolute_path)
        all_dfs.append(parsed_wikipedia_abstracts)
        # NOTE(review): DataFrame transformations are lazy, so this presumably times
        # building the frame rather than a full pass over the file.
        print(f"parisng time: {time.time() - start_time_single}")

print(f"elapsed: {time.time() - start_time}")

# Union all parsed DataFrames into one. reduce() cannot fold an empty list, so stop
# with an explicit message when no dump file was found.
if not all_dfs:
    raise FileNotFoundError(f"no XML dump files found in {input_abstracts_data_dir}")
single_parsed_abstracts_df = reduce(DataFrame.union, all_dfs)


parisng time: 12.002042531967163
parisng time: 2.601315975189209
parisng time: 1.0388760566711426
parisng time: 3.9985997676849365
parisng time: 3.080911636352539
parisng time: 2.360297203063965
parisng time: 1.9365158081054688
parisng time: 0.9534831047058105
parisng time: 2.614497423171997
elapsed: 30.588595151901245


#### Parse default abstracts from wikipedia-abstracts-latest

In [5]:
# Parse the stock Wikipedia abstracts dump when it is available locally.
# NOTE: if the file is missing, `wiki_abstracts` is not assigned by this cell.
if os.path.exists(input_default_abstracts_file):
    start_time = time.time()
    wiki_abstracts = parse_wiki_abstracts_file(spark, path=input_default_abstracts_file)
    print(f"elapsed: {time.time() - start_time}")
    # CSV export is currently disabled; re-enable the next line to write the result.
    # wiki_abstracts.coalesce(1).write.csv(path=wikipedia_default_abstracts_dir, sep='\t', header=False, mode='overwrite')
    # With the export disabled nothing is written, so this reports the time since the same start_time again.
    print(f"writing to csv elapsed: {time.time() - start_time}")
    # Preview the first 20 rows.
    wiki_abstracts.show(20)

elapsed: 2.980881690979004
writing to csv elapsed: 2.980881690979004
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|           Anarchism|Anarchism is a po...|
|              Autism|| duration     =L...|
|              Albedo|Albedo (; ) is th...|
|                   A|           A-sharp}}|
|             Alabama|(We dare defend o...|
|            Achilles|In Greek mytholog...|
|     Abraham Lincoln|| alt            ...|
|           Aristotle|                  }}|
|An American in Paris|An American in Pa...|
|Academy Award for...|The Academy Award...|
|      Academy Awards|             Oscar}}|
|             Actrius|  | starring       =|
|     Animalia (book)|Animalia is an il...|
|International Ato...|International Ato...|
|            Altruism|Altruism is the p...|
|            Ayn Rand|| birth_place = S...|
|        Alain Connes|| birth_place = D...|
|          Allan Dwan|| birth_place  = ...|
|      

#### Parse DBpedia abstracts from the DBpedia long-abstracts dump

In [6]:
# Parse the DBpedia long-abstracts file when it is available locally.
# NOTE: if the file is missing, `dbpedia_abstracts` is not assigned by this cell.
if os.path.exists(input_dbpedia_abstracts_file):
    start_time = time.time()
    dbpedia_abstracts = parse_dbpedia_abstract_file(spark, path=input_dbpedia_abstracts_file)
    print(f"elapsed: {time.time() - start_time}")
    # Leftover from an earlier version; CSV export is currently disabled.
    # dbpedia_abstracts = dbpedia_abstracts_rdd.toDF(['title', 'abstract'])
    # dbpedia_abstracts.coalesce(1).write.csv(path=dbpedia_abstracts_dir, sep='\t', header=False, mode='overwrite')
    # With the export disabled nothing is written, so this reports the time since the same start_time again.
    print(f"writing to csv elapsed: {time.time() - start_time}")
    # Preview the first 20 rows.
    dbpedia_abstracts.show(20)

elapsed: 1.6530005931854248
writing to csv elapsed: 1.6530005931854248
+--------------------+--------------------+
|               title|            abstract|
+--------------------+--------------------+
|     Animalia (book)|Animalia is an il...|
|Agricultural science|Agricultural scie...|
|              Albedo|Albedo () (Latin:...|
|        Alain Connes|Alain Connes (Fre...|
|International Ato...|International Ato...|
|                   A|A or a is the fir...|
|An American in Paris|An American in Pa...|
|List of Atlas Shr...|This is a list of...|
|          Allan Dwan|Allan Dwan (born ...|
|          Astronomer|An astronomer is ...|
|            Achilles|In Greek mytholog...|
|           Anarchism|Anarchism is a po...|
|        Anthropology|Anthropology is t...|
|              Autism|Autism is a devel...|
|      Academy Awards|The Academy Award...|
|             Actrius|Actresses (Catala...|
|        Answer (law)|In law, an Answer...|
|Academy Award for...|The Academy Award...|
|Appe

### Calculate similarity for abstracts

In [7]:
import nltk
from nltk.corpus import stopwords
import string, re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to the
# collection of English stop words, which `clean_string` reads as a global.
stopwords = stopwords.words('english')
def clean_string(text):
    """Normalise ``text`` for comparison.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lowercase the result, split on whitespace and drop the words found in the
    global ``stopwords`` collection. The remaining words are joined by single spaces.
    """
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = []
    for token in without_punctuation.lower().split():
        if token not in stopwords:
            kept_words.append(token)
    return ' '.join(kept_words)

def compare_texts(first_text, second_text) -> float:
    """Return the cosine similarity of two texts based on their word counts.

    Both texts are normalised with ``clean_string``, vectorised together with a
    ``CountVectorizer`` and compared with ``cosine_similarity``.

    :param first_text: first text, may be ``None`` or empty
    :param second_text: second text, may be ``None`` or empty
    :return: similarity as a float; ``0.0`` when either text is ``None``/empty, when a
        cleaned text has fewer than three non-whitespace characters, or when the
        vectorizer finds no token at all in the two texts
    """
    for text in (first_text, second_text):
        if text is None or text == '':
            return 0.0
    cleaned = [clean_string(first_text), clean_string(second_text)]
    for cleaned_str in cleaned:
        # Too little content left after cleaning to compare meaningfully.
        if len(re.sub(r"\s+", "", cleaned_str)) < 3:
            return 0.0
    try:
        counts = CountVectorizer().fit_transform(cleaned)
    except ValueError:
        # CountVectorizer raises ValueError for an empty vocabulary, e.g. when both
        # texts consist solely of one-character tokens, which its default token
        # pattern ignores. Treat that as "nothing in common" instead of failing.
        return 0.0
    vectors = counts.toarray()
    first_vector = vectors[0].reshape(1, -1)
    second_vector = vectors[1].reshape(1, -1)
    return float(cosine_similarity(first_vector, second_vector)[0][0])





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ollyt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Inspect the column names and types of the DBpedia frame.
# NOTE(review): the stored output lists `dbpedia_title` / `dbpedia_abstract`, names that are
# only assigned by the rename in the join cell — presumably this cell was run after it.
dbpedia_abstracts.printSchema()

root
 |-- dbpedia_title: string (nullable = true)
 |-- dbpedia_abstract: string (nullable = true)



In [24]:
# Give the two reference frames distinct column names so they can be joined onto the
# parsed articles without name clashes. NOTE: both names are rebound in place.
wiki_abstracts = wiki_abstracts.withColumnRenamed('title', 'default_title').withColumnRenamed('abstract', 'default_abstract')
dbpedia_abstracts = dbpedia_abstracts.withColumnRenamed('title', 'dbpedia_title').withColumnRenamed('abstract', 'dbpedia_abstract')
# Left joins on the title keep every parsed article, with nulls where a source has no entry.
res = all_wiki_persisted.join(wiki_abstracts, all_wiki_persisted.title == wiki_abstracts.default_title, 'left')
res = res.join(dbpedia_abstracts, res.title == dbpedia_abstracts.dbpedia_title, 'left')
# The join keys of the right-hand frames are no longer needed.
res = res.drop('default_title', 'dbpedia_title')


res.printSchema()
# Mark the joined frame for caching; the count() below is the first action run on it.
res_persisted = res.cache()
print(res_persisted.count())
res_persisted.show()

root
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- default_abstract: string (nullable = true)
 |-- dbpedia_abstract: string (nullable = true)

3148413
+--------------------+--------------------+--------------------+--------------------+
|               title|            abstract|    default_abstract|    dbpedia_abstract|
+--------------------+--------------------+--------------------+--------------------+
|    "Nowell Sing We"|'''&#34;Nowell Si...|                null|                null|
|"There Are Things...|'''''"There Are T...|"There Are Things...|                null|
|         & Yet & Yet|'''''& Yet & Yet'...|| rev2      = Pit...|& Yet & Yet is th...|
|     '60s Vibrations|''''' '60s Vibrat...|'60s Vibrations w...|'60s Vibrations w...|
|   (315898) 2008 QD4|'''{{mp|(315898) ...|                  }}|(315898) 2008 QD4...|
|(He's a) Shape in...|'''Production''' ...|"(He's a) Shape i...|                null|
|    ...But Seriously|'''Musicians''' *

In [25]:
# Working alias for the cached join result; the similarity columns are added to it below.
df_to_operate = res_persisted

In [26]:
# Wrap compare_texts as a Spark UDF that returns a FloatType column.
custom_udf = sparkfunctions.udf(lambda col1, col2: compare_texts(col1, col2), FloatType())
# Score the parsed abstract against each reference abstract (one new column per source).
df_to_operate = df_to_operate\
    .withColumn('default_similarity', custom_udf(df_to_operate.abstract, df_to_operate.default_abstract))\
    .withColumn('dbpedia_similarity', custom_udf(df_to_operate.abstract, df_to_operate.dbpedia_abstract))

# Write the scored frame as tab-separated files, replacing any previous output at that path.
df_to_operate.write.csv(path=default_dir+"/combined_output_whole", sep='\t', mode='overwrite')
df_to_operate.show()

# Job time noted by the author: about 25 min.

+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|               title|            abstract|    default_abstract|    dbpedia_abstract|default_similarity|dbpedia_similarity|
+--------------------+--------------------+--------------------+--------------------+------------------+------------------+
|    "Nowell Sing We"|'''&#34;Nowell Si...|                null|                null|               0.0|               0.0|
|"There Are Things...|'''''"There Are T...|"There Are Things...|                null|               1.0|               0.0|
|         & Yet & Yet|'''''& Yet & Yet'...|| rev2      = Pit...|& Yet & Yet is th...|               0.0|         0.8183171|
|     '60s Vibrations|''''' '60s Vibrat...|'60s Vibrations w...|'60s Vibrations w...|         0.7342174|        0.82996094|
|   (315898) 2008 QD4|'''{{mp|(315898) ...|                  }}|(315898) 2008 QD4...|               0.0|         0.7487049|
|(He's a

In [27]:
# Row count of the scored frame; the stored output equals the count printed for res_persisted.
df_to_operate.count()


3148413