# Outline
- [ 1 - Setup ](#1)


- [ 2 - Load MAS tables](#2)


- [ 3 - Load dblp xml](#3)


- [ 4 - Query tasks](#4)

    - [ 4.1 - List all the papers (year and title) of “Divesh Srivastava”from DBLP that are published after his last paper in the MAS database ](#4.1)
        - [4.1.1 - Query 1 result](#4.1.1)
        
    - [ 4.2 - List the author(s) (names) who have collaborated most (in terms of number of publications) with “Divesh Srivastava”other than himself based on MAS database and DBLP data.](#4.2)
        - [4.2.1 - Query 2 result](#4.2.1)
    
    - [ 4.3 - List the number of publications of “Divesh Srivastava”each year based on MAS database and DBLP data. Duplicate papers in both MAS and DBLP should be counted only once in the result.](#4.3)
     - [4.3.1 - Query 3 result](#4.3.1)
    
    - [ 4.4 - Find papers published in 2021 that are relevant to keyword query ‘self attention transformer’ (or 'self-attention transformer'). Treat each paper title as one document and rank them using tf-idf. Return the top 10 relevant papers (title, authors, journal/conference and year).](#4.4)
        
        - [ 4.4.1 - Dataframe method ](#4.4.1)
        
        - [ 4.4.2 - With RDD map reduce method ](#4.4.2)
            - [4.4.2.1 - Query 4 result](#4.4.2.1)


- [ 5 - Preprocess xml to remove HTML escaped characters](#5)



# 1 - Setup<a name="1"></a>

In [1]:
import sys
import os

#os.environ['HADOOP_HOME'] = r"C:\spark-3.1.3-bin-hadoop3.2"
#sys.path.append(r"C:\spark-3.1.3-bin-hadoop3.2")

#os.environ['SPARK_HOME'] = r"C:\spark-3.1.3-bin-hadoop3.2"
#sys.path.append(r"C:\spark-3.1.3-bin-hadoop3.2")

#os.environ['JAVA_HOME'] = r"C:\Program Files\Java\jdk1.8.0_351"
#sys.path.append(r"C:\Program Files\Java\jdk1.8.0_351")

#os.environ['CLASSPATH'] = r"C:\Program Files\Java\jdk1.8.0_351"
#sys.path.append(r"C:\Program Files\Java\jdk1.8.0_351")


In [2]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext, sql

from pyspark.sql.functions import array_contains, col, explode, regexp_replace, when, substring, length, expr, arrays_zip
from pyspark.sql.types import *
from pyspark.sql.functions import *
#spark = SparkSession.builder.getOrCreate()

spark = SparkSession.builder \
       .config("spark.driver.memory", "9g") \
       .getOrCreate()

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
#df = spark.sql("select 'spark' as hello")
#df.show()

# 2 - Load MAS tables<a name="2"></a>

In [3]:
author_mas = spark.read.format("jdbc").options(
    url="jdbc:mysql://localhost:3306/mas",
    driver = 'com.mysql.cj.jdbc.Driver',
    dbtable = "author",
    user="root",
    password="Harsvifsat(5").load()

author_mas.printSchema()
author_mas.createOrReplaceTempView("author")

root
 |-- aid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- oid: integer (nullable = true)
 |-- homepage: string (nullable = true)
 |-- photo: string (nullable = true)



In [4]:
publication_mas = spark.read.format("jdbc").options(
    url="jdbc:mysql://localhost:3306/mas",
    driver = 'com.mysql.cj.jdbc.Driver',
    dbtable = "publication",
    user="root",
    password="Harsvifsat(5").load()

publication_mas.printSchema()
publication_mas.createOrReplaceTempView("publication")

root
 |-- pid: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- cid: integer (nullable = true)
 |-- jid: integer (nullable = true)
 |-- reference_num: integer (nullable = true)
 |-- citation_num: integer (nullable = true)
 |-- doi: string (nullable = true)



In [5]:
writes_mas = spark.read.format("jdbc").options(
    url="jdbc:mysql://localhost:3306/mas",
    driver = 'com.mysql.cj.jdbc.Driver',
    dbtable = "writes",
    user="root",
    password="Harsvifsat(5").load()

writes_mas.printSchema()
writes_mas.createOrReplaceTempView("writes")

root
 |-- aid: integer (nullable = true)
 |-- pid: integer (nullable = true)



# 3 - Load dblp xml<a name="3"></a>

In [191]:
schema = StructType([
    StructField("_cdate", DateType(), True),
    StructField("_key", StringType(), True),
    StructField("_mdate", DateType(), True),
    StructField("_publtype", StringType(), True),
    StructField("author", 
                ArrayType(
                     StructType([
                         StructField("_VALUE", StringType(), True),
                         StructField("_aux", StringType(), True),
                         StructField("_orcid", StringType(), True),
                     ]),True),
                True),
    StructField("title", StringType(), True),
    StructField("year", LongType(), True),
    StructField("publnr", StringType(), True),
    StructField("journal", StringType(), True),
])

In [192]:
dblp_article = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "dblp") \
    .option("rowTag", "article") \
    .schema(schema) \
    .load("dblp_cleaned.xml")

dblp_article.printSchema()
dblp_article.createOrReplaceTempView("dblp_artv")

root
 |-- _cdate: date (nullable = true)
 |-- _key: string (nullable = true)
 |-- _mdate: date (nullable = true)
 |-- _publtype: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- publnr: string (nullable = true)
 |-- journal: string (nullable = true)



In [193]:
schema2 = StructType([
    StructField("_key", StringType(), True),
    StructField("_mdate", DateType(), True),
    StructField("_publtype", StringType(), True),
    StructField("author", 
                ArrayType(
                     StructType([
                         StructField("_VALUE", StringType(), True),
                         StructField("_aux", StringType(), True),
                         StructField("_orcid", StringType(), True),
                     ]),True),
                True),
    StructField("title", StringType(), True),
    StructField("year", LongType(), True),
    StructField("publnr", StringType(), True),
    StructField("booktitle", StringType(), True),
])

In [194]:
dblp_inproceedings = spark.read \
    .format("com.databricks.spark.xml") \
    .option("rootTag", "dblp") \
    .option("rowTag", "inproceedings") \
    .schema(schema2) \
    .load("dblp_cleaned.xml")

dblp_inproceedings.printSchema()
dblp_inproceedings.createOrReplaceTempView("dblp_inpv")

root
 |-- _key: string (nullable = true)
 |-- _mdate: date (nullable = true)
 |-- _publtype: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- publnr: string (nullable = true)
 |-- booktitle: string (nullable = true)



# 4 - Query Tasks<a name="4"></a>

### 4.1 - List all the papers (year and title) of “Divesh Srivastava”from DBLP that are published after his last paper in the MAS database <a name="4.1"></a>
There are a few options here: 

Find the year and month information from DBLP for the last paper in MAS and then return new papers after that. 

Use the year information from MAS last paper and return papers published in the following year. 

(NEW!) Find papers in DBLP that are not in MAS database starting from the last year of Divesh's papers in the MAS database. 

In [10]:
# get last year of Divesh's paper in MAS
mas1 = spark.sql("select max(p.year) \
    from publication p \
    join writes w \
    on p.pid = w.pid \
    join author a \
    on w.aid = a.aid \
    where a.name = 'Divesh Srivastava' ")

mas1.printSchema()
mas1.createOrReplaceTempView("mas1v")

root
 |-- max(year): integer (nullable = true)



In [11]:
# get papers published by divesh in last year of Divesh's paper in MAS
mas2 = spark.sql("select p.title \
    from publication p \
    join writes w \
    on p.pid = w.pid \
    join author a \
    on w.aid = a.aid \
    where a.name = 'Divesh Srivastava' \
    and p.year = {}".format(mas1.collect()[0][0]))

mas2.printSchema()
mas2.createOrReplaceTempView("mas2v")

root
 |-- title: string (nullable = true)



In [12]:
#get all papers from articles by divesh (need to remove '.' because MAS's titles do not have '.')
dblp_filt_art = dblp_article.where(array_contains(col('author._VALUE'), 'Divesh Srivastava'))
#dblp_filt_art = dblp_filt_art.withColumn('title', regexp_replace('title', '\.', ''))
dblp_filt_art = dblp_filt_art.withColumn('title', 
                                         when(dblp_filt_art.title.endswith('.'),expr("substring(title, 1, length(title)-1)"))
                                         .otherwise(dblp_filt_art.title))
dblp_filt_art.printSchema()
dblp_filt_art.createOrReplaceTempView("dblp_filt_artv")

root
 |-- _cdate: date (nullable = true)
 |-- _key: string (nullable = true)
 |-- _mdate: date (nullable = true)
 |-- _publtype: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- publnr: string (nullable = true)
 |-- journal: string (nullable = true)



In [13]:
#get all papers from inproceedings by divesh
dblp_filt_inp = dblp_inproceedings.where(array_contains(col('author._VALUE'), 'Divesh Srivastava'))
#dblp_filt_inp = dblp_filt_inp.withColumn('title', regexp_replace('title', '\.', ''))
dblp_filt_inp = dblp_filt_inp.withColumn('title', 
                                         when(dblp_filt_inp.title.endswith('.'),expr("substring(title, 1, length(title)-1)"))
                                         .otherwise(dblp_filt_inp.title))
dblp_filt_inp.printSchema()
dblp_filt_inp.createOrReplaceTempView("dblp_filt_inpv")

root
 |-- _key: string (nullable = true)
 |-- _mdate: date (nullable = true)
 |-- _publtype: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)
 |-- publnr: string (nullable = true)
 |-- booktitle: string (nullable = true)



In [24]:
# get divesh's paper in dblp after his last paper in mas
dblp1 = spark.sql("select distinct upper(d.title) as title, d.year \
    from (select year, title from dblp_filt_artv \
        union \
        select year, title from dblp_filt_inpv \
       ) AS d \
    where d.year >= {} \
    and not exists (select title \
                    from mas2v m \
                    where lower(m.title) = lower(d.title)) \
    order by d.year desc, title asc".format(mas1.collect()[0][0]))

dblp1.printSchema()
dblp1.createOrReplaceTempView("dblp1v")

root
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



### Query 1 result<a name="4.1.1"></a>

In [25]:
dblp1.show(1000, truncate = False)

+-----------------------------------------------------------------------------------------------------------------+----+
|title                                                                                                            |year|
+-----------------------------------------------------------------------------------------------------------------+----+
|ABC OF ORDER DEPENDENCIES                                                                                        |2022|
|ABCOD: MINING BAND ORDER DEPENDENCIES                                                                            |2022|
|CERTEM: EXPLAINING AND DEBUGGING BLACK-BOX ENTITY RESOLUTION SYSTEMS WITH CERTA                                  |2022|
|DATAPRISM: EXPOSING DISCONNECT BETWEEN DATA AND SYSTEMS                                                          |2022|
|DISCOVERING DOMAIN ORDERS VIA ORDER DEPENDENCIES                                                                 |2022|
|EFFECTIVE EXPLANATIONS FOR ENTI

### 4.2 - List the author(s) (names) who have collaborated most (in terms of number of publications) with “Divesh Srivastava”other than himself based on MAS database and DBLP data. <a name="4.2"></a>
The result may contain more than one author if they coauthored with him for same number of papers.

In [153]:
# get author name and paper count with divesh from mas
mas3 = spark.sql("select distinct a2.aid, a2.name, count(distinct lower(p.title)) as countp \
                from author a1 \
                join writes w1 \
                    on a1.aid = w1.aid \
                join writes w2 \
                    on w1.pid = w2.pid \
                join author a2 \
                    on w2.aid = a2.aid \
                join publication p \
                    on w2.pid = p.pid \
                where a1.name = 'Divesh Srivastava' \
                    and a2.name <> 'Divesh Srivastava' \
                group by a2.aid, a2.name \
                order by countp desc")

mas3.printSchema()
mas3.createOrReplaceTempView("mas3v")

root
 |-- aid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- countp: long (nullable = false)



In [154]:
#get all paper by divesh from mas 
mas4 = spark.sql("select distinct p.pid, p.title \
                from publication p \
                join writes w \
                    on p.pid = w.pid join author a \
                    on w.aid = a.aid \
                where a.name = 'Divesh Srivastava' ")

mas4.printSchema()
mas4.createOrReplaceTempView("mas4v")

root
 |-- pid: integer (nullable = true)
 |-- title: string (nullable = true)



In [155]:
# get author name and paper by divesh from both article and inproceeding
dblp2 = spark.sql("select d._VALUE, d.title \
    from (select author._VALUE, title from dblp_filt_artv \
        union \
        select author._VALUE, title from dblp_filt_inpv \
       ) AS d")

dblp2.printSchema()

root
 |-- _VALUE: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- title: string (nullable = true)



In [156]:
# explode author name column
dblp3 = dblp2.select(explode(dblp2._VALUE),dblp2.title)
dblp3.printSchema()
dblp3.createOrReplaceTempView("dblp3v")

root
 |-- col: string (nullable = true)
 |-- title: string (nullable = true)



In [157]:
# get author name and paper count with divesh in dblp that is not in mas
dblp4 = spark.sql("select distinct d32.col, count(distinct lower(d3.title)) as countp \
                from dblp3v d3 \
                join dblp3v d32 \
                    on d3.title = d32.title \
                where d3.col like '%Divesh Srivastava%' \
                    and d32.col <> 'Divesh Srivastava' \
                    and not exists (select title \
                                    from mas4v m \
                                    where lower(m.title) = lower(d3.title)) group by d32.col \
                order by countp desc")

dblp4.printSchema()
dblp4.createOrReplaceTempView("dblp4v")

root
 |-- col: string (nullable = true)
 |-- countp: long (nullable = false)



In [158]:
# get author name with most paper with divesh
dblp5 = spark.sql("select name, sum(countp) as totalcount \
                   from (select col as name, countp from dblp4v \
                        union all\
                        select name, countp from mas3v \
                        ) \
                   group by name \
                   order by totalcount desc \
                   limit 1")

dblp5.printSchema()
dblp5.createOrReplaceTempView("dblp5v")

root
 |-- name: string (nullable = true)
 |-- totalcount: long (nullable = true)



### Query 2 result<a name="4.2.1"></a>

In [159]:
dblp5.show()

+-----------+----------+
|       name|totalcount|
+-----------+----------+
|Nick Koudas|        75|
+-----------+----------+



### 4.3 - List the number of publications of “Divesh Srivastava”each year based on MAS database and DBLP data. Duplicate papers in both MAS and DBLP should be counted only once in the result.<a name="4.3"></a>

In [51]:
# get all papers and year by divesh from MAS 
mas5 = spark.sql("select distinct p.pid, p.title, p.year \
                from publication p \
                join writes w \
                    on p.pid = w.pid \
                join author a \
                    on w.aid = a.aid \
                where a.name = 'Divesh Srivastava'")

mas5.printSchema()
mas5.createOrReplaceTempView("mas5v")

root
 |-- pid: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: integer (nullable = true)



In [52]:
# get all papers and year from dblp articles and inproceedings
dblp6 = spark.sql("select distinct lower(d.title) as title, d.year \
        from (select title, year from dblp_filt_artv \
        union \
        select title, year from dblp_filt_inpv \
       ) AS d")

dblp6.printSchema()
dblp6.createOrReplaceTempView("dblp6v")

root
 |-- title: string (nullable = true)
 |-- year: long (nullable = true)



In [55]:
# union both table and get count of paper per year
dblp7 = spark.sql("select count(distinct lower(d.title)) as countp, d.year \
        from (select lower(title) as title, year from dblp6v \
        union \
        select lower(title) as title, year from mas5v \
       ) AS d \
       group by d.year \
       order by d.year desc")

dblp7.printSchema()
dblp7.createOrReplaceTempView("dblp7v")

root
 |-- countp: long (nullable = false)
 |-- year: long (nullable = true)



### Query 3 result<a name="4.3.1"></a>

In [56]:
dblp7.show(1000, truncate = False)

+------+----+
|countp|year|
+------+----+
|15    |2022|
|16    |2021|
|14    |2020|
|17    |2019|
|19    |2018|
|16    |2017|
|8     |2016|
|21    |2015|
|10    |2014|
|11    |2013|
|17    |2012|
|14    |2011|
|23    |2010|
|26    |2009|
|20    |2008|
|15    |2007|
|15    |2006|
|16    |2005|
|10    |2004|
|28    |2003|
|17    |2002|
|10    |2001|
|13    |2000|
|10    |1999|
|7     |1998|
|3     |1997|
|9     |1996|
|5     |1995|
|8     |1994|
|8     |1993|
|7     |1992|
|2     |1991|
|3     |1990|
|10    |0   |
|1     |null|
+------+----+



### 4.4 - Find papers published in 2021 that are relevant to keyword query ‘self attention transformer’ (or 'self-attention transformer'). Treat each paper title as one document and rank them using tf-idf. Return the top 10 relevant papers (title, authors, journal/conference and year).  <a name="4.4"></a>

Keyword query "self attention transformer" contains 3 keywords (terms) as "self", "attention", and "transformer". 

Keyword query "self-attention transformer" contains 2 keywords (terms) as "self-attention" and "transformer".

Choose one of the keyword query. 

### Scenario used: 

- TF-IDF is case insensitive

- "self-attention" and "transformer" are used for query

- Grammer derivation of "self-attention" and "transformer" such as "transformers" are not counted

- TF is normalized to length of title

### 4.4.1 - Dataframe method <a name="4.4.1"></a>

In [195]:
# get all papers in dblp articles and proceedings
dblp_all =  spark.sql("select distinct d.title, d.author, d.year, d.journal as journal_conference\
                        from (select distinct lower(title) as title, author, year, journal \
                            from dblp_artv \
                            union \
                            select distinct lower(title) as title, author, year, booktitle \
                            from dblp_inpv) as d \
                        where (lower(d.title) like '%self-attention%' \
                                or lower(d.title) like '%transformer%') \
                                and d.year = 2021 \
                                order by d.title")

dblp_all = dblp_all.withColumn('title', 
                             when(dblp_all.title.endswith('.'),expr("substring(title, 1, length(title)-1)"))
                             .otherwise(dblp_all.title))

dblp_all.printSchema()
dblp_all.createOrReplaceTempView("dblp_allv")

root
 |-- title: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)
 |-- year: long (nullable = true)
 |-- journal_conference: string (nullable = true)



In [196]:
# get all papers in dblp articles and proceedings that are in year 2021 that is relevant to self-attention transformer
dblp_filt2 =  spark.sql("select distinct lower(d.title) title, d.author \
                        from (select distinct lower(title) as title, author, year, journal \
                            from dblp_artv \
                            union \
                            select distinct lower(title) as title, author, year, booktitle \
                            from dblp_inpv) as d \
                        where (lower(d.title) like '%self-attention%' \
                                or lower(d.title) like '%transformer%') \
                                and d.year = 2021 \
                                order by title")

dblp_filt2 = dblp_filt2.withColumn('title', 
                             when(dblp_filt2.title.endswith('.'),expr("substring(title, 1, length(title)-1)"))
                             .otherwise(dblp_filt2.title))

dblp_filt2.printSchema()
dblp_filt2.createOrReplaceTempView("dblp_filt2v")

root
 |-- title: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- _VALUE: string (nullable = true)
 |    |    |-- _aux: string (nullable = true)
 |    |    |-- _orcid: string (nullable = true)



In [197]:
dblp_filt2.show()

+--------------------+--------------------+
|               title|              author|
+--------------------+--------------------+
|$\mathcal{laja}{-...|[{Veena Mayya, nu...|
|1d self-attention...|[{Takahiro Suzuki...|
|26.5 a watt-level...|[{Bingzheng Yang,...|
|270-to-300ghz dou...|[{Zhiyu Chen, nul...|
|2d self-attention...|[{Nam Tuan Ly, nu...|
|2lspe: 2d learnab...|[{Zobeir Raisi, n...|
|3-d ultrasonic lo...|[{Hongxin Ji, nul...|
|3d deep attentive...|[{Yiyao Liu, null...|
|3d human pose est...|[{Ce Zheng, null,...|
|3d human texture ...|[{Xiangyu Xu, nul...|
|3d medical point ...|[{Jianhui Yu, nul...|
|3d object trackin...|[{Yubo Cui, null,...|
|3d transformer-ga...|[{Yanmei Luo, nul...|
|3d-anas v2: graft...|[{Xizhe Xue, null...|
|3d-retr: end-to-e...|[{Zai Shi, null, ...|
|3d-transformer: m...|[{Fang Wu, null, ...|
|3dmet: 3d medical...|[{Sheng Wang, nul...|
|3dvg-transformer:...|[{Lichen Zhao, nu...|
|3m-transformers f...|[{Erick Skorupa P...|
|6d-vit: category-...|[{Lu Zou, 

In [198]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, CountVectorizer, Tokenizer
from pyspark.ml.feature import IDF
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import Vectors

In [199]:
tokenizer = Tokenizer(inputCol="title", outputCol="features")
dblp_filt3 = tokenizer.transform(dblp_filt2)

df_split = dblp_filt3.rdd.map(lambda x : (x.title,x.features)) \
          .toDF() \
          .withColumnRenamed("_1","title") \
          .withColumnRenamed("_2","features")

htf = HashingTF(inputCol="features", outputCol="tf", numFeatures=262144)
tf = htf.transform(df_split)

In [200]:
tf.printSchema()

root
 |-- title: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)



In [201]:
#get rdd to map to tf values
res = tf.rdd.map(lambda x : (x.title,
                            x.features,
                            list((None if x.tf is None else float(y) for y in x.tf.values)),
                              ))

In [202]:
#define schema of tf DF and transform rdd to DF
resSchema = StructType([       
    StructField('title', StringType(), True),
    StructField('features', 
                ArrayType(
                    StringType(),True
                ), True),
    StructField('tf', 
                ArrayType(
                    FloatType(),True
                ), True),
])

resDF = spark.createDataFrame(res, schema = resSchema)

In [203]:
resDF.show()
resDF.printSchema()

+--------------------+--------------------+--------------------+
|               title|            features|                  tf|
+--------------------+--------------------+--------------------+
|$\mathcal{laja}{-...|[$\mathcal{laja}{...|[1.0, 1.0, 1.0, 1...|
|1d self-attention...|[1d, self-attenti...|[1.0, 1.0, 1.0, 1...|
|26.5 a watt-level...|[26.5, a, watt-le...|[1.0, 1.0, 1.0, 1...|
|270-to-300ghz dou...|[270-to-300ghz, d...|[1.0, 1.0, 1.0, 1...|
|2d self-attention...|[2d, self-attenti...|[1.0, 1.0, 1.0, 1...|
|2lspe: 2d learnab...|[2lspe:, 2d, lear...|[1.0, 1.0, 1.0, 1...|
|3-d ultrasonic lo...|[3-d, ultrasonic,...|[1.0, 1.0, 1.0, 1...|
|3d deep attentive...|[3d, deep, attent...|[1.0, 2.0, 1.0, 1...|
|3d human pose est...|[3d, human, pose,...|[1.0, 1.0, 1.0, 1...|
|3d human texture ...|[3d, human, textu...|[1.0, 1.0, 1.0, 1...|
|3d medical point ...|[3d, medical, poi...|[1.0, 1.0, 1.0, 1...|
|3d object trackin...|[3d, object, trac...|[1.0, 1.0, 1.0, 1...|
|3d transformer-ga...|[3d

In [204]:
#explode features and tf values
resDF_explode = resDF.withColumn("new", arrays_zip("features", "tf"))\
             .withColumn("new", explode("new"))\
             .select("title", col("new.features").alias("features"), col("new.tf").alias("tf"))

In [205]:
resDF_explode.show()
resDF_explode.printSchema()
resDF_explode.createOrReplaceTempView("resDF_explodev")

+--------------------+-------------------+---+
|               title|           features| tf|
+--------------------+-------------------+---+
|$\mathcal{laja}{-...|$\mathcal{laja}{-}$|1.0|
|$\mathcal{laja}{-...|              label|1.0|
|$\mathcal{laja}{-...|          attention|1.0|
|$\mathcal{laja}{-...|        transformer|1.0|
|$\mathcal{laja}{-...|      architectures|1.0|
|$\mathcal{laja}{-...|                for|1.0|
|$\mathcal{laja}{-...|             icd-10|1.0|
|$\mathcal{laja}{-...|             coding|1.0|
|$\mathcal{laja}{-...|                 of|1.0|
|$\mathcal{laja}{-...|       unstructured|1.0|
|$\mathcal{laja}{-...|           clinical|1.0|
|$\mathcal{laja}{-...|              notes|1.0|
|1d self-attention...|                 1d|1.0|
|1d self-attention...|     self-attention|1.0|
|1d self-attention...|            network|1.0|
|1d self-attention...|                for|1.0|
|1d self-attention...|              point|1.0|
|1d self-attention...|              cloud|1.0|
|1d self-atte

In [206]:
# filter features for self-attention and transformer, then calculate the tf*idf value
dblp_tfidf =  spark.sql("select title, features, tf \
                        from resDF_explodev \
                        where lower(features) = 'self-attention' \
                        or lower(features) = 'transformer' ")

dblp_tfidf.printSchema()
dblp_tfidf.createOrReplaceTempView("dblp_tfidfv")

root
 |-- title: string (nullable = true)
 |-- features: string (nullable = true)
 |-- tf: float (nullable = true)



In [207]:
#calculate number of terms in each document
dblp_tfidf1 = spark.sql("select title, sum(tf) as total_terms\
                        from resDF_explodev \
                        group by title ")
dblp_tfidf1.printSchema()
dblp_tfidf1.createOrReplaceTempView("dblp_tfidf1v")

root
 |-- title: string (nullable = true)
 |-- total_terms: double (nullable = true)



In [208]:
dblp_tfidf1.show()

+--------------------+-----------+
|               title|total_terms|
+--------------------+-----------+
|a text autoencode...|       10.0|
|combining cnns wi...|       11.0|
|finetuning pretra...|        5.0|
|improving multili...|        9.0|
|knowledge graph c...|       10.0|
|latte: lstm self-...|       10.0|
|mt-transunet: med...|       12.0|
|multi-domain tran...|        8.0|
|ordering sentence...|       11.0|
|point cloud trans...|        7.0|
|r2d2: relational ...|        6.0|
|recursive non-aut...|       10.0|
|transformers and ...|       10.0|
|transvg: end-to-e...|        6.0|
|visual transforme...|        3.0|
|a transformer-bas...|        7.0|
|blind deinterleav...|       14.0|
|dct-net: a deep c...|       10.0|
|delving deep into...|       11.0|
|diabetic retinopa...|       10.0|
+--------------------+-----------+
only showing top 20 rows



In [209]:
#calculate number of self-attention in each document 
dblp_tfidf2 = spark.sql("select title, sum(tf) as total_sa_terms\
                        from resDF_explodev \
                        where lower(features) = 'self-attention' \
                        group by title ")
dblp_tfidf2.printSchema()
dblp_tfidf2.createOrReplaceTempView("dblp_tfidf2v")

root
 |-- title: string (nullable = true)
 |-- total_sa_terms: double (nullable = true)



In [210]:
dblp_tfidf2.show()

+--------------------+--------------+
|               title|total_sa_terms|
+--------------------+--------------+
|latte: lstm self-...|           1.0|
|blind deinterleav...|           1.0|
|predicting esopha...|           1.0|
|scsa-net: present...|           1.0|
|self-attention bi...|           1.0|
|dapnet: a double ...|           1.0|
|re-transformer: a...|           1.0|
|spatial context-a...|           1.0|
|dual-axial self-a...|           1.0|
|generalizing rnn-...|           1.0|
|local multi-head ...|           1.0|
|mgsan: a multi-gr...|           1.0|
|brain dynamics vi...|           1.0|
|two-hand pose est...|           1.0|
|a region descript...|           1.0|
|exploring neural ...|           1.0|
|multi-horizon ele...|           2.0|
|human parity on c...|           1.0|
|a korean named en...|           1.0|
|a mixed-domain se...|           1.0|
+--------------------+--------------+
only showing top 20 rows



In [211]:
#calculate number of documents where self-attention appears in
dblp_tfidf3 = spark.sql("select count(distinct title) as countp \
                        from resDF_explodev \
                        where lower(features) = 'self-attention' ")
dblp_tfidf3.printSchema()
dblp_tfidf3.createOrReplaceTempView("dblp_tfidf3v")

root
 |-- countp: long (nullable = false)



In [212]:
dblp_tfidf3.show()

+------+
|countp|
+------+
|   386|
+------+



In [213]:
#calculate number of transformer in each document 
dblp_tfidf4 = spark.sql("select title, sum(tf) as total_t_terms\
                        from resDF_explodev \
                        where lower(features) = 'transformer' \
                        group by title ")
dblp_tfidf4.printSchema()
dblp_tfidf4.createOrReplaceTempView("dblp_tfidf4v")

root
 |-- title: string (nullable = true)
 |-- total_t_terms: double (nullable = true)



In [214]:
dblp_tfidf4.show()

+--------------------+-------------+
|               title|total_t_terms|
+--------------------+-------------+
|a text autoencode...|          1.0|
|combining cnns wi...|          1.0|
|improving multili...|          1.0|
|knowledge graph c...|          1.0|
|recursive non-aut...|          1.0|
|visual transforme...|          1.0|
|dct-net: a deep c...|          1.0|
|diabetic retinopa...|          1.0|
|knowledge-enhance...|          1.0|
|mobilevit: light-...|          1.0|
|probabilistic tra...|          1.0|
|video summarizati...|          1.0|
|vision transforme...|          1.0|
|a differential rf...|          1.0|
|context-aware and...|          1.0|
|gaze estimation u...|          1.0|
|joint localizatio...|          1.0|
|on exploring atte...|          1.0|
|ruite: refining u...|          1.0|
|streaming simulta...|          1.0|
+--------------------+-------------+
only showing top 20 rows



In [215]:
#calculate number of documents where transformer appears in
dblp_tfidf5 = spark.sql("select count(distinct title) as countp \
                        from resDF_explodev \
                        where lower(features) = 'transformer' ")
dblp_tfidf5.printSchema()
dblp_tfidf5.createOrReplaceTempView("dblp_tfidf5v")

root
 |-- countp: long (nullable = false)



In [216]:
dblp_tfidf5.show()

+------+
|countp|
+------+
|  1529|
+------+



In [217]:
#calculate total number of documents
dblp_tfidf7 = spark.sql("select count(distinct title) as countp \
                        from dblp_filt2v ")
dblp_tfidf7.printSchema()
dblp_tfidf7.createOrReplaceTempView("dblp_tfidf7v")

root
 |-- countp: long (nullable = false)



In [218]:
dblp_tfidf7.show()

+------+
|countp|
+------+
|  3292|
+------+



In [219]:
#calculate each term in each doc count
dblp_tfidf6 = spark.sql("select d1.title, d1.total_terms, COALESCE(d2.total_sa_terms,0) as sa_count, COALESCE(d4.total_t_terms,0) as t_count  \
                        from dblp_tfidf1v d1 \
                        full outer join dblp_tfidf2v d2 \
                            on d1.title = d2.title \
                        full outer join dblp_tfidf4v d4 \
                            on d1.title = d4.title ")
dblp_tfidf6.printSchema()
dblp_tfidf6.createOrReplaceTempView("dblp_tfidf6v")

root
 |-- title: string (nullable = true)
 |-- total_terms: double (nullable = true)
 |-- sa_count: double (nullable = false)
 |-- t_count: double (nullable = false)



In [220]:
dblp_tfidf6.show()

+--------------------+-----------+--------+-------+
|               title|total_terms|sa_count|t_count|
+--------------------+-----------+--------+-------+
|a text autoencode...|       10.0|     0.0|    1.0|
|combining cnns wi...|       11.0|     0.0|    1.0|
|finetuning pretra...|        5.0|     0.0|    0.0|
|improving multili...|        9.0|     0.0|    1.0|
|knowledge graph c...|       10.0|     0.0|    1.0|
|latte: lstm self-...|       10.0|     1.0|    0.0|
|mt-transunet: med...|       12.0|     0.0|    0.0|
|multi-domain tran...|        8.0|     0.0|    0.0|
|ordering sentence...|       11.0|     0.0|    0.0|
|point cloud trans...|        7.0|     0.0|    0.0|
|r2d2: relational ...|        6.0|     0.0|    0.0|
|recursive non-aut...|       10.0|     0.0|    1.0|
|transformers and ...|       10.0|     0.0|    0.0|
|transvg: end-to-e...|        6.0|     0.0|    0.0|
|visual transforme...|        3.0|     0.0|    1.0|
|a transformer-bas...|        7.0|     0.0|    0.0|
|blind deint

### 4.4.2 - With RDD map reduce method <a name="4.4.2"></a>

In [228]:
tokenizer = Tokenizer(inputCol="title", outputCol="features")
dblp_filt3 = tokenizer.transform(dblp_filt2)

In [229]:
#(title, features)
lines = dblp_filt3.rdd.map(lambda x : (x.title,x.features))

In [230]:
#[(title,features),1]  : count of each feature per each title
map1 = lines.flatMap(lambda x: [((x[0],i),1) for i in x[1]])
map1.collect()

[(('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   '$\\mathcal{laja}{-}$'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'label'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'attention'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'transformer'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'architectures'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'for'),
  1),
 (('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
   'icd-10'),
  1),
 (('$\\ma

In [231]:
#[(title,features),1] -> [(title, features), TF:(1+1+1...)]  : sum of count of each feature by each title (group by title and feature)
reduce=map1.reduceByKey(lambda x,y:x+y)
reduce.collect()

[(('a 7.9-14.3ghz -243.3db fomt sub-sampling pll with transformer-based dual-mode vco in 40nm cmos',
   'dual-mode'),
  1),
 (('a comparative study of transformers on word sense disambiguation', 'word'),
  2),
 (('a differential rf front-end cmos transformer matching for ambient rf energy harvesting systems',
   'matching'),
  1),
 (('a fuzzy logic proposal for diagnosis multiple incipient faults in a power transformer',
   'multiple'),
  1),
 (('a modular multilevel converter (mmc) based solid-state transformer (sst) topology with simplified energy conversion process and magnetic integration',
   'integration'),
  1),
 (('a new model of transformer operation state evaluation based on analytic hierarchy process and association rule mining',
   'and'),
  1),
 (('a note on learning rare events in molecular dynamics using lstm and transformer',
   'rare'),
  1),
 (('a pi+passivity-based control of a wind energy conversion system enabled with a solid-state transformer',
   'system'),
  1),

In [232]:
#[(title, features), TF] -> [features, (title, TF)] 
tf=reduce.map(lambda x: (x[0][1],(x[0][0],x[1])))
tf.collect()

[('dual-mode',
  ('a 7.9-14.3ghz -243.3db fomt sub-sampling pll with transformer-based dual-mode vco in 40nm cmos',
   1)),
 ('word',
  ('a comparative study of transformers on word sense disambiguation', 2)),
 ('matching',
  ('a differential rf front-end cmos transformer matching for ambient rf energy harvesting systems',
   1)),
 ('multiple',
  ('a fuzzy logic proposal for diagnosis multiple incipient faults in a power transformer',
   1)),
 ('integration',
  ('a modular multilevel converter (mmc) based solid-state transformer (sst) topology with simplified energy conversion process and magnetic integration',
   1)),
 ('and',
  ('a new model of transformer operation state evaluation based on analytic hierarchy process and association rule mining',
   1)),
 ('rare',
  ('a note on learning rare events in molecular dynamics using lstm and transformer',
   1)),
 ('system',
  ('a pi+passivity-based control of a wind energy conversion system enabled with a solid-state transformer',
   1)),

In [233]:
#[(title, features), TF] -> [features, (title, TF , 1)]  : count 1 for every time feature appear in each title 
map3=reduce.map(lambda x: (x[0][1],(x[0][0],x[1],1)))
map3.collect()

[('dual-mode',
  ('a 7.9-14.3ghz -243.3db fomt sub-sampling pll with transformer-based dual-mode vco in 40nm cmos',
   1,
   1)),
 ('word',
  ('a comparative study of transformers on word sense disambiguation', 2, 1)),
 ('matching',
  ('a differential rf front-end cmos transformer matching for ambient rf energy harvesting systems',
   1,
   1)),
 ('multiple',
  ('a fuzzy logic proposal for diagnosis multiple incipient faults in a power transformer',
   1,
   1)),
 ('integration',
  ('a modular multilevel converter (mmc) based solid-state transformer (sst) topology with simplified energy conversion process and magnetic integration',
   1,
   1)),
 ('and',
  ('a new model of transformer operation state evaluation based on analytic hierarchy process and association rule mining',
   1,
   1)),
 ('rare',
  ('a note on learning rare events in molecular dynamics using lstm and transformer',
   1,
   1)),
 ('system',
  ('a pi+passivity-based control of a wind energy conversion system enabled w

In [234]:
#[features, (title, TF , 1)] -> (feature, 1) : count 1 for every time feature appear in each title 
map4=map3.map(lambda x:(x[0],x[1][2]))
map4.collect()

[('dual-mode', 1),
 ('word', 1),
 ('matching', 1),
 ('multiple', 1),
 ('integration', 1),
 ('and', 1),
 ('rare', 1),
 ('system', 1),
 ('training', 1),
 ('natural', 1),
 ('injection', 1),
 ('era', 1),
 ('mri', 1),
 ('verification', 1),
 ('more', 1),
 ('assessing', 1),
 ('transformer-based', 1),
 ('trial', 1),
 ('rail', 1),
 ('transformer', 1),
 ('ava:', 1),
 ('speaker', 1),
 ('extraction', 1),
 ('voltage', 1),
 ('bert', 1),
 ('visual', 1),
 ('vaccine', 1),
 ('transformers', 1),
 ('controllable', 1),
 ('computation-aware', 1),
 ('melody', 1),
 ('a', 1),
 ('sexism', 1),
 ('accelerated', 1),
 ('agent', 1),
 ('transformer-based', 1),
 ('on', 1),
 ('deeplpc-mhanet:', 1),
 ('method', 1),
 ('transformers', 1),
 ('transformer', 1),
 ('real-time', 1),
 ('semantic', 1),
 ('discriminative', 1),
 ('domain', 1),
 ('for', 1),
 ('classification', 1),
 ('strategies', 1),
 ('spotting', 1),
 ('lightweight', 1),
 ('power', 1),
 ('at', 1),
 ('dementia', 1),
 ('and', 1),
 ('via', 1),
 ('spaces', 1),
 ('of',

In [235]:
#(feature, occ_in_corpus(1+1+1...)) : sum of count of title with feature for each feature (group by feature)
reduce2=map4.reduceByKey(lambda x,y:x+y)
reduce2.collect()

[('10', 2),
 ('couplet', 1),
 ('an', 96),
 ('2d', 7),
 ('step', 2),
 ('quality', 19),
 ('matters', 1),
 ('policy', 2),
 ('carlo', 1),
 ('cognlp-sheffield', 1),
 ('solve', 4),
 ('maritime', 1),
 ('shell-type', 2),
 ('temgnet:', 1),
 ('standards', 1),
 ('doubler', 1),
 ('everything', 2),
 ('asr,', 1),
 ('plates', 1),
 ('am-lstm', 1),
 ('wide-range', 1),
 ('rooftops', 1),
 ('process:', 1),
 ('readonce', 1),
 ('colorectal', 1),
 ('orange', 1),
 ('restaurant', 1),
 ('cs-um6p', 1),
 ('sence', 1),
 ('unsupervised', 30),
 ('how', 11),
 ('media', 10),
 ('two-way', 1),
 ('leakage', 8),
 ('lm', 1),
 ('representational', 2),
 ('icd-10', 2),
 ('speeding', 1),
 ('ufo-vit:', 1),
 ('integrating', 6),
 ('task:', 7),
 ('marbert:', 1),
 ('nearest', 1),
 ('pilot:', 1),
 ('cosformer:', 1),
 ('soccer', 1),
 ('co-segmentation', 1),
 ('knn', 1),
 ('leads', 1),
 ('generic', 1),
 ('s2s-ft:', 1),
 ('reordering', 1),
 ('crops', 1),
 ('communicative', 2),
 ('entity-and-relation', 1),
 ('retraining', 1),
 ('japanes

In [236]:
import math
#(feature, idf)
totalcount = dblp_tfidf7.collect()[0][0]
idf=reduce2.map(lambda x: (x[0],math.log2((1+totalcount)/(1+x[1]))))
idf.collect()

[('10', 10.100224295874192),
 ('couplet', 10.685186796595348),
 ('an', 5.08527395440822),
 ('2d', 8.685186796595348),
 ('step', 10.100224295874192),
 ('quality', 7.363258701707985),
 ('matters', 10.685186796595348),
 ('policy', 10.100224295874192),
 ('carlo', 10.685186796595348),
 ('cognlp-sheffield', 10.685186796595348),
 ('solve', 9.363258701707986),
 ('maritime', 10.685186796595348),
 ('shell-type', 10.100224295874192),
 ('temgnet:', 10.685186796595348),
 ('standards', 10.685186796595348),
 ('doubler', 10.685186796595348),
 ('everything', 10.100224295874192),
 ('asr,', 10.685186796595348),
 ('plates', 10.685186796595348),
 ('am-lstm', 10.685186796595348),
 ('wide-range', 10.685186796595348),
 ('rooftops', 10.685186796595348),
 ('process:', 10.685186796595348),
 ('readonce', 10.685186796595348),
 ('colorectal', 10.685186796595348),
 ('orange', 10.685186796595348),
 ('restaurant', 10.685186796595348),
 ('cs-um6p', 10.685186796595348),
 ('sence', 10.685186796595348),
 ('unsupervised', 

In [237]:
#(feature, TF, idf)
rdd=tf.join(idf)
rdd.collect()

[('10',
  (('indt5: a text-to-text transformer for 10 indigenous languages', 1),
   10.100224295874192)),
 ('10',
  (('analysis of the influence of the breaking radiation magnetic field of a 10 kv intelligent circuit breaker on an electronic transformer',
    1),
   10.100224295874192)),
 ('couplet',
  (('transcouplet: transformer based chinese couplet generation', 1),
   10.685186796595348)),
 ('2d',
  (('a transformer architecture based on bert and 2d convolutional neural network to identify dna enhancers from sequence information',
    1),
   8.685186796595348)),
 ('2d',
  (('boxer: box-attention for 2d and 3d transformers', 1), 8.685186796595348)),
 ('2d',
  (('2d self-attention convolutional recurrent network for offline handwritten text recognition',
    1),
   8.685186796595348)),
 ('2d',
  (('portfolio optimization with 2d relative-attentional gated transformer',
    1),
   8.685186796595348)),
 ('2d',
  (('combining a parallel 2d cnn with a self-attention dilated residual netw

In [238]:
#[(title,(feature, tf, idf, tf*idf))] : calculate tf-idf
rdd=rdd.map(lambda x: (x[1][0][0],(x[0],x[1][0][1],x[1][1],x[1][0][1]*x[1][1]))).sortByKey()
rdd.collect()

[('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
  ('icd-10', 1, 10.100224295874192, 10.100224295874192)),
 ('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
  ('$\\mathcal{laja}{-}$', 1, 10.685186796595348, 10.685186796595348)),
 ('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
  ('for', 1, 1.0198508794101715, 1.0198508794101715)),
 ('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
  ('unstructured', 1, 10.685186796595348, 10.685186796595348)),
 ('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured clinical notes',
  ('architectures', 1, 7.515261795153036, 7.515261795153036)),
 ('$\\mathcal{laja}{-}$ label attention transformer architectures for icd-10 coding of unstructured cli

In [239]:
# put into dataframe
rdd=rdd.map(lambda x: (x[0],x[1][0],x[1][1],x[1][2],x[1][3]))
rdd.toDF(["title","feature","TF","IDF","TF-IDF"]).show()

+--------------------+-------------------+---+------------------+------------------+
|               title|            feature| TF|               IDF|            TF-IDF|
+--------------------+-------------------+---+------------------+------------------+
|$\mathcal{laja}{-...|             icd-10|  1|10.100224295874192|10.100224295874192|
|$\mathcal{laja}{-...|$\mathcal{laja}{-}$|  1|10.685186796595348|10.685186796595348|
|$\mathcal{laja}{-...|                for|  1|1.0198508794101715|1.0198508794101715|
|$\mathcal{laja}{-...|       unstructured|  1|10.685186796595348|10.685186796595348|
|$\mathcal{laja}{-...|      architectures|  1| 7.515261795153036| 7.515261795153036|
|$\mathcal{laja}{-...|             coding|  1| 9.100224295874192| 9.100224295874192|
|$\mathcal{laja}{-...|              notes|  1|10.685186796595348|10.685186796595348|
|$\mathcal{laja}{-...|          attention|  1| 4.707906873095431| 4.707906873095431|
|$\mathcal{laja}{-...|                 of|  1| 2.603037755241476|

In [240]:
rdddf = rdd.toDF(["title","feature","TF","IDF","TF-IDF"])
rdddf.createOrReplaceTempView("rdddfv")

In [241]:
rdddf2 = spark.sql("select r.title, r.feature, r.TF, r.IDF, r.TF-IDF, r.TF/d.total_terms as TF_norm, r.TF*r.IDF/d.total_terms as TF_IDF_norm  \
                    from rdddfv r \
                    join dblp_tfidf1v d \
                       on r.title = d.title ")
rdddf2.printSchema()
rdddf2.createOrReplaceTempView("rdddf2v")

root
 |-- title: string (nullable = true)
 |-- feature: string (nullable = true)
 |-- TF: long (nullable = true)
 |-- IDF: double (nullable = true)
 |-- (CAST(TF AS DOUBLE) - IDF): double (nullable = true)
 |-- TF_norm: double (nullable = true)
 |-- TF_IDF_norm: double (nullable = true)



In [242]:
#calculate sum of tf-idf of both self-attention and transformer
dblp_tfidf9 = spark.sql("select r.title, sum(r.TF-IDF) as TF_IDF, sum(r.TF_IDF_norm) as TF_IDF_norm \
                         from rdddf2v r \
                         where r.feature = 'self-attention' \
                             or r.feature = 'transformer' \
                         group by r.title \
                         order by TF_IDF_norm desc \
                         limit 10")
dblp_tfidf9.printSchema()
dblp_tfidf9.createOrReplaceTempView("dblp_tfidf9v")

root
 |-- title: string (nullable = true)
 |-- TF_IDF: double (nullable = true)
 |-- TF_IDF_norm: double (nullable = true)



In [243]:
#get results
dblp_tfidf10 = spark.sql("select d9.title, d.author._VALUE as author, d.year, d.journal_conference, d9.TF_IDF_norm \
                          from dblp_tfidf9v d9 \
                          join dblp_allv d \
                             on d9.title = d.title \
                          order by TF_IDF_norm desc")
dblp_tfidf10.printSchema()
dblp_tfidf10.createOrReplaceTempView("dblp_tfidf10v")

root
 |-- title: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- year: long (nullable = true)
 |-- journal_conference: string (nullable = true)
 |-- TF_IDF_norm: double (nullable = true)



In [244]:
#get results filter out corr
dblp_tfidf11 = spark.sql("select distinct title, min(author) author, min(year) year, first(journal_conference) journal_conference, min(TF_IDF_norm) TF_IDF\
                          from dblp_tfidf10v \
                          group by title \
                          order by TF_IDF desc")
dblp_tfidf11.printSchema()
dblp_tfidf11.createOrReplaceTempView("dblp_tfidf11v")

root
 |-- title: string (nullable = true)
 |-- author: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- year: long (nullable = true)
 |-- journal_conference: string (nullable = true)
 |-- TF_IDF: double (nullable = true)



### Query 4 result<a name="4.4.2.1"></a>

In [245]:
dblp_tfidf11.show()

+--------------------+--------------------+----+--------------------+------------------+
|               title|              author|year|  journal_conference|            TF_IDF|
+--------------------+--------------------+----+--------------------+------------------+
|relative molecule...|[Lukasz Maziarka,...|2021|                CoRR|1.0487169748665677|
|self-attention fo...|[Nathanaeumll Car...|2021|                MLSP|0.7722492601127343|
|self-attention ag...|[Rita Pucci, Chri...|2021|               ICCVW|0.7722492601127343|
|transformer in tr...|[Kai Han 0002, An...|2021|             NeurIPS|0.7372472393435556|
|lite vision trans...|[Chenglin Yang, Y...|2021|                CoRR|0.6991446499110451|
|synthesizer: reth...|[Yi Tay, Dara Bah...|2021|                ICML|0.6991446499110451|
|spatial self-atte...|[Adu Asare Baffou...|2021|J. Vis. Commun. I...|0.6177994080901874|
|session-based rec...|          [Jun Fang]|2021|                CoRR|0.6177994080901874|
|local-to-global s...

In [246]:
dblp_tfidf11.select('title').show(truncate = False)

+--------------------------------------------------------------------------------------------------+
|title                                                                                             |
+--------------------------------------------------------------------------------------------------+
|relative molecule self-attention transformer                                                      |
|self-attention for audio super-resolution                                                         |
|self-attention agreement among capsules                                                           |
|transformer in transformer                                                                        |
|synthesizer: rethinking self-attention for transformer models                                     |
|lite vision transformer with enhanced self-attention                                              |
|self-attention meta-learner for continual learning                                        

# 5- Preprocess XML to remove HTML escaped characters <a name="5"></a>

In [71]:
def getlines(path):
    file = open(path, 'r')
    Lines = file.readlines() 
    return Lines

In [72]:
path = 'dblp.xml'
Lines = getlines(path)

In [73]:
specdict2 = {"&amp;":"and",
"&lt;":"lessthan",
"&gt;":"greaterthan",
"&nbsp;":"nbsp",
"&iexcl;":"iexcl",
"&cent;":"cent",
"&pound;":"pound",
"&curren;":"curren",
"&yen;":"yen",
"&brvbar;":"brvbar",
"&sect;":"sect",
"&uml;":"uml",
"&copy;":"copy",
"&ordf;":"ordf",
"&laquo;":"laquo",
"&not;":"not",
"&shy;":"shy",
"&reg;":"reg",
"&macr;":"macr",
"&deg;":"deg",
"&plusmn;":"plusmn",
"&sup2;":"sup2",
"&sup3;":"sup3",
"&acute;":"acute",
"&micro;":"micro",
"&para;":"para",
"&middot;":"middot",
"&cedil;":"cedil",
"&sup1;":"sup1",
"&ordm;":"ordm",
"&raquo;":"raquo",
"&frac14;":"frac14",
"&frac12;":"frac12",
"&frac34;":"frac34",
"&iquest;":"iquest",
"&Agrave;":"Agrave",
"&Aacute;":"Aacute",
"&Acirc;":"Acirc",
"&Atilde;":"Atilde",
"&Auml;":"Auml",
"&Aring;":"Aring",
"&AElig;":"AElig",
"&Ccedil;":"Ccedil",
"&Egrave;":"Egrave",
"&Eacute;":"Eacute",
"&Ecirc;":"Ecirc",
"&Euml;":"Euml",
"&Igrave;":"Igrave",
"&Iacute;":"Iacute",
"&Icirc;":"Icirc",
"&Iuml;":"Iuml",
"&ETH;":"ETH",
"&Ntilde;":"Ntilde",
"&Ograve;":"Ograve",
"&Oacute;":"Oacute",
"&Ocirc;":"Ocirc",
"&Otilde;":"Otilde",
"&Ouml;":"Ouml",
"&times;":"times",
"&Oslash;":"Oslash",
"&Ugrave;":"Ugrave",
"&Uacute;":"Uacute",
"&Ucirc;":"Ucirc",
"&Uuml;":"Uuml",
"&Yacute;":"Yacute",
"&THORN;":"THORN",
"&szlig;":"szlig",
"&agrave;":"agrave",
"&aacute;":"aacute",
"&acirc;":"acirc",
"&atilde;":"atilde",
"&auml;":"auml",
"&aring;":"aring",
"&aelig;":"aelig",
"&ccedil;":"ccedil",
"&egrave;":"egrave",
"&eacute;":"eacute",
"&ecirc;":"ecirc",
"&euml;":"euml",
"&igrave;":"igrave",
"&iacute;":"iacute",
"&icirc;":"icirc",
"&iuml;":"iuml",
"&eth;":"eth",
"&ntilde;":"ntilde",
"&ograve;":"ograve",
"&oacute;":"oacute",
"&ocirc;":"ocirc",
"&otilde;":"otilde",
"&ouml;":"ouml",
"&divide;":"divide",
"&oslash;":"oslash",
"&ugrave;":"ugrave",
"&uacute;":"uacute",
"&ucirc;":"ucirc",
"&uuml;":"uuml",
"&yacute;":"yacute",
"&thorn;":"thorn",
"&yuml;":"yuml",
"&fnof;":"fnof",
"&Alpha;":"Alpha",
"&Beta;":"Beta",
"&Gamma;":"Gamma",
"&Delta;":"Delta",
"&Epsilon;":"Epsilon",
"&Zeta;":"Zeta",
"&Eta;":"Eta",
"&Theta;":"Theta",
"&Iota;":"Iota",
"&Kappa;":"Kappa",
"&Lambda;":"Lambda",
"&Mu;":"Mu",
"&Nu;":"Nu",
"&Xi;":"Xi",
"&Omicron;":"Omicron",
"&Pi;":"Pi",
"&Rho;":"Rho",
"&Sigma;":"Sigma",
"&Tau;":"Tau",
"&Upsilon;":"Upsilon",
"&Phi;":"Phi",
"&Chi;":"Chi",
"&Psi;":"Psi",
"&Omega;":"Omega",
"&alpha;":"alpha",
"&beta;":"beta",
"&gamma;":"gamma",
"&delta;":"delta",
"&epsilon;":"epsilon",
"&zeta;":"zeta",
"&eta;":"eta",
"&theta;":"theta",
"&iota;":"iota",
"&kappa;":"kappa",
"&lambda;":"lambda",
"&mu;":"mu",
"&nu;":"nu",
"&xi;":"xi",
"&omicron;":"omicron",
"&pi;":"pi",
"&rho;":"rho",
"&sigmaf;":"sigmaf",
"&sigma;":"sigma",
"&tau;":"tau",
"&upsilon;":"upsilon",
"&phi;":"phi",
"&chi;":"chi",
"&psi;":"psi",
"&omega;":"omega",
"&thetasym;":"thetasym",
"&upsih;":"upsih",
"&piv;":"piv",
"&bull;":"bull",
"&hellip;":"hellip",
"&prime;":"prime",
"&Prime;":"Prime",
"&oline;":"oline",
"&frasl;":"frasl",
"&weierp;":"weierp",
"&image;":"image",
"&real;":"real",
"&trade;":"trade",
"&alefsym;":"alefsym",
"&larr;":"larr",
"&uarr;":"uarr",
"&rarr;":"rarr",
"&darr;":"darr",
"&harr;":"harr",
"&crarr;":"crarr",
"&lArr;":"lArr",
"&uArr;":"uArr",
"&rArr;":"rArr",
"&dArr;":"dArr",
"&hArr;":"hArr",
"&forall;":"forall",
"&part;":"part",
"&exist;":"exist",
"&empty;":"empty",
"&nabla;":"nabla",
"&isin;":"isin",
"&notin;":"notin",
"&ni;":"ni",
"&prod;":"prod",
"&sum;":"sum",
"&minus;":"minus",
"&lowast;":"lowast",
"&radic;":"radic",
"&prop;":"prop",
"&infin;":"infin",
"&ang;":"ang",
"&and;":"and",
"&or;":"or",
"&cap;":"cap",
"&cup;":"cup",
"&int;":"int",
"&there4;":"there4",
"&sim;":"sim",
"&cong;":"cong",
"&asymp;":"asymp",
"&ne;":"ne",
"&equiv;":"equiv",
"&le;":"le",
"&ge;":"ge",
"&sub;":"sub",
"&sup;":"sup",
"&nsub;":"nsub",
"&sube;":"sube",
"&supe;":"supe",
"&oplus;":"oplus",
"&otimes;":"otimes",
"&perp;":"perp",
"&sdot;":"sdot",
"&lceil;":"lceil",
"&rceil;":"rceil",
"&lfloor;":"lfloor",
"&rfloor;":"rfloor",
"&lang;":"lang",
"&rang;":"rang",
"&loz;":"loz",
"&spades;":"spades",
"&clubs;":"clubs",
"&hearts;":"hearts",
"&diams;":"diams",
}

In [74]:
from tqdm.notebook import tqdm
tlist = tqdm(Lines)
for i, line in enumerate(tlist):
    for k,v in specdict2.items():
        while k in line:
            line = line.replace(k, v)
    Lines[i] = line

  0%|          | 0/87653933 [00:00<?, ?it/s]

In [75]:
with open('dblp_cleaned.txt','w') as f:
    for line in Lines:
        f.write(line)

In [None]:
#not used
import html
from tqdm.notebook import tqdm
tlist = tqdm(Lines)
for i, line in enumerate(tlist):
    line = html.unescape(line)  #replace escaped characters with utf-8 special characters
    line = line.encode("ascii", "replace") #replace utf-8 special characters with question mark
    line = line.decode(encoding="utf-8", errors="ignore")
    line = line.replace('&', ' and ')
    Lines[i] = line