# Notebook for simplifying and cleaning Web of Science data 
- In the xml to parquet step, we just dumped the xml into parquet format, without cleaning it up or anything. It is very messy and redundant, and now that it is in a fast format, we can clean it up.
- This notebook specifically gets a cleanly formatted table of publications out of the data. For authors and their information, there is a separate notebook (as a single paper may have multiple authors).

## Setup

In [1]:
from pyspark.sql import SQLContext
#import pandas as pd
from pyspark.sql.functions import *
import pyspark.sql
import string
# Make column/field resolution case sensitive: the identifier structs in this
# data carry both a `_VALUE` and a `_value` field, which differ only by case.
spark.sql('set spark.sql.caseSensitive=true')

DataFrame[key: string, value: string]

In [2]:
#import matplotlib.pyplot as plt
#%matplotlib inline

In [3]:
# Entry point used below for reading parquet files. `sc` is not defined in
# this notebook; it is expected to be supplied by the Spark kernel.
# NOTE(review): `spark` is already used in the setup cell and presumably
# offers the same `.read` API — consider using it instead of a SQLContext.
sqlC = SQLContext(sc)

## Read data

In [4]:
# Load the xml-to-parquet dump; the cells below progressively narrow `raw`.
raw = sqlC.read.parquet("wos_core.parquet")
#cites.count()

In [5]:
raw.printSchema()

root
 |-- UID: string (nullable = true)
 |-- _r_id_disclaimer: string (nullable = true)
 |-- abstracts: struct (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- abstract: struct (nullable = true)
 |    |    |-- abstract_text: struct (nullable = true)
 |    |    |    |-- _count: long (nullable = true)
 |    |    |    |-- p: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |-- addresses: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- address_name: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- address_spec: struct (nullable = true)
 |    |    |    |    |-- _addr_no: long (nullable = true)
 |    |    |    |    |-- city: string (nullable = true)
 |    |    |    |    |-- country: string (nullable = true)
 |    |    |    |    |-- full_address: string (nullable = true)
 |    |    |    |    |-- organizations: struct 

In [6]:
#raw.count()

In [7]:
# 70461165
#raw.distinct().count()

In [8]:
# 67047061
#raw.select('UID').distinct().count()

In [9]:
#raw = raw.dropDuplicates(subset=["UID"])

## First, select just the columns that have useful information
- Much of the WoS data set has redundant or empty data structures. Finding that is a matter of poking around in the data, not shown here.
- Note that this selects some nested material such as "identifiers" inside of "cluster_related" inside of "dynamic_data". After this, "identifiers" is now a top level column, no longer nested.

In [10]:
# Paths of the fields worth keeping. Selecting a nested path promotes its last
# component to a top-level column of the same name (e.g. "identifiers").
wanted_paths = [
    "UID",
    'dynamic_data.cluster_related.identifiers',
    "static_data.fullrecord_metadata.normalized_languages",
    "static_data.fullrecord_metadata.normalized_doctypes",
    "static_data.fullrecord_metadata.abstracts",
    "static_data.fullrecord_metadata.category_info",
    "static_data.fullrecord_metadata.keywords",
    "static_data.fullrecord_metadata.references",
    "static_data.summary.pub_info",
    "static_data.summary.titles",
]
raw = raw.select(*wanted_paths)

## combine paragraphs
- Abstracts are actually a list of paragraphs in this data. That's annoying. This code glues them back together as a single block of text / string, with spaces in between each.

In [11]:
# The abstract is stored as an array of paragraphs; join them with single
# spaces into one string column.
column = "abstracts.abstract.abstract_text.p"
abstract_text = concat_ws(" ", column)
raw = raw.withColumn('full_abstract', abstract_text)

#raw.select("UID", "full_abstract").show()

## selecting English articles
- Here we glue the language list together and then select all the papers that have "english" listed.

In [12]:
# Flatten the language list into one space-separated string, then keep only
# rows whose string contains "nglish". The pattern omits the first letter, so
# the capitalisation of the leading "E" does not matter.
language_names = concat_ws(" ", "normalized_languages.language._VALUE")
raw = (
    raw
    .withColumn('all_lang', language_names)
    .filter("all_lang like '%nglish%'")
)
#raw.select('all_lang').distinct().show()

## More unnesting
- Here we pull more nested info out into columns for simplicity.
- For some, such as keywords, we take them out of lists and stick them together as strings

In [13]:
# Scalar attributes copied straight out of the pub_info struct.
scalar_fields = [
    ('pubyear', "pub_info._pubyear"),
    ('has_abstract', "pub_info._has_abstract"),
    ('pubtype', "pub_info._pubtype"),
]
for new_name, source_path in scalar_fields:
    raw = raw.withColumn(new_name, col(source_path))

# List-valued fields collapsed into one lower-cased, ";"-separated string.
# 'keywords' reuses the name of the existing struct column, so that column is
# replaced in place by its flattened form.
list_fields = [
    ('keywords', "keywords.keyword"),
    ('subjects', "category_info.subjects.subject._VALUE"),
    ('subheadings', "category_info.subheadings.subheading"),
    ('headings', "category_info.headings.heading"),
]
for new_name, list_path in list_fields:
    raw = raw.withColumn(new_name, lower(concat_ws(";", list_path)))

raw.printSchema()

root
 |-- UID: string (nullable = true)
 |-- identifiers: struct (nullable = true)
 |    |-- identifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _value: string (nullable = true)
 |-- normalized_languages: struct (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- language: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |-- normalized_doctypes: struct (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- doctype: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |-- abstracts: struct (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- abstract: struct (nullable = true)
 |    |    |-- abstract_text: struct (nullable = t

In [14]:
raw.columns

['UID',
 'identifiers',
 'normalized_languages',
 'normalized_doctypes',
 'abstracts',
 'category_info',
 'keywords',
 'references',
 'pub_info',
 'titles',
 'full_abstract',
 'all_lang',
 'pubyear',
 'has_abstract',
 'pubtype',
 'subjects',
 'subheadings',
 'headings']

## Select just the columns of interest
- now that we have pulled more information out of the nested data, we can drop the columns we do not need anymore.

In [15]:
# Columns carried forward. The nested source structs (normalized_languages,
# normalized_doctypes, abstracts, category_info, pub_info) are left out now
# that their useful parts live in flat columns.
kept_columns = [
    'UID',
    'identifiers',
    'keywords',
    'references',
    'titles',
    'full_abstract',
    'all_lang',
    'pubyear',
    'has_abstract',
    'pubtype',
    'subjects',
    'subheadings',
    'headings',
]
raw = raw.select(*kept_columns)

raw.printSchema()

root
 |-- UID: string (nullable = true)
 |-- identifiers: struct (nullable = true)
 |    |-- identifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _value: string (nullable = true)
 |-- keywords: string (nullable = false)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- assignee: string (nullable = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- e

## drop duplicates
- at this point there are some redundancies. E.g. a paper in the database twice. If any rows are completely identical to one another, they get de-duplicated here so that we have just one of each.

In [16]:
# No subset is given, so only rows identical in every column are collapsed.
# NOTE(review): rows that share a UID but differ in any column survive this
# step — confirm that is intended.
raw = raw.dropDuplicates()

## save
- We are not done yet, but we have made a lot of complicated changes and dropped a lot of unnecessary data. At this point, it is good to save an intermediate file so that pyspark doesn't have to recompute all of that over and over. It can just work from this checkpoint.

In [17]:
# Checkpoint the slimmed-down table: at most 10000 records per output file,
# replacing any previous copy at the same path.
skinny_writer = raw.write.option("maxRecordsPerFile", 10000)
skinny_writer.parquet("wos_core_skinny.parquet", mode='overwrite')
print("done")

done


In [18]:
# Reload from the checkpoint file so the following cells work from the saved
# data instead of recomputing the steps applied to `raw`.
cites = sqlC.read.parquet("wos_core_skinny.parquet")
cites.printSchema()

root
 |-- UID: string (nullable = true)
 |-- identifiers: struct (nullable = true)
 |    |-- identifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _value: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- assignee: string (nullable = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- el

In [19]:
cites.count()

67810850

## clean up titles and publications
- Titles have multiple components inside them.
- We don't want to just stick them together like strings.
- We use the "explode" operation to get all the items in a title as their own variables. 
- Then we filter/select just the ones of the right type, and grab the value (text) of the title, and rename it to something useful.

In [20]:
# One row per (UID, title entry): explode the titles array, spread the entry's
# fields into columns, keep the entries typed 'item', and expose their text as
# `item_title`. The result is cached.
title_entries = cites.select("UID", explode("titles.title").alias("t"))
titles = (
    title_entries
    .select("UID", "t.*")
    .filter("_type == 'item'")
    .select('UID', '_VALUE')
    .withColumnRenamed('_VALUE', 'item_title')
    .cache()
)

#titles.show()

In [21]:
# Same recipe as for item titles, but keeping the title entries typed 'source'
# and exposing their text as `journal`. The result is cached.
source_entries = cites.select("UID", explode("titles.title").alias("t"))
journals = (
    source_entries
    .select("UID", "t.*")
    .filter("_type == 'source'")
    .select('UID', '_VALUE')
    .withColumnRenamed('_VALUE', 'journal')
    .cache()
)

#titles.show()

In [None]:
# commented out because names are handled better elsewhere now

#names = cites.select("UID", 
#                      explode("names.name"
#                             ).alias("n")
#                     ).select("UID", "n.*"
#                             ).filter("_role == 'author' or _role == 'book_editor'"
#                                     )[['UID', 'full_name']]

#names = names.groupBy("UID").agg(concat_ws("; ", collect_list("full_name")
#                                          ).alias("all_authors")).cache()

#names.show()

In [23]:
cites.printSchema()

root
 |-- UID: string (nullable = true)
 |-- identifiers: struct (nullable = true)
 |    |-- identifier: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- _VALUE: string (nullable = true)
 |    |    |    |-- _type: string (nullable = true)
 |    |    |    |-- _value: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- assignee: string (nullable = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- el

## Repeat with document ID numbers

In [24]:
cites.select('UID', explode('identifiers.identifier').alias('i')
          ).select('i.*').show()

+------+------------+--------------------+
|_VALUE|       _type|              _value|
+------+------------+--------------------+
|  null|accession_no|               BHT88|
|  null|        issn|           2161-8070|
|  null|        isbn|   978-1-4244-1153-5|
|  null|accession_no|               188AB|
|  null|        issn|           0004-3273|
|  null|accession_no|               BGF16|
|  null|        issn|           0302-9743|
|  null|        isbn|   978-3-540-72359-2|
|  null|accession_no|               155YR|
|  null|        issn|           1052-3928|
|  null|         doi|10.1061/(ASCE)105...|
|  null|accession_no|               BFT91|
|  null|        isbn|   978-1-84628-388-8|
|  null|accession_no|               139NJ|
|  null|        issn|           0027-8424|
|  null|         doi|10.1073/pnas.0609...|
|  null|accession_no|               BJI42|
|  null|        isbn|   978-1-59593-616-5|
|  null|accession_no|               153GJ|
|  null|        issn|           0031-9228|
+------+---

In [25]:
# Long-format identifier table: one row per (UID, identifier), with the
# identifier struct's fields spread into their own columns.
exploded_ids = cites.select('UID', explode('identifiers.identifier').alias('i'))
ids = exploded_ids.select('UID', "i.*")

ids.show()

+-------------------+------+------------+--------------------+
|                UID|_VALUE|       _type|              _value|
+-------------------+------+------------+--------------------+
|WOS:000256345500042|  null|accession_no|               BHT88|
|WOS:000256345500042|  null|        issn|           2161-8070|
|WOS:000256345500042|  null|        isbn|   978-1-4244-1153-5|
|WOS:000247889600055|  null|accession_no|               188AB|
|WOS:000247889600055|  null|        issn|           0004-3273|
|WOS:000246397700022|  null|accession_no|               BGF16|
|WOS:000246397700022|  null|        issn|           0302-9743|
|WOS:000246397700022|  null|        isbn|   978-3-540-72359-2|
|WOS:000245615300006|  null|accession_no|               155YR|
|WOS:000245615300006|  null|        issn|           1052-3928|
|WOS:000245615300006|  null|         doi|10.1061/(ASCE)105...|
|WOS:000244562700008|  null|accession_no|               BFT91|
|WOS:000244562700008|  null|        isbn|   978-1-84628

In [26]:
def _ids_of_type(id_type):
    """Return the (UID, <id_type>) rows of `ids` whose `_type` equals `id_type`.

    The identifier's `_value` column is renamed to `id_type` itself.
    """
    matching = ids.filter(col('_type') == id_type)
    return matching.select('UID', col('_value').alias(id_type))


issn = _ids_of_type('issn')
eissn = _ids_of_type('eissn')
isbn = _ids_of_type('isbn')
eisbn = _ids_of_type('eisbn')
doi = _ids_of_type('doi')

isbn.show()

+-------------------+-----------------+
|                UID|             isbn|
+-------------------+-----------------+
|WOS:000256345500042|978-1-4244-1153-5|
|WOS:000246397700022|978-3-540-72359-2|
|WOS:000244562700008|978-1-84628-388-8|
|WOS:000266108700054|978-1-59593-616-5|
|WOS:000252242500008|978-0-415-43725-7|
|WOS:000250479200017|978-0-7695-2871-7|
|WOS:000251345402139|978-1-4244-1296-9|
|WOS:000253372500088|978-0-7695-3072-7|
|WOS:000246019900044|978-3-540-71208-4|
|WOS:000282366200009|978-1-4039-9678-7|
|WOS:000251608403133|978-1-4244-0920-4|
|WOS:000253876500098|*****************|
|WOS:000250380200032|978-3-540-74455-9|
|WOS:000186628100095|    0-7803-7979-9|
|WOS:000186578000033|    1-932415-05-X|
|WOS:000226277800016|    92-0-103603-5|
|WOS:000189396300036|    980-6560-01-9|
|WOS:000188739500144|    7-5053-5066-8|
|WOS:000185702500149|    0-7803-7636-6|
|WOS:000185702800117|    981-238-391-3|
+-------------------+-----------------+
only showing top 20 rows



|   accession_no|65062168|
|           issn|59884348|
|            doi|20653602|
|          eissn|11217854|
|           isbn| 5726378|
|          eisbn| 3105514|
|         art_no| 3102947|
|    meeting_abs| 2349679|
|parent_book_doi

## Merge our new columns back into our data

In [27]:
# Attach each per-UID lookup table with a left join, so publications that lack
# a given field are kept. The order of the tuple fixes the order in which the
# new columns are appended. (abstracts and names are no longer joined here.)
# NOTE(review): a UID with several matches in a lookup table yields one output
# row per match — confirm this fan-out is acceptable.
for lookup in (titles, journals, issn, isbn, eissn, eisbn, doi):
    cites = cites.join(lookup, on="UID", how='left')

## Make a "bare text" column
- Sometimes I just want to search all documents' titles, abstracts, and keywords for something of interest. Having them in a column together makes that easy, so this makes a new column where they're all crammed into the same string. 
- For further searching simplicity, I make that column lower case and replace all punctuation with spaces. This "bare text" column is really just a bag of words now, which is helpful for some applications I will use later. 

In [None]:
#this shows there aren't any funky quotes. yay.
#cites.filter(col('full_abstract').contains("“")).select('full_abstract').show()

In [29]:
# Replacement string for translate(): one space for every character in
# string.punctuation, so each punctuation character is mapped to a space.
spaces = " " * len(string.punctuation)

# Title, abstract and keywords glued together with spaces and lower-cased.
searchable_text = lower(concat_ws(" ",
                                  cites.item_title,
                                  cites.full_abstract,
                                  cites.keywords))

# `bare_text` is the searchable text with all ASCII punctuation blanked out.
cites = cites.withColumn('bare_text',
                         translate(searchable_text, string.punctuation, spaces))

#cites.select('bare_text').show()

In [30]:
# Collapse rows that are identical in every column (no subset is given).
# NOTE(review): this runs while the nested `identifiers` and `titles` columns
# are still present, so rows differing only in those columns are kept —
# confirm whether that is intended.
cites = cites.dropDuplicates()

In [31]:
cites.columns

['UID',
 'identifiers',
 'keywords',
 'references',
 'titles',
 'full_abstract',
 'all_lang',
 'pubyear',
 'has_abstract',
 'pubtype',
 'subjects',
 'subheadings',
 'headings',
 'item_title',
 'journal',
 'issn',
 'isbn',
 'eissn',
 'eisbn',
 'doi',
 'bare_text']

## Select columns
- again we will select just the columns of interest and drop the things we no longer have use for. 
- Note that the format of the data has no nesting at all any more except in references. If we dropped that column, our data would be a simple table like you see in excel. 

In [32]:
# Final column set: everything is flat except `references`. The nested
# `identifiers` and `titles` structs are dropped because their useful parts
# now live in the item_title / journal / issn / isbn / eissn / eisbn / doi
# columns.
final_columns = [
    'UID',
    'keywords',
    'references',
    'full_abstract',
    'all_lang',
    'pubyear',
    'has_abstract',
    'pubtype',
    'subjects',
    'subheadings',
    'headings',
    'item_title',
    'journal',
    'issn',
    'isbn',
    'eissn',
    'eisbn',
    'doi',
    'bare_text',
]

# De-duplicate AFTER projecting: rows that differed only in a column dropped
# here become identical once that column is gone, and would otherwise be
# written out as exact duplicates.
cites = cites.select(*final_columns).dropDuplicates()
cites.printSchema()

root
 |-- UID: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- assignee: string (nullable = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- page: string (nullable = true)
 |    |    |    |-- patent_no: string (nullable = true)
 |    |    |    |-- sub: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- sup: array (nullable = tr

## Save

In [33]:
# Write the cleaned table: at most 10000 records per output file, replacing
# any previous copy at the same path.
clean_writer = cites.write.option("maxRecordsPerFile", 10000)
clean_writer.parquet("wos_core_clean.parquet", mode='overwrite')

In [34]:
# Read the saved file back and count its rows as a check on what was written.
cites = sqlC.read.parquet("wos_core_clean.parquet")
cites.count()

68252914

In [35]:
print("done")

done
