# Merge data from SJR and WoS
- Scimagio Journal Rank data cleaned in another notebook: SJR_clean_parquet.ipynb
- this notebook merges it with the Web of Science

## setup

In [1]:
from pyspark.sql import SQLContext
#import pandas as pd
from pyspark.sql.functions import *
import pyspark.sql
import string
sqlC = SQLContext(sc)
print('done')

done


## load data

In [2]:
wos = sqlC.read.parquet('wos_with_loc.parquet')
wos = wos.drop_duplicates(subset=['UID'])
wos.printSchema()

root
 |-- UID: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- page: string (nullable = true)
 |    |    |    |-- patent_no: string (nullable = true)
 |    |    |    |-- sub: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- sup: array (nullable = true)
 |    |    |    |    |-- element: string (containsN

In [3]:
wos = wos.repartition(1000)

In [4]:
wos.count()

61785079

In [59]:
sjr = sqlC.read.parquet('sjr_ranks.parquet')
sjr.printSchema()

root
 |-- rank: string (nullable = true)
 |-- sourceid: string (nullable = true)
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- issn: string (nullable = true)
 |-- sjr: string (nullable = true)
 |-- sjr_best_quartile: string (nullable = true)
 |-- h_index: string (nullable = true)
 |-- total_docs_2003: string (nullable = true)
 |-- total_docs_3years: string (nullable = true)
 |-- total_refs: string (nullable = true)
 |-- total_cites_3years: string (nullable = true)
 |-- citable_docs_3years: string (nullable = true)
 |-- cites__doc_2years: string (nullable = true)
 |-- ref__doc: string (nullable = true)
 |-- country: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- year: string (nullable = true)
 |-- quartile: string (nullable = true)



In [60]:
#sjr.select('sourceid', 'issn', 'title', 'type').show()

##  clean up SJR data a bit more
- Some journals have multiple ISSNs, separated by commas. This code splits those into separate data entries so we can match on them. It also standardizes the ISSN numbers to remove the optional hyphen. 

In [61]:
sjr = sjr.select('rank', 'sourceid', col('title').alias('j_title'), 
                 'type', 'issn', 'sjr',
                 'h_index', 'cites__doc_2years', 'ref__doc',
                 'country', 'publisher', 'categories',
                 col('year').alias('j_stats_year'), 
                 col('quartile').alias('j_quartile'))

sjr = sjr.withColumn('issn',  regexp_replace('issn', '-', ''))
sjr = sjr.withColumn('id', monotonically_increasing_id())

In [62]:
sjr = sjr.select('rank', 'id', 'j_title', 'type',
                 explode(split(col('issn'), ', ')).alias('issn'), 
                 'sjr', 'h_index',
                 'cites__doc_2years', 'ref__doc', 'country',
                 'publisher', 'categories', 'j_stats_year',
                 'j_quartile')

def to_null(c):
    return when(~(col(c).isNull() | isnan(col(c)) | (trim(col(c)) == "")), col(c))

sjr = sjr.withColumn('issn', to_null('issn'))

sjr.select('id', 'issn', 'j_title', 'type').show()

+---+--------+--------------------+-----------+
| id|    issn|             j_title|       type|
+---+--------+--------------------+-----------+
|  0|07320582|Annual Review of ...|    journal|
|  0|15453278|Annual Review of ...|    journal|
|  1|15454509|Annual Review of ...|    journal|
|  1|00664154|Annual Review of ...|    journal|
|  2|15308995|Annual Review of ...|book series|
|  2|10810706|Annual Review of ...|book series|
|  3|00346861|Reviews of Modern...|    journal|
|  3|15390756|Reviews of Modern...|    journal|
|  4|15454126|Annual Review of ...|book series|
|  4| 0147006|Annual Review of ...|book series|
|  5|00928674|                Cell|    journal|
|  5|10974172|                Cell|    journal|
|  6|15424863|CA - A Cancer Jou...|    journal|
|  6|00079235|CA - A Cancer Jou...|    journal|
|  7|15292908|   Nature Immunology|    journal|
|  7|15292916|   Nature Immunology|    journal|
|  8|15221210|Physiological Rev...|    journal|
|  8|00319333|Physiological Rev...|    j

In [63]:
sjr.groupby('issn').count().sort(desc('count')).show()

+--------+------+
|    issn| count|
+--------+------+
|    null|107552|
|00214922|    45|
|15417719|    43|
|10258973|    39|
|16740068|    37|
|02749696|    35|
|00189162|    35|
|10004874|    29|
|14324334|    28|
|00296570|    28|
| 0004783|    28|
|18125425|    27|
|00393681|    27|
|03174956|    27|
|18125387|    27|
|10823409|    26|
|18753507|    26|
|16725123|    26|
|15376516|    25|
|21867275|    25|
+--------+------+
only showing top 20 rows



## ISSN cleaning

In [64]:
#wos.select('uid', 'issn', 'eissn').describe().show()

In [65]:
#wos.filter(col('issn').isNotNull() | col('eissn').isNotNull()
#          ).sample(False, 0.01).select('issn', 'eissn').show()

In [66]:
tmp = sjr.filter(col('issn').isNotNull())
wos_skinny = wos.select('UID','issn', 'eissn', 'item_title', 'journal', 'pubyear')
wos_skinny = wos_skinny.withColumn('issn', regexp_replace('issn', '-', ''))
wos_skinny = wos_skinny.withColumn('eissn', regexp_replace('eissn', '-', ''))

## matching
- Journals may be matched on their ISSN, eISSN, or title. This code tries to match all of them in order.
- Each year a journal's rank changes. This code tries to match exact year, but the SJR rankings only go back to 1999. When an exact year match is not possible, it tried to approximate with another year. 

### pandas approach for reference

fb1 = fb[fb.pubyear >= 1999].merge(journals, left_on=['journal', 'pubyear'], right_on=['Title', 'year'], how='inner')
fb2 = fb[fb.pubyear < 1999].merge(journals[journals.year == 1999], left_on='journal', right_on='Title', how='inner')
fb3 = fb[(~fb.UID.isin(fb1.UID))&(~fb.UID.isin(fb2.UID))]
jt = journals.sort_values(by='year', ascending=False).drop_duplicates(subset=['Title'], keep='first')
fb3 = fb3.merge(jt, left_on='journal', right_on='Title', how='left')

#todo find source of dupes
fb = pd.concat([fb1,fb2,fb3]).drop_duplicates(subset=['UID'])

In [67]:
t2 = tmp.select('id', 'issn', col('j_stats_year').alias('pubyear'), 'j_title')
a = wos_skinny.join(t2, on=['issn', 'pubyear'], how='inner')
a.select('issn', 'pubyear', 'journal', 'j_title').show()

+--------+-------+--------------------+--------------------+
|    issn|pubyear|             journal|             j_title|
+--------+-------+--------------------+--------------------+
|09601481|   1999|    RENEWABLE ENERGY|    Renewable Energy|
|00457930|   1999|  COMPUTERS & FLUIDS|Computers and Fluids|
|00207403|   1999|INTERNATIONAL JOU...|International Jou...|
|00207462|   1999|INTERNATIONAL JOU...|International Jou...|
|00323861|   1999|             POLYMER|             Polymer|
|00323861|   1999|             POLYMER|             Polymer|
|00323861|   1999|             POLYMER|             Polymer|
|03064522|   1999|        NEUROSCIENCE|        Neuroscience|
|00218995|   1999|JOURNAL OF APPLIE...|Journal of Applie...|
|10813004|   1999|JOURNAL OF ADOLES...|Journal of Adoles...|
|09462716|   1999|JOURNAL OF MOLECU...|Journal of Molecu...|
|87560437|   1999|SEMINARS IN SURGI...|Seminars in Surgi...|
|00963003|   1999|APPLIED MATHEMATI...|Applied Mathemati...|
|01764276|   1999|CONSTR

In [68]:
a.count()

26732396

In [69]:
t2 = tmp.select('id', col('issn').alias('eissn'), col('j_stats_year').alias('pubyear'), 'j_title')
b = wos_skinny.join(t2, on=['eissn', 'pubyear'], how='inner')
b.select('issn', 'pubyear', 'journal', 'j_title').show()

+--------+-------+--------------------+--------------------+
|    issn|pubyear|             journal|             j_title|
+--------+-------+--------------------+--------------------+
|03051048|   1999|NUCLEIC ACIDS RES...|Nucleic Acids Res...|
|00223476|   1999|JOURNAL OF PEDIAT...|Journal of Pediat...|
|00218995|   1999|JOURNAL OF APPLIE...|Journal of Applie...|
|00916749|   1999|JOURNAL OF ALLERG...|Journal of Allerg...|
|00220272|   1999|JOURNAL OF CURRIC...|Journal of Curric...|
|00740276|   1999|MEMORIAS DO INSTI...|Memorias do Insti...|
|03345114|   1999|           SYMBIOSIS|           Symbiosis|
|08888892|   1999|CONSERVATION BIOLOGY|Conservation Biology|
|00220949|   1999|JOURNAL OF EXPERI...|Journal of Experi...|
|00092673|   1999|BULLETIN OF THE C...|Bulletin of the C...|
|10465928|   1999|PROTEIN EXPRESSIO...|Protein Expressio...|
|00223476|   1999|JOURNAL OF PEDIAT...|Journal of Pediat...|
|24700010|   1999|   PHYSICAL REVIEW D|   Physical Review D|
|00063592|   1999|BIOTEC

In [70]:
b.count()

3910289

In [71]:
done = a.select('id', 'UID').unionAll(b.select('id', 'UID')).dropDuplicates()
done.cache()
print(done.count())
print(done.dropDuplicates(subset=['UID']).count())

27215321
27144656


In [72]:
print(wos_skinny.count())
wos_skinny = wos_skinny.join(done, on='UID', how='left_anti')
wos_skinny.count()

61785079


34640423

In [73]:
t2 = tmp.filter(col('j_stats_year') == 1999).select('id', 'issn', 'j_title')
c = wos_skinny.join(t2, on=['issn'], how='inner')
c.select('issn', 'pubyear', 'journal', 'j_title').show()

+--------+-------+--------------------+--------------------+
|    issn|pubyear|             journal|             j_title|
+--------+-------+--------------------+--------------------+
|00377333|   1997|         SMITHSONIAN|         Smithsonian|
|00096407|   1997|      CHURCH HISTORY|      Church History|
|00274224|   1997|     MUSIC & LETTERS|   Music and Letters|
|01624962|   1997|BIOGRAPHY-AN INTE...|           Biography|
|00130796|   1997|   ECUMENICAL REVIEW|   Ecumenical Review|
|03061078|   1997|         EARLY MUSIC|         Early Music|
|00238031|   1997|LANDSCAPE ARCHITE...|Landscape Archite...|
|10427961|   1997|JOURNAL OF WOMENS...|Journal of women'...|
|00071250|   1997|BRITISH JOURNAL O...|British Journal o...|
|00778923|   1996|ENZYME ENGINEERIN...|Annals of the New...|
|00778923|   1996|ENZYME ENGINEERIN...|Annals of the New...|
|00778923|   1996|MYOCARDIAL PRESER...|Annals of the New...|
|00778923|   1996|MYOCARDIAL PRESER...|Annals of the New...|
|00139580|   1997|      

In [74]:
c.count()

16595988

In [75]:
t2 = tmp.filter(col('j_stats_year') == 1999).select('id', col('issn').alias('eissn'), 'j_title')
d = wos_skinny.join(t2, on=['eissn'], how='inner')
d.select('issn', 'pubyear', 'journal', 'j_title').show()

+--------+-------+--------------------+--------------------+
|    issn|pubyear|             journal|             j_title|
+--------+-------+--------------------+--------------------+
|03063674|   1997|BRITISH JOURNAL O...|British Journal o...|
|0002726X|   1997|AMERICAN ANNALS O...|American Annals o...|
|1063651X|   1997|   PHYSICAL REVIEW E|  Physical review. E|
|09108327|   1997|   HEART AND VESSELS|   Heart and Vessels|
|10637761|   1997|JOURNAL OF EXPERI...|Journal of Experi...|
|00092673|   1997|BULLETIN OF THE C...|Bulletin of the C...|
|13645021|   1998|PROCEEDINGS OF TH...|Proceedings of th...|
|01676806|   1997|BREAST CANCER RES...|Breast Cancer Res...|
|07302312|   1998|JOURNAL OF CELLUL...|Journal of Cellul...|
|0006291X|   1998|BIOCHEMICAL AND B...|Biochemical and B...|
|00221120|   1998|JOURNAL OF FLUID ...|Journal of Fluid ...|
|00036951|   1998|APPLIED PHYSICS L...|Applied Physics L...|
|09168451|   1998|BIOSCIENCE BIOTEC...|Bioscience, Biote...|
|0022538X|   1998| JOURN

In [76]:
d.count()

379089

In [77]:
tmp_done = c.select('id', 'UID').unionAll(d.select('id', 'UID')).dropDuplicates()
tmp_done.count()

16630998

In [78]:
done = done.unionAll(tmp_done).dropDuplicates(subset=['UID'])
done.count()

43737676

In [79]:
print(wos_skinny.count())
wos_skinny = wos_skinny.join(done, on='UID', how='left_anti')
wos_skinny.count()

34640423


18047403

In [80]:
t2 = tmp.filter(col('j_stats_year') == 2017).select('id', 'issn', 'j_title')
e = wos_skinny.join(t2, on=['issn'], how='inner')
e.count()

1082584

In [81]:
done = done.unionAll(e.select('id', 'UID')).dropDuplicates(subset=['UID'])
done.count()

44818666

## trying title matching below

In [82]:
print(wos_skinny.count())
wos_skinny = wos_skinny.join(done, on='UID', how='left_anti')
wos_skinny.count()

18047403


16966413

In [83]:
wos_skinny.columns

['UID', 'issn', 'eissn', 'item_title', 'journal', 'pubyear']

In [84]:
t2 = sjr.select('id',
                col('j_stats_year').alias('pubyear'), 
                col('j_title').alias('journal'))

spaces = ""
for i in string.punctuation:
    spaces += " "

t2 = t2.withColumn('journal', lower(regexp_replace('journal', "'", '')))
t2 = t2.withColumn('journal', regexp_replace('journal', " & ", ' and '))
t2 = t2.withColumn('journal', regexp_replace('journal', ', vol \d+', ''))
t2 = t2.withColumn('journal', regexp_replace('journal', ', volume \d+', ''))
t2 = t2.withColumn('journal', trim(translate(col('journal'), string.punctuation, spaces)))
t2 = t2.withColumn('journal', regexp_replace('journal', '  ', ' '))
t2 = t2.withColumn('journal', regexp_replace('journal', '  ', ' '))
t2.cache()
#t2.sample(False, 0.1).select('journal').take(20)

wos_skinny = wos_skinny.withColumn('journal', lower(regexp_replace('journal', "'", '')))
wos_skinny = wos_skinny.withColumn('journal', regexp_replace('journal', " & ", ' and '))
wos_skinny = wos_skinny.withColumn('journal', regexp_replace('journal', ', vol \d+', ''))
wos_skinny = wos_skinny.withColumn('journal', regexp_replace('journal', ', volume \d+', ''))
wos_skinny = wos_skinny.withColumn('journal', trim(translate(col('journal'), string.punctuation, spaces)))
wos_skinny = wos_skinny.withColumn('journal', regexp_replace('journal', '  ', ' '))
wos_skinny = wos_skinny.withColumn('journal', regexp_replace('journal', '  ', ' '))
wos_skinny.cache()
#wos_skinny.sample(False, 0.1).select('journal').take(20)

DataFrame[UID: string, issn: string, eissn: string, item_title: string, journal: string, pubyear: bigint]

In [85]:
f = wos_skinny.join(t2, on=['journal', 'pubyear'], how='inner')
f.count()

3734277

In [86]:
done = done.unionAll(f.select('id', 'UID')).dropDuplicates(subset=['UID'])
done.cache()
done.count()

47506069

In [87]:
print(wos_skinny.count())
wos_skinny = wos_skinny.join(done, on='UID', how='left_anti')
wos_skinny.count()

16966413


14279010

In [88]:
t3 = t2.filter(col('pubyear') == 1999)
g = wos_skinny.join(t3, on=['journal'], how='inner')
g.count()

2838194

In [89]:
done = done.unionAll(g.select('id', 'UID')).dropDuplicates(subset=['UID'])
done.count()

49444290

In [90]:
print(wos_skinny.count())
wos_skinny = wos_skinny.join(done, on='UID', how='left_anti')
wos_skinny.count()

14279010


12340789

In [91]:
t3 = t2.filter(col('pubyear') == 2017)
h = wos_skinny.join(t3, on=['journal'], how='inner')
h.count()

358312

In [92]:
done = done.unionAll(h.select('id', 'UID')).dropDuplicates(subset=['UID'])
done.count()

49747057

## bringing it all together

In [93]:
tmp = sjr.select(col('rank').alias('j_rank'), 'id', 'j_title', 'sjr', 
                 'h_index', 'cites__doc_2years',  'country', 'publisher',
                 'categories', 'j_quartile').dropDuplicates(subset=['id'])

done = done.join(tmp, on='id', how='left')
done = done.dropDuplicates(subset=['UID'])
done.printSchema()

root
 |-- id: long (nullable = false)
 |-- UID: string (nullable = true)
 |-- j_rank: string (nullable = true)
 |-- j_title: string (nullable = true)
 |-- sjr: string (nullable = true)
 |-- h_index: string (nullable = true)
 |-- cites__doc_2years: string (nullable = true)
 |-- country: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- j_quartile: string (nullable = true)



In [94]:
done.write.parquet('small_wos_sjr_tmp.parquet', mode='overwrite')

In [21]:
done = sqlC.read.parquet('small_wos_sjr_tmp.parquet')
done = done.dropDuplicates(subset=['UID'])
done.count()

49747057

In [22]:
done = done.select('UID', 'j_rank', 'sjr', 'h_index',
                   'cites__doc_2years', 'country', 'publisher',
                   'categories', 'j_quartile')

In [23]:
done = done.repartition(1000)
done.rdd.getNumPartitions()

1000

In [24]:
wos2 = wos.join(done, on='UID', how='left')
#wos2.printSchema()

In [25]:
wos2 = wos2.repartition(1000)
wos2.rdd.getNumPartitions()

1000

In [26]:
wos2.write.parquet('wos_with_loc_sjr.parquet', mode='overwrite')

## Save

In [27]:
wos2 = sqlC.read.parquet('wos_with_loc_sjr.parquet')
wos2.printSchema()

root
 |-- UID: string (nullable = true)
 |-- keywords: string (nullable = true)
 |-- references: struct (nullable = true)
 |    |-- _VALUE: string (nullable = true)
 |    |-- _count: long (nullable = true)
 |    |-- citedWork: string (nullable = true)
 |    |-- reference: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- citedAuthor: string (nullable = true)
 |    |    |    |-- citedTitle: string (nullable = true)
 |    |    |    |-- citedWork: string (nullable = true)
 |    |    |    |-- doi: string (nullable = true)
 |    |    |    |-- i: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- page: string (nullable = true)
 |    |    |    |-- patent_no: string (nullable = true)
 |    |    |    |-- sub: array (nullable = true)
 |    |    |    |    |-- element: string (containsNull = true)
 |    |    |    |-- sup: array (nullable = true)
 |    |    |    |    |-- element: string (containsN

## Summarize

In [28]:
wos2.count()

61785079

In [29]:
df = wos2.describe().toPandas()
df

Unnamed: 0,summary,UID,keywords,full_abstract,all_lang,pubyear,has_abstract,pubtype,subjects,subheadings,...,loc_subject,loc_bare_text,j_rank,sjr,h_index,cites__doc_2years,country,publisher,categories,j_quartile
0,count,61785079,61785079,61785079,61785079,61785079.0,61785079,61785079,61785079,61785079,...,1797994,1820336,49747057.0,48883810.0,49747057.0,49747057.0,49746574,48640202,49747057,48078664.0
1,mean,,2.85714285714285728E17,5.352941176470588,,1997.029839146115,,,,,...,,,5443.183626882692,1.56718835182854,130.7008273876382,2.597706033745839,,,,1.5243097229157616
2,stddev,,2.3904572186687882E18,3.1412811174461175,,18.032686345726244,,,,,...,,,6561.988529932467,2.2967343355214864,145.92085360382134,3.284882208086771,,,,0.8415846545585174
3,min,WOS:000003907500001,,,Arabic English,1900.0,N,Book,,,...,"3',5'-Cyclic-AMP Phosphodiesterases",a plaine and easie waie to remedie a horse ...,1.0,0.1,0.0,0.0,Argentina,"""Instituto San Jose de Calasanz"""" de Pedagogia...",Accounting (Q1),1.0
4,max,WOS:A1998YL55600003,~celtic sea;integrated vms and logbook data;mi...,||The growth characteristics of GaInNAs quantu...,Welsh English,2018.0,Y,Journal,zoology;zoology,technology;physical sciences;life sciences & b...,...,p-divisible groups.,ʻabbasid studies occasional papers of the sc...,9999.0,9.995,99.0,98.03,Zimbabwe,universitatea de vest,Water Science and Technology (Q4),4.0


In [33]:
df.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
UID,61785079,,,WOS:000003907500001,WOS:A1998YL55600003
keywords,61785079,2.85714285714285728E17,2.3904572186687882E18,,~celtic sea;integrated vms and logbook data;mi...
full_abstract,61785079,5.352941176470588,3.1412811174461175,,||The growth characteristics of GaInNAs quantu...
all_lang,61785079,,,Arabic English,Welsh English
pubyear,61785079,1997.029839146115,18.032686345726244,1900,2018
has_abstract,61785079,,,N,Y
pubtype,61785079,,,Book,Journal
subjects,61785079,,,,zoology;zoology
subheadings,61785079,,,,technology;physical sciences;life sciences & b...
