In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
import matplotlib.pyplot as plt
from pyspark.sql.functions import udf
from pyspark.ml.feature import StopWordsRemover
from graphframes import *
%matplotlib inline

In [2]:
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
%%capture
spark = SparkSession.builder.enableHiveSupport().appName('FinalProject_graph').getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

In [4]:
df = spark.read.csv("gs://bdp_group6_bckt_2/data/processed_data/processed_data.parquet",
                    inferSchema=True, header=True)

In [5]:
df.printSchema()

root
 |-- paper_id: string (nullable = true)
 |-- cord_uid: string (nullable = true)
 |-- source_x: string (nullable = true)
 |-- doi: string (nullable = true)
 |-- pmcid: string (nullable = true)
 |-- pubmed_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- journal: string (nullable = true)
 |-- arxiv_id: double (nullable = true)
 |-- pdf_json_files: string (nullable = true)
 |-- pmc_json_files: string (nullable = true)
 |-- title: string (nullable = true)
 |-- abstract: string (nullable = true)
 |-- authors: string (nullable = true)



In [7]:
from pyspark.sql.functions import split, col,lit
df3 = df.select(split(col("authors"),"; ").alias("authors_distinct"),col("journal"),col('paper_id'))
df3 = df3.select("*", F.explode("authors_distinct").alias("author"))
#df3_explore = df3.groupBy("source").count().orderBy("count", ascending=False).toPandas()
#df3_explore = df3_explore.head(5)

In [9]:
from pyspark.sql.functions import udf, col, lower, regexp_replace,expr
user12 = df3.select('journal','paper_id', (lower(regexp_replace('author', "[^a-zA-Z\\s]", "")).alias('authors')))
user12

journal,paper_id,authors
JCO Oncol Pract,PMC8202122,patt debra a
JCO Oncol Pract,PMC8202122,wilfong lalan
JCO Oncol Pract,PMC8202122,toth sara
JCO Oncol Pract,PMC8202122,broussard stephanie
JCO Oncol Pract,PMC8202122,kanipe kristen
JCO Oncol Pract,PMC8202122,hammonds jason
JCO Oncol Pract,PMC8202122,allen victoria
JCO Oncol Pract,PMC8202122,mautner beatrice
JCO Oncol Pract,PMC8202122,campbell nakedra
JCO Oncol Pract,PMC8202122,dubey ajay k


In [11]:
user12 = user12.withColumn('concat_author', regexp_replace('authors', ' ', '_'))

In [12]:
user12

journal,paper_id,authors,concat_author
JCO Oncol Pract,PMC8202122,patt debra a,patt_debra_a
JCO Oncol Pract,PMC8202122,wilfong lalan,wilfong_lalan
JCO Oncol Pract,PMC8202122,toth sara,toth_sara
JCO Oncol Pract,PMC8202122,broussard stephanie,broussard_stephanie
JCO Oncol Pract,PMC8202122,kanipe kristen,kanipe_kristen
JCO Oncol Pract,PMC8202122,hammonds jason,hammonds_jason
JCO Oncol Pract,PMC8202122,allen victoria,allen_victoria
JCO Oncol Pract,PMC8202122,mautner beatrice,mautner_beatrice
JCO Oncol Pract,PMC8202122,campbell nakedra,campbell_nakedra
JCO Oncol Pract,PMC8202122,dubey ajay k,dubey_ajay_k


In [13]:
journal = user12.select('journal','paper_id')
journal = journal.withColumn("relationship", lit("belongs_to"))
journal

journal,paper_id,relationship
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to


In [40]:
authors2 = user12.select('concat_author','paper_id')
authors2 = authors2.withColumn("relationship", lit("published"))
authors2

concat_author,paper_id,relationship
patt_debra_a,PMC8202122,published
wilfong_lalan,PMC8202122,published
toth_sara,PMC8202122,published
broussard_stephanie,PMC8202122,published
kanipe_kristen,PMC8202122,published
hammonds_jason,PMC8202122,published
allen_victoria,PMC8202122,published
mautner_beatrice,PMC8202122,published
campbell_nakedra,PMC8202122,published
dubey_ajay_k,PMC8202122,published


In [24]:
user1 = user12.drop('authors')

In [25]:
user1 = user1.drop('paper_id')

In [26]:
user1 = user1.withColumn("type", lit("author"))

In [27]:
user1 = user1.drop('journal')

In [28]:
user1


concat_author,type
patt_debra_a,author
wilfong_lalan,author
toth_sara,author
broussard_stephanie,author
kanipe_kristen,author
hammonds_jason,author
allen_victoria,author
mautner_beatrice,author
campbell_nakedra,author
dubey_ajay_k,author


In [29]:
user2 = df3.select('paper_id')
user2 = user2.withColumn("type", lit("paper"))
user3 = df3.select('journal')
user3 = user3.withColumn("type", lit("journal"))

In [30]:
user2

paper_id,type
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper
PMC8202122,paper


In [31]:
user3

journal,type
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal
JCO Oncol Pract,journal


In [32]:
user1 = user1.withColumnRenamed("concat_author", "id")
user2 = user2.withColumnRenamed("paper_id", "id")
user3 = user3.withColumnRenamed("journal", "id")

In [33]:
from pyspark.sql.functions import split, col,lit
user1 = user1.withColumn('id', col('id')).distinct()
user2 = user2.withColumn('id', col('id')).distinct()
user3 = user3.withColumn('id', col('id')).distinct()

In [34]:
users_un = user1.unionAll(user2)
users = users_un.unionAll(user3)

In [44]:
journal

src,dst,relationship
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to
JCO Oncol Pract,PMC8202122,belongs_to


In [42]:
journal = journal.withColumnRenamed("journal", "src")
journal = journal.withColumnRenamed("paper_id", "dst")
authors2 = authors2.withColumnRenamed("concat_author", "src")
authors2 = authors2.withColumnRenamed("paper_id", "dst")

In [45]:
dfm = authors2.unionAll(journal)

In [46]:
dfm

src,dst,relationship
patt_debra_a,PMC8202122,published
wilfong_lalan,PMC8202122,published
toth_sara,PMC8202122,published
broussard_stephanie,PMC8202122,published
kanipe_kristen,PMC8202122,published
hammonds_jason,PMC8202122,published
allen_victoria,PMC8202122,published
mautner_beatrice,PMC8202122,published
campbell_nakedra,PMC8202122,published
dubey_ajay_k,PMC8202122,published


In [48]:
dfm.count()

1339762

In [49]:
dfm

src,dst,relationship
patt_debra_a,PMC8202122,published
wilfong_lalan,PMC8202122,published
toth_sara,PMC8202122,published
broussard_stephanie,PMC8202122,published
kanipe_kristen,PMC8202122,published
hammonds_jason,PMC8202122,published
allen_victoria,PMC8202122,published
mautner_beatrice,PMC8202122,published
campbell_nakedra,PMC8202122,published
dubey_ajay_k,PMC8202122,published


In [58]:
def PlotGraph(edge_list):
    Gplot=nx.Graph()
    for row in edge_list.select('src','dst').take(1000):
        Gplot.add_edge(row['src'],row['dst'])

    plt.subplot(121)
    nx.draw(Gplot, with_labels=True, font_weight='bold')

In [51]:
g = GraphFrame(users, dfm)

In [52]:
g.vertices.show()

+--------------------+------+
|                  id|  type|
+--------------------+------+
|           su_john_r|author|
|arcidiacono_paolo...|author|
| imborek_katherine_l|author|
|            wang_ran|author|
|         yan_jinting|author|
|  tsotras_vassilis_j|author|
|   collins_elizabeth|author|
|         welch_kelly|author|
|        li_shaoshuai|author|
|       weber_k_scott|author|
|    whiteside_lauren|author|
|      sang_charlie_j|author|
|        day_andrew_g|author|
|    terzis_gerasimos|author|
|      wakgari_negash|author|
|        lang_vanessa|author|
|   poptsov_alexander|author|
|wan_afifudeen_che...|author|
|        wang_xiaocen|author|
|             wang_qi|author|
+--------------------+------+
only showing top 20 rows



In [53]:
g.edges.show()

+--------------------+----------+------------+
|                 src|       dst|relationship|
+--------------------+----------+------------+
|        patt_debra_a|PMC8202122|   published|
|       wilfong_lalan|PMC8202122|   published|
|           toth_sara|PMC8202122|   published|
| broussard_stephanie|PMC8202122|   published|
|      kanipe_kristen|PMC8202122|   published|
|      hammonds_jason|PMC8202122|   published|
|      allen_victoria|PMC8202122|   published|
|    mautner_beatrice|PMC8202122|   published|
|    campbell_nakedra|PMC8202122|   published|
|        dubey_ajay_k|PMC8202122|   published|
|             wu_nini|PMC8202122|   published|
|     neubauer_marcus|PMC8202122|   published|
|         jones_ben_s|PMC8202122|   published|
|    paulson_r_steven|PMC8202122|   published|
|cunha_marcela_jul...|PMC8210641|   published|
|pinto_carlos_augu...|PMC8210641|   published|
|guerra_joo_carlos...|PMC8210641|   published|
|   tachibana_adriano|PMC8210641|   published|
|portugal_mar

In [54]:
g.degrees.orderBy('degree', ascending=False).show()

+--------------------+------+
|                  id|degree|
+--------------------+------+
|Int J Environ Res...| 22422|
|            PLoS One| 16202|
|             Sci Rep| 11771|
|       Front Immunol| 10025|
|          PMC8344569|  9300|
|          J Clin Med|  8489|
|             Viruses|  7995|
|    Vaccines (Basel)|  7583|
| Front Public Health|  7381|
|       Front Psychol|  6875|
|       Int J Mol Sci|  6789|
|              Cureus|  6100|
|Front Med (Lausanne)|  5908|
|                null|  5632|
|           Br J Surg|  4761|
|           Nutrients|  4365|
|      JAMA Netw Open|  4149|
|    Front Psychiatry|  4041|
|  Healthcare (Basel)|  4034|
|     Front Pharmacol|  3950|
+--------------------+------+
only showing top 20 rows



In [55]:
g.inDegrees.orderBy('inDegree', ascending=False).show()

+----------+--------+
|        id|inDegree|
+----------+--------+
|PMC8344569|    9300|
|PMC8231091|    2128|
|PMC8429803|    1478|
|PMC8711291|    1286|
|PMC8586736|    1158|
|PMC8653629|    1142|
|PMC8358924|     904|
|PMC8323543|     866|
|PMC8457757|     826|
|PMC8728233|     792|
|PMC8275367|     716|
|PMC8384927|     452|
|PMC9098237|     446|
|PMC8928018|     392|
|PMC8521484|     362|
|PMC8764723|     360|
|PMC8459899|     318|
|PMC8609892|     314|
|PMC8389395|     312|
|PMC9040372|     290|
+----------+--------+
only showing top 20 rows



In [56]:
g.outDegrees.orderBy('outDegree', ascending=False).show()

+--------------------+---------+
|                  id|outDegree|
+--------------------+---------+
|Int J Environ Res...|    22422|
|            PLoS One|    16202|
|             Sci Rep|    11771|
|       Front Immunol|    10025|
|          J Clin Med|     8489|
|             Viruses|     7995|
|    Vaccines (Basel)|     7583|
| Front Public Health|     7381|
|       Front Psychol|     6875|
|       Int J Mol Sci|     6789|
|              Cureus|     6100|
|Front Med (Lausanne)|     5908|
|                null|     5632|
|           Br J Surg|     4761|
|           Nutrients|     4365|
|      JAMA Netw Open|     4149|
|    Front Psychiatry|     4041|
|  Healthcare (Basel)|     4034|
|     Front Pharmacol|     3950|
|   BMC Public Health|     3891|
+--------------------+---------+
only showing top 20 rows

