# Setting up 

## Starting Spark NLP session

In [1]:
import os

# Install Java 8 (headless JDK); stdout of apt-get is discarded to keep the cell output short
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
# Point JAVA_HOME at the JDK and put its bin/ directory first on PATH.
# NOTE(review): the path presumably matches where the package above installs
# the JDK on this image -- verify if the base image changes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
# Print the version of the `java` executable the shell resolves
! java -version

# Install pyspark, pinned to 2.4.4
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP, pinned to 2.5.2
! pip install --ignore-installed spark-nlp==2.5.2

# Start a SparkSession through Spark NLP's helper; `spark` is the session handle
import sparknlp
spark = sparknlp.start()

# Report the library versions that are actually in use
print("Spark NLP version")
print(sparknlp.version())
print("Apache Spark version")
print(spark.version)

openjdk version "1.8.0_275"
OpenJDK Runtime Environment (build 1.8.0_275-8u275-b01-0ubuntu1~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.275-b01, mixed mode)
Collecting pyspark==2.4.4
[?25l  Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)
[K     |████████████████████████████████| 215.7MB 26kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 51.8MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130389 sha256=d675d346ff58e035ac48bccc1bfdc8756beb1cf1c62b4ca6fe681e7e2595d737
  Stored in directory: /root/.cache/pip/wheels/ab/09/4d/0d18423005

'2.4.4'

## Imports

In [2]:
import pandas as pd
from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.embeddings import *
from sparknlp.common import *
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [3]:
# Set up the contexts needed to build Spark dataframes, so that text can be
# sent through Spark NLP easily.
# The SQLContext created here is what later turns the pandas dataframe into a
# Spark dataframe.
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Reuse the running SparkContext if one exists, otherwise create it
sc =SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

## Downloading and setting up SparkNLP pipeline

In [4]:
# Spark NLP pipeline: document -> token -> normalized -> cleanTokens -> lemma,
# plus BERT word embeddings.
# Each stage is written as a parenthesised method chain so that no line relies
# on a trailing backslash continuation.

# Reads the raw "text" column and writes "document"
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "token"
tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# "token" -> "normalized"
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# "normalized" -> "cleanTokens" (stop words removed)
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
)

# "cleanTokens" -> "lemma", using the pretrained 'lemma_antbnc' model
lemma = (
    LemmatizerModel.pretrained('lemma_antbnc')
    .setInputCols(["cleanTokens"])
    .setOutputCol("lemma")
)

# BERT embeddings written to "embeddings".
# NOTE(review): this stage reads "token", not "lemma", so the normalized /
# cleanTokens / lemma columns are produced but are not what BERT embeds --
# confirm this is intended.
# NOTE(review): a *cased* checkpoint is loaded with case sensitivity switched
# off -- confirm this combination is intended.
word_embeddings = (
    BertEmbeddings.pretrained('bert_base_cased', 'en')
    .setInputCols(["document", "token"])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Stages run in list order
pipeline_stages = [
    document_assembler,
    tokenizer,
    normalizer,
    stopwords_cleaner,
    lemma,
    word_embeddings,
]
bert_pipeline = Pipeline().setStages(pipeline_stages)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
bert_base_cased download started this may take some time.
Approximate size to download 389.2 MB
[OK!]


## Downloading dataset

In [5]:
# Mount Google Drive and derive the dataset directory from the mount point.
from google.colab import drive

# Absolute location where Drive is mounted inside the Colab VM
DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

# Anchor the dataset directory at the mount point. A path relative to the
# working directory ('drive/My Drive/...') only resolves when the notebook's
# cwd happens to be /content; the absolute form works from any cwd.
# The trailing slash is kept because file names are appended by concatenation.
data_path = DRIVE_MOUNT_POINT + 'My Drive/projects/data/bxsci-cord-19/'

Mounted at /content/drive/


In [6]:
# Load the filtered CORD-19 dataframe from its pickle and keep only the "text"
# column, so that later text processing does not touch the article IDs or the
# labels saying which part of the article each text came from.
# NOTE(review): unpickling can execute code embedded in the file -- only load
# pickles that come from a trusted source.
Cord_19_dataframe = (
    pd.read_pickle(data_path + 'Cord_19_filtered.pkl')
    .loc[0:, ["text"]]
)
Cord_19_dataframe

Unnamed: 0,text
0,Next-generation sequencing is increasingly bei...
1,There are many different methods that characte...
2,The analytical question motivating a particula...
3,Metagenomic classification methods are based o...
4,"Alignment to large nucleotide database Huson, ..."
...,...
3155578,Managing severe acute respiratory syndrome (SA...
3155579,Illustration of the known and potential host r...
3155580,Summary of MERS-CoV shedding and presence of v...
3155581,"Summary of clinical signs, pathological findin..."


In [7]:
# Convert the pandas dataframe into a Spark dataframe so it can be fed to the
# Spark NLP pipeline.
Cord_Spark_dataframe = sqlContext.createDataFrame(Cord_19_dataframe)
# Print a preview of the first rows
Cord_Spark_dataframe.show()

+--------------------+
|                text|
+--------------------+
|Next-generation s...|
|There are many di...|
|The analytical qu...|
|Metagenomic class...|
|Alignment to larg...|
|To understand the...|
|The unique identi...|
|The analysis is c...|
|The example shown...|
|The nomenclature ...|
|Steps to construc...|
|1. Select a set o...|
|The files contain...|
|       To run SIANN:|
|1. Select a pre-m...|
|The performance o...|
|Organisms were sp...|
|Each program outp...|
|For each method, ...|
|The relationship ...|
+--------------------+
only showing top 20 rows



# Normalizing, removing stop words, lemmatizing and producing BERT embeddings

In [8]:
# Run the whole text-processing pipeline (tokenizing through BERT embeddings).
# The pipeline is always fed only the "text" column. On the first run that is
# the entire dataframe; on a re-run it drops the annotation columns a previous
# run stored under the same variable name, so the pipeline is never handed a
# frame that already contains its own output columns.
cord_text_only = Cord_Spark_dataframe.select("text")
bert_model = bert_pipeline.fit(cord_text_only)
Cord_Spark_dataframe = bert_model.transform(cord_text_only)

In [9]:
# Let's look at the dataframe now that the pipeline has added its annotation
# columns; show() prints a preview of the first rows.
Cord_Spark_dataframe.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|               token|          normalized|         cleanTokens|               lemma|          embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Next-generation s...|[[document, 0, 12...|[[token, 0, 14, N...|[[token, 0, 13, N...|[[token, 0, 13, N...|[[token, 0, 13, N...|[[word_embeddings...|
|There are many di...|[[document, 0, 11...|[[token, 0, 4, Th...|[[token, 0, 4, Th...|[[token, 10, 13, ...|[[token, 10, 13, ...|[[word_embeddings...|
|The analytical qu...|[[document, 0, 91...|[[token, 0, 2, Th...|[[token, 0, 2, Th...|[[token, 4, 13, a...|[[token, 4, 13, a...|[[word_embeddings...|
|Metagenomic class...|[[document, 0, 34...|[[token, 0, 10, M...|[[token, 0, 10, M...|[[token, 0, 10, M...|

The `embeddings` column, all the way to the right, contains the embedding of each word in the text. I will pull the embeddings out of the dataframe so that we can look at them as a list and verify that we have vectorized representations of our words.

Called without arguments, `.head()` returns only the first row. To select other rows we would need `.collect()`, which brings the entire dataframe into memory, something Spark dataframes are not designed for. Because of memory constraints, `.collect()` only works for small data, but all of our embeddings are still available in the dataframe to work with.

In [17]:
# Embedding of the first word of the first row.
#
# Fields are addressed by name rather than by position, so the lookup keeps
# working if the pipeline gains or reorders columns:
#   head()          -> the first row of the dataframe
#   ["embeddings"]  -> the embeddings column produced by the pipeline
#   [0]             -> the word being selected (here the first one)
#   ["embeddings"]  -> the list holding that word's embedding values
first_row = Cord_Spark_dataframe.head()
first_word_annotation = first_row["embeddings"][0]
first_word_annotation["embeddings"]

[-1.4027656316757202,
 0.3348509669303894,
 -0.2293238341808319,
 0.09394717216491699,
 0.5936752557754517,
 -1.7199372053146362,
 0.2005765736103058,
 1.2554306983947754,
 -0.021653443574905396,
 1.2954121828079224,
 -0.7519142627716064,
 0.5978223085403442,
 0.031435105949640274,
 -0.5533902645111084,
 0.8710290789604187,
 -1.5815132856369019,
 -0.2357737421989441,
 -0.3782987594604492,
 1.4824138879776,
 0.3548561632633209,
 0.5365773439407349,
 0.7673106789588928,
 -0.33705300092697144,
 -1.561139702796936,
 -0.9025073051452637,
 0.8358224630355835,
 -1.2630023956298828,
 0.5873980522155762,
 0.8184084892272949,
 -0.09177777171134949,
 -0.9518206119537354,
 -1.7539935111999512,
 -1.1837834119796753,
 -0.1289459764957428,
 1.3897072076797485,
 0.043388646095991135,
 -0.36325299739837646,
 0.9033586978912354,
 -0.41066133975982666,
 -0.2061094343662262,
 0.4144718050956726,
 0.15246830880641937,
 0.6039539575576782,
 0.4591057002544403,
 1.1302857398986816,
 0.07192657887935638,
 -0.