<a href="https://colab.research.google.com/github/hasanabbas21/spark-nlp/blob/main/QuickTest-ADR-50recordDataSet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import json

from google.colab import files

license_keys = files.upload()

with open(list(license_keys.keys())[0]) as f:
    license_keys = json.load(f)

Saving workshop_license_keys_365.json to workshop_license_keys_365 (1).json


In [8]:
%%capture
for k,v in license_keys.items(): 
    %set_env $k=$v

!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp-workshop/master/jsl_colab_setup.sh
!bash jsl_colab_setup.sh

! pip install spark-nlp-display

In [12]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

params = {
    "spark.driver.memory":"16G",
    "spark.kryoserializer.buffer.max":"2000M",
    "spark.driver.maxResultSize":"2000M",
    "gpu":True
      }

spark = sparknlp_jsl.start(license_keys['SECRET'],params=params)

print ("Spark NLP Version :", sparknlp.version())
print ("Spark NLP_JSL Version :", sparknlp_jsl.version())

Spark NLP Version : 3.0.1
Spark NLP_JSL Version : 3.0.1


In [13]:
from pyspark.ml import Pipeline

from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

In [14]:
assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")

In [15]:
sentenceDetector = SentenceDetectorDLModel.pretrained("sentence_detector_dl_healthcare","en","clinical/models")\
        .setInputCols(["document"])\
        .setOutputCol("sentence")

sentence_detector_dl_healthcare download started this may take some time.
Approximate size to download 363.9 KB
[OK!]


In [16]:
tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

In [50]:
bert_sent = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_768")\
 .setInputCols(["document"])\
 .setOutputCol("sentence_embeddings")

classsifierdl = ClassifierDLApproach()\
  .setInputCols(["sentence_embeddings"])\
  .setOutputCol("class")\
  .setLabelColumn("label")\
  .setMaxEpochs(100)\
  .setBatchSize(10)\
  .setLr(1e-4)\
  .setOutputLogsPath('./adr_logs')\
  .setEnableOutputLogs(True)

bert_clf_pipeline = Pipeline(
    stages = [
        assembler,
        bert_sent,
        classsifierdl
    ])

sent_small_bert_L2_768 download started this may take some time.
Approximate size to download 139.6 MB
[OK!]


In [51]:
import pandas as pd

ade_df = pd.read_csv('sample_ADE_dataset.csv')
ade_df.columns=['text', 'label']

In [52]:
spark_df = spark.createDataFrame(ade_df)
spark_df.show(5)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Do U know what Me...|false|
|# hypercholestero...| true|
|Her weight, respi...|false|
|* DM - Pt had sev...| true|
|We report the cas...| true|
+--------------------+-----+
only showing top 5 rows



In [53]:
from pyspark.sql.functions import *
spark_df = spark_df.withColumn("label", col("label").cast(StringType()))
spark_df.show(4)

+--------------------+-----+
|                text|label|
+--------------------+-----+
|Do U know what Me...|false|
|# hypercholestero...| true|
|Her weight, respi...|false|
|* DM - Pt had sev...| true|
+--------------------+-----+
only showing top 4 rows



In [54]:
spark_df.count()

50

In [55]:
(trainingData, testData) = spark_df.randomSplit([0.8, 0.2], seed = 100)


In [56]:
bert_clf_pipeline = bert_clf_pipeline.fit(trainingData)


In [64]:
pred = bert_clf_pipeline.transform(trainingData)

pred.select('label','document',"class.result").show(10, truncate=100)

pred_pd = pred.select('label','document',"class.result").toPandas()

pred_pd['result'] = pred_pd['result'].apply(lambda x : x[0])

+-----+----------------------------------------------------------------------------------------------------+-------+
|label|                                                                                            document| result|
+-----+----------------------------------------------------------------------------------------------------+-------+
| true|[{document, 0, 184, # Maculopapular rash: over extremities, chest and back, thought [**1-14**] zo...| [true]|
| true|[{document, 0, 102, # hypercholesterol: Because of elevated CKs (peaked at 819) the patient's Sim...| [true]|
| true|[{document, 0, 66, # thrombocytopenia: Secondary to chemotherapy and MDS/AML concerns., {sentence...| [true]|
|false|[{document, 0, 206, - Per oral maxillofacial surgery there is no evidence of a current dental abc...| [true]|
|false|[{document, 0, 76, 10 . She was left on prednisone 20mg qd due to high sensitivity to her donor.,...|[false]|
|false|[{document, 0, 88, 2 . Calcipotriene 0.005% Cream Sig: On

In [67]:
from sklearn.metrics import classification_report, confusion_matrix

print (classification_report(pred_pd['result'], pred_pd['label']))

print(confusion_matrix(pred_pd['result'], pred_pd['label']))

              precision    recall  f1-score   support

       false       0.79      1.00      0.88        15
        true       1.00      0.85      0.92        26

    accuracy                           0.90        41
   macro avg       0.89      0.92      0.90        41
weighted avg       0.92      0.90      0.90        41

[[15  0]
 [ 4 22]]
