## This is a Colab notebook! Please download it and run it in Google Colab

In [None]:
# Retrieve user credentials through google.colab's auth helper.
from google.colab import auth
auth.authenticate_user()
# Only reached when authenticate_user() returned without raising.
print('Authenticated')

Authenticated


Enable Data Table Display using google.colab.data_table

In [None]:
# Load the google.colab.data_table IPython extension for this kernel session.
%load_ext google.colab.data_table

## Retrieving data from GCP BigQuery

In [None]:
# GCP proejct
project_id = 'nlp-332020'
project_number = '1054321893028'

%env GCLOUD_PROJECT=project_id

env: GCLOUD_PROJECT=project_id


In [None]:
# Authenticate only when the kernel is actually a Colab runtime, so the
# cell is a no-op everywhere else.
import sys

running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth
    auth.authenticate_user()

In [None]:
from google.cloud import bigquery

# BigQuery client bound to `project_id`; the query cells below run through it.
client = bigquery.Client(project=project_id)

In [None]:
# NOTE(review): sample_count is not referenced anywhere else in the visible
# notebook; kept so any external use keeps working.
sample_count = 2000

# Single definition of the source table so both queries stay in sync.
source_table = 'nlp-332020.readmission_dataset.cleaned_dataset'

row_count_query = f'''
  SELECT
    COUNT(*) as total
  FROM `{source_table}`'''

# Raw (r) string: the SQL below contains the regex r"\W", and in a non-raw
# Python string "\W" is an invalid escape sequence (a warning today, an error
# in future Python versions). The query text sent to BigQuery is unchanged.
#
# NOTE(review): DATE_DIFF(..., MONTH) counts calendar-month boundaries, not
# elapsed time, so "<= 6" is only approximately "within 6 months" - confirm
# that this is the intended readmission window.
readmission_query = rf'''
  SELECT
  text_new,
  CASE
  -- Readmitted within 6 months of last visit.
  -- There are weird cases in which ppl have identical entries except for
  -- some differences in the clinical notes. We shouldn't count those as readmits
    WHEN next_admission != Admittime AND DATE_DIFF(next_admission, Admittime, MONTH) <= 6 THEN 1
  ELSE
  0
END
  AS readmitted
FROM (
  SELECT
    * EXCEPT(TEXT),
    -- Spark DF's are weird about return chars
    -- This just makes it one long string and
    -- gets rid of non-letter/num chars
    REGEXP_REPLACE(TEXT, r"\W", " ") AS text_new,
    LEAD(Admittime) OVER(PARTITION BY SUBJECT_ID ORDER BY Admittime) AS next_admission
  FROM
    `{source_table}`
  WHERE
  -- Don't include ppl who died
    deathtime IS NULL)
'''

# Total number of rows in the table (before the deathtime filter applied below).
# .iloc[0] reads the single result row by position instead of by index label.
row_count = client.query(row_count_query).to_dataframe()["total"].iloc[0]

# One row per admission without a recorded death: cleaned note text plus a
# 0/1 readmission label.
df = client.query(readmission_query).to_dataframe()

print('Full dataset has %d rows' % row_count)

Full dataset has 63214 rows


In [None]:
# Check which container type to_dataframe() produced.
type(df)

pandas.core.frame.DataFrame

In [None]:
# (rows, columns) of the labelled result; rows with a deathtime were filtered out by the query.
df.shape

(56750, 2)

In [None]:
# Column names returned by the query's outer SELECT.
df.columns

Index(['text_new', 'readmitted'], dtype='object')

In [None]:
# Preview the first few rows (cleaned note text and readmission label).
df.head()

Unnamed: 0,text_new,readmitted
0,Admission Date 2170 4 4 Discharge ...,0
1,Admission Date 2139 7 26 Dischar...,0
2,Admission Date 2139 7 26 ...,0
3,Admission Date 2124 8 28 ...,0
4,Admission Date 2128 1 5 Discharg...,0


In [None]:
# For this step, you need a free trial of the John Snow Labs version (spark-nlp-jsl)
# of Spark NLP. While much of Spark NLP is open source, many of the healthcare-specific
# models are only accessible via spark-nlp-jsl. Sign up for an account and free trial
# at https://www.johnsnowlabs.com/. Step by step instructions found in the README.md
import json
import os

from google.colab import files

# files.upload() returns a dict of {filename: file content as bytes}.
uploaded_files = files.upload()
if not uploaded_files:
    raise ValueError("No license file was uploaded.")

# Parse the uploaded bytes directly instead of re-opening a file by name:
# when a file with the same name already exists, Colab saves the new upload
# under a de-duplicated name ("... (1).json"), so opening by the dict key
# can read a stale, previously uploaded license.
license_keys = json.loads(next(iter(uploaded_files.values())))

# Expose each license key/value pair as a notebook-level variable so that
# `$NAME` references in later `!` shell cells can be expanded by IPython.
globals().update(license_keys)

# Also export the pairs as environment variables.
# os.environ only accepts string values.
os.environ.update(license_keys)

Saving spark_nlp_for_healthcare_spark_ocr_3270.json to spark_nlp_for_healthcare_spark_ocr_3270 (1).json


In [None]:
# Installing pyspark and spark-nlp
# $PUBLIC_VERSION is expanded by IPython / the shell; presumably one of the
# keys of the license JSON loaded in the previous cell - TODO confirm.
! pip install --upgrade -q pyspark==3.1.2 spark-nlp==$PUBLIC_VERSION

# Installing Spark NLP Healthcare
# Pulled from the John Snow Labs package index; $JSL_VERSION and $SECRET are
# presumably also license JSON keys - TODO confirm.
! pip install --upgrade -q spark-nlp-jsl==$JSL_VERSION  --extra-index-url https://pypi.johnsnowlabs.com/$SECRET

# Installing Spark NLP Display Library for visualization
# NOTE(review): this package is not version-pinned.
! pip install -q spark-nlp-display

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spark-nlp-jsl 3.3.2 requires spark-nlp==3.3.2, but you have spark-nlp 3.2.1 which is incompatible.[0m


In [None]:
import json
import os
from pyspark.ml import Pipeline,PipelineModel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from sparknlp.annotator import *
from sparknlp_jsl.annotator import *
from sparknlp.base import *
import sparknlp_jsl
import sparknlp

import warnings
warnings.filterwarnings('ignore')

# Spark configuration overrides passed to the licensed session below.
params = {
    "spark.driver.memory": "16G",
    "spark.kryoserializer.buffer.max": "2000M",
    "spark.driver.maxResultSize": "2000M",
}

# Report the installed library versions before starting the session.
spark_nlp_version = sparknlp.version()
spark_nlp_jsl_version = sparknlp_jsl.version()
print("Spark NLP Version :", spark_nlp_version)
print("Spark NLP_JSL Version :", spark_nlp_jsl_version)

# Start the Spark NLP for Healthcare session using the secret from the
# license keys loaded earlier.
spark = sparknlp_jsl.start(license_keys['SECRET'], params=params)

# Bare expression so the notebook renders the session object.
spark

Spark NLP Version : 3.3.2
Spark NLP_JSL Version : 3.3.2


In [None]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

# Explicit schema for the pandas -> Spark conversion: the pandas index becomes
# an "index" column, followed by the two query columns. All fields are nullable.
schema_fields = [
    StructField("index", IntegerType(), True),
    StructField("text_new", StringType(), True),
    StructField("readmitted", IntegerType(), True),
]
mySchema = StructType(schema_fields)

# reset_index() turns the pandas index into a regular column so that it
# matches the first schema field.
indexed_pdf = df.reset_index()
spark_df = spark.createDataFrame(indexed_pdf, schema=mySchema)

In [None]:
# Confirm the Spark DataFrame picked up the explicit schema.
spark_df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- text_new: string (nullable = true)
 |-- readmitted: integer (nullable = true)



In [None]:
# Print a truncated preview of the Spark DataFrame.
spark_df.show()

+-----+--------------------+----------+
|index|            text_new|readmitted|
+-----+--------------------+----------+
|    0|Admission Date   ...|         0|
|    1|Admission Date   ...|         0|
|    2|Admission Date   ...|         0|
|    3|Admission Date   ...|         0|
|    4|Admission Date   ...|         0|
|    5|Admission Date   ...|         0|
|    6|Admission Date   ...|         0|
|    7|Admission Date   ...|         0|
|    8|Admission Date   ...|         0|
|    9|Admission Date   ...|         1|
|   10|Admission Date   ...|         0|
|   11|Admission Date   ...|         1|
|   12|        Date     ...|         0|
|   13|Admission Date   ...|         0|
|   14|Admission Date   ...|         0|
|   15|Admission Date   ...|         0|
|   16|Admission Date   ...|         1|
|   17|Admission Date   ...|         0|
|   18|Admission Date   ...|         0|
|   19|Admission Date   ...|         0|
+-----+--------------------+----------+
only showing top 20 rows



In [None]:
# 80/20 train/test split; the fixed seed keeps the split repeatable.
split_weights = [0.8, 0.2]
split_seed = 1234567
train, test = spark_df.randomSplit(split_weights, seed=split_seed)

In [None]:
# Make sure both numeric columns of the training split are integers, then
# print the resulting schema.
train = (
    train
    .withColumn("index", F.col("index").cast(IntegerType()))
    .withColumn("readmitted", F.col("readmitted").cast(IntegerType()))
)
train.printSchema()

root
 |-- index: integer (nullable = true)
 |-- text_new: string (nullable = true)
 |-- readmitted: integer (nullable = true)



In [None]:
# Make sure both numeric columns of the test split are integers, then
# print the resulting schema.
test = (
    test
    .withColumn("index", F.col("index").cast(IntegerType()))
    .withColumn("readmitted", F.col("readmitted").cast(IntegerType()))
)
test.printSchema()

root
 |-- index: integer (nullable = true)
 |-- text_new: string (nullable = true)
 |-- readmitted: integer (nullable = true)



In [None]:
# Readmission classifier pipeline:
#   raw note text -> tokens -> normalized tokens -> stop words removed
#   -> clinical word embeddings -> one averaged vector per note -> ClassifierDL

# Annotator that transforms a text column from dataframe into an Annotation ready for NLP.
# The output column is called "sentence", but the pipeline has no sentence
# splitter, so each annotation is the whole note.
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text_new")
    .setOutputCol("sentence")
    .setCleanupMode("shrink_full")
)

# Tokenizer splits words in a relevant format for NLP
tokenizer = (
    Tokenizer()
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# Normalize inputs
normalizer = (
    Normalizer()
    .setInputCols(["token"])
    .setOutputCol("normalized")
)

# Remove stop words
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(["normalized"])
    .setOutputCol("cleanTokens")
)

# Pretrained clinical word embeddings from the licensed "clinical/models" repository.
# (A BERT model such as "biobert_pubmed_base_cased" could be substituted here
# with the same input/output columns.)
word_embeddings = (
    WordEmbeddingsModel.pretrained("embeddings_clinical", "en", "clinical/models")
    .setInputCols(["sentence", "cleanTokens"])
    .setOutputCol("embeddings")
)

# Average the token vectors into a single vector per note.
# No explicit setStorageRef(): the previous hard-coded value
# ('biobert_pubmed_base_cased') was left over from the BERT variant and did
# not match the embeddings actually used above. Left unset, the reference is
# expected to be taken from the upstream embeddings column - TODO confirm
# against the installed Spark NLP version.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["sentence", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Trainable classifier on top of the note-level vectors; the target is the
# 0/1 "readmitted" column.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("class")
    .setLabelColumn("readmitted")
    .setMaxEpochs(10)
    .setBatchSize(12)
)

base_clf_pipeline = Pipeline(
    stages=[
        documentAssembler,
        tokenizer,
        normalizer,
        stopwords_cleaner,
        word_embeddings,
        embeddingsSentence,
        classsifierdl,
    ]
)

# Fits every stage on the training split (this trains the classifier).
base_clf_model = base_clf_pipeline.fit(train)

biobert_pubmed_base_cased download started this may take some time.
Approximate size to download 386.4 MB
[OK!]


In [None]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, auc, precision_recall_curve

# Label string the classifier uses for the positive ("readmitted") class.
POSITIVE_LABEL = "1"


def _positive_class_score(metadata, predicted_label):
    """Return a continuous score for the positive class of one prediction.

    Args:
        metadata: the `class.metadata` value of one row; expected to be a
            sequence whose first element maps label strings to confidence
            strings (assumed ClassifierDL output format - TODO confirm
            against the installed Spark NLP version).
        predicted_label: the hard 0/1 prediction for the same row.

    Returns:
        The confidence of the positive class as a float, or the hard label
        as a float when no confidence is available.
    """
    if metadata is not None and len(metadata) > 0 and POSITIVE_LABEL in metadata[0]:
        return float(metadata[0][POSITIVE_LABEL])
    return float(predicted_label)


# Score the held-out split and bring labels, predictions and prediction
# metadata back to pandas.
df_results = (
    base_clf_model.transform(test)
    .select("readmitted", "text_new", "class.result", "class.metadata")
    .toPandas()
)

# "result" arrives as a list with one label string per row; keep it as an int.
df_results["result"] = df_results["result"].apply(lambda x: int(x[0]))

# Ranking metrics (ROC-AUC, precision-recall curve) need a continuous score.
# Feeding them the hard 0/1 predictions collapses each curve to a single
# threshold, so use the positive-class confidence instead.
df_results["score"] = [
    _positive_class_score(meta, label)
    for meta, label in zip(df_results["metadata"], df_results["result"])
]

precision, recall, thresholds = precision_recall_curve(df_results.readmitted, df_results.score)

# Threshold-based metrics use the hard predictions ...
print(classification_report(df_results.readmitted, df_results.result))
print(accuracy_score(df_results.readmitted, df_results.result))
# ... while ROC-AUC and PR-AUC use the continuous scores.
print(roc_auc_score(df_results.readmitted, df_results.score))
print(auc(recall, precision))