In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash 

--2021-10-24 15:47:03--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-10-24 15:47:03--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-10-24 15:47:03--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [None]:
import sparknlp
# Start the Spark session through Spark NLP's helper.
spark = sparknlp.start()

# Report the library versions in use, one per line.
version_report = [
    "Spark NLP version: {}".format(sparknlp.version()),
    "Apache Spark version: {}".format(spark.version),
]
for report_line in version_report:
    print(report_line)

Spark NLP version: 3.3.1
Apache Spark version: 3.0.3


In [None]:
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

import pandas as pd 

In [None]:
# Define the document assembler: reads the raw `text` column and emits a
# `document` annotation, using the 'shrink' cleanup mode.
document = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
    .setCleanupMode('shrink')
)

# Define the sentence detector; with explodeSentences enabled each detected
# sentence is placed on its own output row.
sentence = (
    SentenceDetector()
    .setInputCols(['document'])
    .setOutputCol('sentence')
    .setExplodeSentences(True)
)

# Define the tokenizer over the detected sentences; 'e-mail' is registered as
# an exception so it is kept as a single token.
tokenizer = (
    Tokenizer()
    .setInputCols(['sentence'])
    .setOutputCol('token')
    .setExceptions(['e-mail'])
)

# Define the word embeddings (pretrained multilingual GloVe 840B, 300d).
# They read `sentence` rather than `document` because the tokens were produced
# from `sentence`; this keeps token grouping aligned with sentence boundaries.
embeddings = (
    WordEmbeddingsModel.pretrained("glove_840B_300", "xx")
    .setInputCols(["sentence", "token"])
    .setOutputCol("embeddings")
)

# Define the pretrained Bengali NER model, fed from the same `sentence` column
# as the tokenizer and the embeddings so all stages agree on sentence spans.
ner = (
    NerDLModel.pretrained("ner_jifs_glove_840B_300d", "bn")
    .setInputCols(["sentence", "token", "embeddings"])
    .setOutputCol("ner")
)

# Define the pipeline; stage order follows the column dependencies:
# text -> document -> sentence -> token -> embeddings -> ner.
pipeline = Pipeline(
    stages=[
        document,
        sentence,
        tokenizer,
        embeddings,
        ner,
    ]
)


glove_840B_300 download started this may take some time.
Approximate size to download 2.3 GB
[OK!]
ner_jifs_glove_840B_300d download started this may take some time.
Approximate size to download 16.7 MB
[OK!]


In [None]:
# One-row DataFrame with a single Bengali sentence in the `text` column.
# Note: the string ends with an apostrophe, which is part of the input data.
example = spark.createDataFrame(
    [
        ["৯০ এর দশকের শুরুর দিকে বৃহৎ আকারে মার্কিন যুক্তরাষ্ট্রে এর প্রয়োগের প্রক্রিয়া শুরু হয়'"],
    ],
    schema=["text"],
)

# Fit the pipeline on the example frame to obtain the model used for inference.
model = pipeline.fit(example)

In [None]:
# Annotate the example frame with the fitted pipeline model.
result = model.transform(dataset=example)

# Preview every annotation column; long cell values are shown truncated.
result.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|            document|            sentence|               token|          embeddings|                 ner|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|৯০ এর দশকের শুরুর...|[[document, 0, 88...|[[document, 0, 88...|[[token, 0, 1, ৯০...|[[word_embeddings...|[[named_entity, 0...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [None]:
# Print the text of each detected sentence without truncating the column.
(
    result
    .select('sentence.result')
    .show(truncate=False)
)

+-------------------------------------------------------------------------------------------+
|result                                                                                     |
+-------------------------------------------------------------------------------------------+
|[৯০ এর দশকের শুরুর দিকে বৃহৎ আকারে মার্কিন যুক্তরাষ্ট্রে এর প্রয়োগের প্রক্রিয়া শুরু হয়']|
+-------------------------------------------------------------------------------------------+



In [None]:
# Print the token strings produced by the tokenizer without truncating the column.
(
    result
    .select('token.result')
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------+
|result                                                                                                    |
+----------------------------------------------------------------------------------------------------------+
|[৯০, এর, দশকের, শুরুর, দিকে, বৃহৎ, আকারে, মার্কিন, যুক্তরাষ্ট্রে, এর, প্রয়োগের, প্রক্রিয়া, শুরু, হয়, ']|
+----------------------------------------------------------------------------------------------------------+



In [None]:
# Print the `result` field of the embeddings annotations without truncation.
# NOTE(review): this field holds the token text, not the vectors; the vectors
# presumably live in the annotation's `embeddings` field -- confirm if needed.
(
    result
    .select('embeddings.result')
    .show(truncate=False)
)

+----------------------------------------------------------------------------------------------------------+
|result                                                                                                    |
+----------------------------------------------------------------------------------------------------------+
|[৯০, এর, দশকের, শুরুর, দিকে, বৃহৎ, আকারে, মার্কিন, যুক্তরাষ্ট্রে, এর, প্রয়োগের, প্রক্রিয়া, শুরু, হয়, ']|
+----------------------------------------------------------------------------------------------------------+



In [None]:
# Print the predicted NER tag for each token without truncating the column.
(
    result
    .select('ner.result')
    .show(truncate=False)
)

+-----------------------------------------------------+
|result                                               |
+-----------------------------------------------------+
|[O, O, O, O, O, O, O, B-LOC, I-LOC, O, O, O, O, O, O]|
+-----------------------------------------------------+

