### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

In [None]:
# Notebook-side setup: import the ADS SDK and PySpark, then tell ADS to
# authenticate with the "resource_principal" method.
import ads
import pyspark
ads.set_auth("resource_principal")

In [None]:
# Load the sparkmagic df_magics IPython extension.
# NOTE(review): presumably this is what registers the %create_session,
# %%spark and %stop_session magics used in the cells below — confirm.
%load_ext sparkmagic.df_magics

In [None]:
#conda install -c johnsnowlabs spark-nlp

In [None]:
# Create a session (-l python) from the JSON configuration passed with -c:
# Spark 3.2.1, a VM.Standard2.1 driver, 4 VM.Standard2.1 executors,
# resource-principal auth, a conda archive supplied through spark.archives, and
# the Spark NLP 4.1.0 package requested through spark.jars.packages.
# NOTE(review): the compartment OCID and the bucket/namespace names are
# hard-coded here — confirm they match the target tenancy before running.
%create_session -l python -c '{"compartmentId":"ocid1.compartment.oc1..aaaaaaaalcio324mqxi6egudwmc2wzix3yclcysmmji4cggvnj4b5timvw2q", \
"displayName":"spark-nlp-project",\
"sparkVersion":"3.2.1", \
"language":"PYTHON", \
"type": "SESSION",\
"driverShape":"VM.Standard2.1", \
"executorShape":"VM.Standard2.1",\
"numExecutors":4,\
"dataflow.auth":"resource_principal",\
"configuration": {"spark.archives":"oci://conda-env-ds@bigdatadatasciencelarge/conda_environments/cpu/pyspark_30_and_data_flow/5.0/pyspark30_p37_cpu_v5#conda",\
"spark.jars.ivy":"/opt/spark/work-dir/conda/.ivy2",\
"spark.jars.packages": "com.johnsnowlabs.nlp:spark-nlp_2.12:4.1.0",\
"fs.oci.client.custom.authenticator":"com.oracle.bmc.hdfs.auth.ResourcePrincipalsCustomAuthenticator", \
"spark.jsl.settings.pretrained.cache_folder": "oci://conda-env-ds@bigdatadatasciencelarge/cachedmodels/"}\
}'

In [None]:
%%spark 

# Bring the Spark NLP base classes, annotators and PretrainedPipeline into
# the session namespace.
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Call sparknlp.start(), keep the returned session as `spark`, and print the
# Spark NLP and Spark versions as a quick sanity check of the environment.
spark = sparknlp.start()
print(sparknlp.version())
print(spark.version)

# Instantiate the CoNLL helper from sparknlp.training.
# NOTE(review): `conll` is not used by any other visible cell.
from sparknlp.training import CoNLL
conll = CoNLL()

In [None]:
%%spark 

# Bring the Spark NLP base classes, annotators and PretrainedPipeline into
# the session namespace.
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

# Start a SparkSession with Spark NLP.
# start() accepts three parameters: gpu, m1, and memory.
#   sparknlp.start(gpu=True)      starts the session with GPU support
#   sparknlp.start(m1=True)       starts the session with macOS M1 support
#   sparknlp.start(memory="16G")  changes the default driver memory of the SparkSession
spark = sparknlp.start()


# Build the English 'explain_document_dl' pipeline, pointing disk_location at
# a directory under the conda work dir.
# NOTE(review): presumably disk_location makes the pipeline load from that
# local copy instead of downloading it — confirm against the Spark NLP docs.
pipeline = PretrainedPipeline('explain_document_dl', lang='en', disk_location="/opt/spark/work-dir/conda/pretrained/explain_document_dl_en_3.1.3_3.0_1631046343759/")
#pipeline = PretrainedPipeline.load('oci://sparknlp-models@bigdatadatasciencelarge/')
# Your testing dataset
text = """
Lawrence Joseph Ellison (born August 17, 1944) is an American business magnate and investor who is the co-founder, 
executive chairman, chief technology officer (CTO) and former chief executive officer (CEO) of the 
American computer technology company Oracle Corporation.[2] As of September 2022, he was listed by 
Bloomberg Billionaires Index as the ninth-wealthiest person in the world, with an estimated 
fortune of $93 billion.[3] Ellison is also known for his 98% ownership stake in Lanai, 
the sixth-largest island in the Hawaiian Archipelago.[4]
"""

# Run the pipeline over the sample text; the result is accessed by string
# keys below.
result = pipeline.annotate(text)

# Print the keys present in the result, i.e. what the pipeline produced.
print(list(result.keys()))

# Print the value stored under the 'entities' key.
print(result['entities'])

In [None]:
%%spark
import os

# List the entries of the 'metadata' directory inside the pretrained pipeline
# folder, to check that the pipeline files are present at that path.
os.listdir("/opt/spark/work-dir/conda/pretrained/explain_document_dl_en_3.1.3_3.0_1631046343759/metadata")

In [None]:
%%spark

import subprocess

# Report filesystem usage in human-readable units.
# The original call, subprocess.run(["df, "-sh"]), had an unterminated string
# literal ("df was missing its closing quote), so the cell failed with a
# SyntaxError. In addition, "-s" is a `du` flag that `df` does not accept, so
# plain "-h" is passed here. The argument-list form keeps the command out of
# a shell.
subprocess.run(["df", "-h"])

In [None]:
# NOTE(review): presumably ends the session started with %create_session so
# its driver and executors are released — confirm against the sparkmagic docs.
%stop_session

In [None]:
#pipeline = PretrainedPipeline('explain_document_dl', lang='en', disk_location='oci://sparknlp-models@bigdatadatasciencelarge/')