### OCI Data Science - Useful Tips
<details>
<summary><font size="2">Check for Public Internet Access</font></summary>

```python
import requests
response = requests.get("https://oracle.com")
assert response.status_code==200, "Internet connection failed"
```
</details>
<details>
<summary><font size="2">Helpful Documentation </font></summary>
<ul><li><a href="https://docs.cloud.oracle.com/en-us/iaas/data-science/using/data-science.htm">Data Science Service Documentation</a></li>
<li><a href="https://docs.cloud.oracle.com/iaas/tools/ads-sdk/latest/index.html">ADS documentation</a></li>
</ul>
</details>
<details>
<summary><font size="2">Typical Cell Imports and Settings for ADS</font></summary>

```python
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import logging
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)

import ads
from ads.dataset.factory import DatasetFactory
from ads.automl.provider import OracleAutoMLProvider
from ads.automl.driver import AutoML
from ads.evaluations.evaluator import ADSEvaluator
from ads.common.data import ADSData
from ads.explanations.explainer import ADSExplainer
from ads.explanations.mlx_global_explainer import MLXGlobalExplainer
from ads.explanations.mlx_local_explainer import MLXLocalExplainer
from ads.catalog.model import ModelCatalog
from ads.common.model_artifact import ModelArtifact
```
</details>
<details>
<summary><font size="2">Useful Environment Variables</font></summary>

```python
import os
print(os.environ["NB_SESSION_COMPARTMENT_OCID"])
print(os.environ["PROJECT_OCID"])
print(os.environ["USER_OCID"])
print(os.environ["TENANCY_OCID"])
print(os.environ["NB_REGION"])
```
</details>

### Install the Hugging Face `datasets` library

In [1]:
%%capture
pip install datasets

### Load the dataset from the Hugging Face Hub

In [2]:
from datasets import load_dataset
dataset = load_dataset("TypicaAI/pii-masking-60k_fr")
dataset

DatasetDict({
    train: Dataset({
        features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text'],
        num_rows: 61918
    })
})

In [4]:
dataset['train'][0]

{'masked_text': "Cher [PREFIX_1] [LASTNAME_1], nous organisons un programme d'alphabétisation à [CITY_1] en collaboration avec [COMPANYNAME_1]. Contactez [EMAIL_1] pour plus de détails.",
 'unmasked_text': "Cher Ms. Keebler, nous organisons un programme d'alphabétisation à West Shemar en collaboration avec Morissette - Russel. Contactez Hulda44@yahoo.com pour plus de détails.",
 'privacy_mask': "{'[PREFIX_1]': 'Ms.', '[LASTNAME_1]': 'Keebler', '[CITY_1]': 'West Shemar', '[COMPANYNAME_1]': 'Morissette - Russel', '[EMAIL_1]': 'Hulda44@yahoo.com'}",
 'span_labels': "[[0, 5, 'O'], [5, 8, 'PREFIX_1'], [8, 9, 'O'], [9, 16, 'LASTNAME_1'], [16, 67, 'O'], [67, 78, 'CITY_1'], [78, 101, 'O'], [101, 120, 'COMPANYNAME_1'], [120, 132, 'O'], [132, 149, 'EMAIL_1'], [149, 171, 'O']]",
 'bio_labels': ['O',
  'B-PREFIX',
  'I-PREFIX',
  'B-LASTNAME',
  'I-LASTNAME',
  'I-LASTNAME',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-CITY',
  'I-CITY',
  'I-CITY

# Generate OCI Data Labeling dataset 

This step uploads one text file per record, plus a JSONL metadata file describing the annotations, to an Object Storage bucket. Import format = JSONL Consolidated.

In [10]:
# Work on the first 3,000 rows of the training split only
small_dataset = dataset["train"].select(range(3000))
small_dataset

Dataset({
    features: ['masked_text', 'unmasked_text', 'privacy_mask', 'span_labels', 'bio_labels', 'tokenised_text'],
    num_rows: 3000
})

In [11]:
import ast
import io
import json
import os
import tempfile

import oci
from datasets import load_dataset


# Destination of the generated files: the bucket and the folder prefix inside it
bucket_name = "book_oci_nlp_labeling_bucket"
base_folder = "taln_pii_cs_ds_060324_3000/"

# Build the Object Storage client, signing requests with the notebook
# session's resource principal (hence the empty config dict)
signer = oci.auth.signers.get_resource_principals_signer()
object_storage_client = oci.object_storage.ObjectStorageClient(
    config={},
    signer=signer,
)

# Ask the service for the Object Storage namespace instead of hard-coding it
namespace = object_storage_client.get_namespace().data

# Function to strip a trailing numeric index such as "_1" from labels
def strip_label(label):
    """Return ``label`` without its trailing numeric index.

    Span labels carry an index suffix (``"EMAIL_1"``, ``"LASTNAME_1"``);
    this returns the bare entity type (``"EMAIL"``, ``"LASTNAME"``).

    Only a suffix made entirely of digits is removed, so a label that
    contains underscores but has no index (``"JOB_TITLE"``) is returned
    unchanged rather than being cut at its last underscore.

    Args:
        label: The label string, with or without a ``_<digits>`` suffix.

    Returns:
        The label with the numeric suffix removed, or ``label`` itself
        when there is no such suffix.
    """
    base, sep, suffix = label.rpartition("_")
    # `base` must be non-empty so that a degenerate "_1" is not turned
    # into an empty label name.
    if sep and base and suffix.isdigit():
        return base
    return label

# Unique (suffix-stripped) label names seen so far, and one annotation record
# per uploaded text file
labels_set = set()
annotations_list = []

# exclude labels not relevant to our use-case
# (built once here, because the set is the same for every row)
relevant_labels = {'EMAIL_1', 'CREDITCARDNUMBER_1', 'MIDDLENAME_1', 'FIRSTNAME_1', 'LASTNAME_1', 'AGE_1',
                   'PHONENUMBER_1', 'STREET_1', 'ZIPCODE_1'}

for idx, item in enumerate(small_dataset):
    text = item['unmasked_text']

    # 'span_labels' holds the string form of a list of [start, end, label]
    # triples. ast.literal_eval only parses Python literals, so unlike eval()
    # it cannot execute code embedded in downloaded data.
    span_labels = item['span_labels']
    if isinstance(span_labels, str):
        span_labels = ast.literal_eval(span_labels)

    # Prepare annotations for this row
    entities = []
    for start, end, label in span_labels:
        if label not in relevant_labels:
            continue
        label = strip_label(label)
        labels_set.add(label)  # Add to the set of unique labels
        entities.append({
            "entityType": "TEXTSELECTION",
            "labels": [{"label_name": label}],
            "textSpan": {"offset": start, "length": end - start}
        })

    # Rows without any relevant entity are neither uploaded nor listed in the
    # annotations
    if not entities:
        continue

    file_name = f"taln_pii_case_study_{idx}.txt"

    annotations_list.append({
        "sourceDetails": {"path": file_name},
        "annotations": [{"entities": entities}]
    })

    # Upload the text to OCI bucket. The body is sent straight from memory as
    # UTF-8 bytes: the encoding no longer depends on the platform default, and
    # there is no temporary file left behind if the upload raises.
    record_filename = f"{base_folder}{file_name}"
    object_storage_client.put_object(namespace,
                                     bucket_name,
                                     record_filename,
                                     text.encode('utf-8'),
                                     content_type='text/plain')




# Prepare the dataset details JSON (written as the first line of the JSONL file)
dataset_details = {
    "displayName": "book_oci_nlp_labeling_ds_060324_3000",
    "description": "book_oci_nlp_labeling_ds",
    # Sorted so the metadata file is identical from run to run; iterating the
    # set directly gives no stable order.
    "labelsSet": [{"name": label} for label in sorted(labels_set)],
    "annotationFormat": "ENTITY_EXTRACTION",
    "datasetFormatDetails": {"formatType": "TEXT"}
}

# Metadata and annotations as JSONL string: one JSON document per line, the
# dataset details first, every line terminated by a newline
jsonl_data = ''.join(
    json.dumps(record) + '\n'
    for record in [dataset_details, *annotations_list]
)

# Upload the JSONL content to OCI bucket straight from memory; no temporary
# file is created, so nothing is left on disk if the upload fails
objStoreResp = object_storage_client.put_object(namespace,
                                                bucket_name,
                                                f"{base_folder}dataset_metadata.jsonl",
                                                jsonl_data.encode('utf-8'),
                                                content_type='application/json'  # Set the Content-Type for the object
                                               )

print(f'The Dataset {dataset_details["displayName"]} was created successfully in the bucket {bucket_name}')




The Dataset book_oci_nlp_labeling_ds_060324_3000 was created successfully in the bucket book_oci_nlp_labeling_bucket
