## Import all the necessary modules and global variables

In [3]:
from docai_module.config import *

# Part 1: Natural Language API

The Cloud Natural Language API lets you extract entities from text, perform sentiment and syntactic analysis, and classify text into categories.  
In this lab, you learn how to use the Natural Language API to analyze entities, sentiment, and syntax.

## Natural Language features
The Natural Language API has several methods for performing analysis and annotation on your text. Each level of analysis provides valuable information for language understanding. These methods are listed below:

 - **Sentiment analysis** inspects the given text and identifies the prevailing emotional opinion within the text, especially to determine a writer's attitude as positive, negative, or neutral. Sentiment analysis is performed through the analyzeSentiment method.

 - **Entity analysis** inspects the given text for known entities (proper nouns such as public figures and landmarks, and common nouns such as restaurant or stadium) and returns information about those entities. Entity analysis is performed with the analyzeEntities method.

 - **Entity sentiment analysis** inspects the given text for known entities (proper nouns and common nouns), returns information about those entities, and identifies the prevailing emotional opinion of the entity within the text, especially to determine a writer's attitude toward the entity as positive, negative, or neutral. Entity sentiment analysis is performed with the analyzeEntitySentiment method.

 - **Syntactic analysis** extracts linguistic information, breaking up the given text into a series of sentences and tokens (generally, word boundaries), providing further analysis on those tokens. Syntactic Analysis is performed with the analyzeSyntax method.

 - **Content classification** analyzes text content and returns a content category for the content. Content classification is performed by using the classifyText method.

Each API call also detects and returns the language, if a language is not specified by the caller in the initial request.

Additionally, if you wish to perform several natural language operations on given text using only one API call, the annotateText request can also be used to perform sentiment analysis and entity analysis.

## Analyze Entities

This function receives a text and sends it to the API to perform **entity extraction**.

In [11]:
def analyze_entities(text_content: str, language: str = "pt") -> None:
    """
    Extract entities from a string with the Natural Language API and print them.

    For every entity in the response this prints its representative name,
    type, salience score, metadata key/value pairs and mentions, followed by
    the language reported in the response.

    Args:
      text_content: The text content to analyze.
      language: Language code sent with the document. Defaults to "pt", the
        value this function previously hard-coded. Pass an empty string to
        leave the language out of the request so the API detects it.

    Returns:
      None. Everything is printed.
    """
    # Available types: PLAIN_TEXT, HTML
    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
    }
    # Only send a language when the caller supplied one.
    if language:
        document["language"] = language

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = NLP_CLIENT.analyze_entities(
        request={"document": document, "encoding_type": encoding_type}
    )

    # Loop through entities returned from the API
    for entity in response.entities:
        print(f"Representative name for the entity: {entity.name}")

        # Get entity type, e.g. PERSON, LOCATION, ADDRESS, NUMBER, et al
        print(f"Entity type: {language_v1.Entity.Type(entity.type_).name}")

        # Get the salience score associated with the entity in the [0, 1.0] range
        print(f"Salience score: {entity.salience}")

        # Loop over the metadata associated with entity. For many known entities,
        # the metadata is a Wikipedia URL (wikipedia_url) and Knowledge Graph MID (mid).
        # Some entity types may have additional metadata, e.g. ADDRESS entities
        # may have metadata for the address street_name, postal_code, et al.
        for metadata_name, metadata_value in entity.metadata.items():
            print(f"{metadata_name}: {metadata_value}")

        # Loop over the mentions of this entity in the input document.
        # The API currently supports proper noun mentions.
        for mention in entity.mentions:
            print(f"Mention text: {mention.text.content}")

            # Get the mention type, e.g. PROPER for proper noun
            print(
                f"Mention type: {language_v1.EntityMention.Type(mention.type_).name}"
            )

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(f"Language of the text: {response.language}")

In [12]:
# Run entity extraction on a short Portuguese sentence; the function prints
# each entity it receives instead of returning anything.
analyze_entities('Osasco, um lugar legal para se morar')

Representative name for the entity: Osasco
Entity type: LOCATION
Salience score: 0.6129672527313232
wikipedia_url: https://fr.wikipedia.org/wiki/Microregion_d'Osasco
mid: /g/122n5xvw
Mention text: Osasco
Mention type: PROPER
Representative name for the entity: lugar
Entity type: OTHER
Salience score: 0.38703271746635437
Mention text: lugar
Mention type: COMMON
Representative name for the entity: um
Entity type: NUMBER
Salience score: 0.0
value: 1
Mention text: um
Mention type: TYPE_UNKNOWN
Language of the text: pt


#### Sentiment analysis

In [15]:
def analyze_sentiment(text_content: str, language: str = "pt") -> None:
    """
    Analyze the sentiment of a string with the Natural Language API and print it.

    Prints the document-level score and magnitude, then the text, score and
    magnitude of every sentence in the response, and finally the language
    reported in the response.

    Args:
      text_content: The text content to analyze.
      language: Language code sent with the document. Defaults to "pt", the
        value this function previously hard-coded. Pass an empty string to
        leave the language out of the request so the API detects it.

    Returns:
      None. Everything is printed.
    """
    # Available types: PLAIN_TEXT, HTML
    document = {
        "content": text_content,
        "type_": language_v1.Document.Type.PLAIN_TEXT,
    }
    # Only send a language when the caller supplied one.
    if language:
        document["language"] = language

    # Available values: NONE, UTF8, UTF16, UTF32
    encoding_type = language_v1.EncodingType.UTF8

    response = NLP_CLIENT.analyze_sentiment(
        request={"document": document, "encoding_type": encoding_type}
    )

    # Get overall sentiment of the input document
    overall = response.document_sentiment
    print(f"Document sentiment score: {overall.score}")
    print(f"Document sentiment magnitude: {overall.magnitude}")

    # Get sentiment for all sentences in the document
    for sentence in response.sentences:
        print(f"Sentence text: {sentence.text.content}")
        print(f"Sentence sentiment score: {sentence.sentiment.score}")
        print(f"Sentence sentiment magnitude: {sentence.sentiment.magnitude}")

    # Get the language of the text, which will be the same as
    # the language specified in the request or, if not specified,
    # the automatically-detected language.
    print(f"Language of the text: {response.language}")

In [17]:
# Run sentiment analysis on a Portuguese sentence; the document-level and
# per-sentence scores are printed by the function.
analyze_sentiment('hoje não é um bom dia para caminhar')

Document sentiment score: -0.6000000238418579
Document sentiment magnitude: 0.6000000238418579
Sentence text: hoje não é um bom dia para caminhar
Sentence sentiment score: -0.6000000238418579
Sentence sentiment magnitude: 0.6000000238418579
Language of the text: pt


# Part 2: AutoML Natural Language

AutoML Natural Language uses machine learning to analyze the structure and meaning of documents. You train a custom machine learning model to classify documents, extract information, or understand the sentiment of authors.

 - A **classification model** analyzes a document and returns a list of content categories that apply to the text found in the document.

 - An **entity extraction model** inspects a document for known entities referenced in the document and labels those entities in the text.

 - A **sentiment analysis model** inspects a document and identifies the prevailing emotional opinion within it, especially to determine a writer's attitude as positive, negative, or neutral.

Basic classification, entity extraction, and sentiment analysis are available through the Cloud Natural Language API. AutoML Natural Language enables you to define custom classification categories, entities, and sentiment scores that are relevant to your application.

#### Enable the AutoML Natural Language API

Visit the following link to enable the AutoML Natural Language API, just like you did for Natural Language API:

https://console.cloud.google.com/flows/enableapi?apiid=storage-component.googleapis.com,automl.googleapis.com,storage-api.googleapis.com&redirect=https://console.cloud.google.com&_ga=2.56437803.1602512825.1610975711-1349947163.1610541600

#### Document Classification

This tutorial demonstrates how to create a custom model for classifying content using AutoML Natural Language. The application trains a custom model using a corpus of crowd-sourced "happy moments" from the Kaggle open-source dataset HappyDB. The resulting model classifies happy moments into categories reflecting the causes of happiness.

#### ATTENTION!

Importing and training a model in AutoML can take hours or days.  
To simplify and speed up this process, the dataset was reduced to a very small sample. This means that the performance of the model **CAN'T** be taken into consideration, given how few examples are provided.

Before creating the model in AutoML, let's explore the data:

In [79]:
# Read the whole CSV into memory as a list of lines (line endings are kept).
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open('./files/happiness.csv', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [82]:
# Show the first row of the file: the text followed by its label.
print(lines[0])

I went on a successful date with someone I felt sympathy and connection with.,affection



In this case, we want to create a Classification model (single label) which takes as inputs a text and its label.  
[AutoML expects these inputs in the following format](https://cloud.google.com/natural-language/automl/docs/prepare):

[TRAIN, VALIDATION, TEST],your text,[label]  
Example: TRAIN, I really want to go sky., travel

The first token, TRAIN, indicates that the text will be used in the training phase. If you don't include this token, AutoML will split your dataset automatically.

In [50]:
# Location passed to `gsutil mb -l` when the bucket is created below.
REGION_NAME='us-central1'

Let's create a bucket in Google Cloud Storage to upload our training data.

In [51]:
# Create a bucket in GCS to store the documents.
# The bucket is named "<PROJECT_ID>-lcm" and is created in REGION_NAME.
# NOTE(review): PROJECT_ID is not defined in any visible cell; presumably it
# comes from the star import of docai_module.config - confirm.
!gsutil mb -p $PROJECT_ID -c regional -l $REGION_NAME gs://$PROJECT_ID-lcm/

Creating gs://cool-ml-demos-lcm/...


In [52]:
# Copy the local Happiness dataset (./files/happiness.csv) to the
# "<PROJECT_ID>-lcm" bucket.
!gsutil cp ./files/happiness.csv gs://$PROJECT_ID-lcm/

Copying gs://cloud-ml-data/NL-classification/happiness.csv [Content-Type=text/csv]...
/ [1 files][  1.3 MiB/  1.3 MiB]                                                
Operation completed over 1 objects/1.3 MiB.                                      


#### Step 1 - Create the dataset in AutoML

A dataset starts out as an empty container into which we upload the documents later used to train the model.

In [60]:
def create_dataset(project_id, display_name):
    """Create an AutoML text-classification dataset and return its id.

    The dataset is created under `projects/<project_id>/locations/us-central1`
    with the MULTICLASS classification type. The full resource name and the
    id (last path segment of that name) are printed.

    Args:
      project_id: Project used to build the parent location path.
      display_name: Display name given to the new dataset.

    Returns:
      The dataset id, i.e. the last "/"-separated segment of the dataset name.
    """
    # [START automl_language_text_classification_create_dataset]
    from google.cloud import automl

    automl_client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    parent_location = f"projects/{project_id}/locations/us-central1"

    # Classification types:
    #   MultiLabel: Multiple labels are allowed for one example.
    #   MultiClass: At most one label is allowed per example.
    dataset_spec = automl.Dataset(
        display_name=display_name,
        text_classification_dataset_metadata=automl.TextClassificationDatasetMetadata(
            classification_type=automl.ClassificationType.MULTICLASS
        ),
    )

    # Create the dataset in the region; result() yields the created dataset.
    operation = automl_client.create_dataset(
        parent=parent_location, dataset=dataset_spec
    )
    created = operation.result()

    # The id is the final segment of the dataset's resource name.
    new_dataset_id = created.name.split("/")[-1]

    # Display the dataset information
    print("Dataset name: {}".format(created.name))
    print("Dataset id: {}".format(new_dataset_id))

    return new_dataset_id
    # [END automl_language_text_classification_create_dataset]

In [61]:
# Create the 'happydb' dataset and keep its id for the import and training steps.
# NOTE(review): `project_id` is not defined in any visible cell (the shell
# cells use PROJECT_ID); presumably it comes from docai_module.config - confirm.
dataset_id = create_dataset(project_id, 'happydb')

Dataset name: projects/411150075841/locations/us-central1/datasets/TCN3738903034147635200
Dataset id: TCN3738903034147635200


To verify if the dataset was created, go to the Google Cloud Web console and navigate to:
 - AutoML Natural Language > Datasets
 
<img src="./images/3_2_menu_automl.png"
     alt="AutoML Dataset"
     style="width:25%"
     />
     
You should see an entry with the dataset you just created:

<img src="./images/3_3_newdataset.png"
     alt="AutoML Dataset"
     style="width:70%"
     />

#### Step 2 - Import the dataset

The next step is to populate the dataset with a list of training content items labeled using the target categories.

The import_dataset function interface takes as input a .csv file that lists the locations of all training documents and the proper label for each training document. (See Preparing your training data for details about the required format.) For this tutorial, we will be using happiness.csv, which you uploaded to Google Cloud Storage above.

In [None]:
def import_dataset(project_id, dataset_id, path):
    """Import data from Cloud Storage into an existing AutoML dataset.

    Args:
      project_id: Project that owns the dataset.
      dataset_id: Id of the target dataset (in us-central1).
      path: One or more input URIs; several URIs are separated by commas.

    Returns:
      None. Progress and the operation result are printed.
    """
    # [START automl_import_data]
    from google.cloud import automl

    automl_client = automl.AutoMlClient()

    # Full resource path of the dataset that receives the data.
    target_dataset = automl_client.dataset_path(
        project_id, "us-central1", dataset_id
    )

    # `path` may hold multiple comma-separated Google Cloud Storage URIs.
    source_config = automl.InputConfig(
        gcs_source=automl.GcsSource(input_uris=path.split(","))
    )

    # Start the import from the input URIs.
    operation = automl_client.import_data(
        name=target_dataset, input_config=source_config
    )

    print("Processing import...")
    # result() is evaluated first, so the line below appears only once it returns.
    outcome = operation.result()
    print("Data imported. {}".format(outcome))
    # [END automl_import_data]

In [63]:
# GCS URI of the training CSV, passed to import_dataset below.
# NOTE(review): presumably the same "<project>-lcm" bucket the gsutil cells
# wrote to; that holds only if project_id equals PROJECT_ID - confirm.
path = f'gs://{project_id}-lcm/happiness.csv'

In [None]:
# This process may take several minutes: import_dataset waits for the
# import operation's result before it prints "Data imported.".
import_dataset(project_id, dataset_id, path)

Processing import...


After the import is finished, you should see the data in the AutoML NL console.

You can see the number of items uploaded for each specific label.
It is possible to click on a specific item and change its label.

<img src="./images/3_4_importdata.png"
     alt="AutoML Dataset"
     style="width:70%"
     />

#### Step 3 - Create the model
Now that you have a dataset of labeled training documents, you can train a new model.



In [67]:
def create_model(project_id, dataset_id, display_name, wait=False):
    """Start training an AutoML text-classification model.

    Args:
      project_id: Project used to build the parent location path.
      dataset_id: Id of the dataset to train on.
      display_name: Display name given to the new model.
      wait: When False (default, the original behaviour) return right after
        training has been started. When True, block until the training
        operation finishes and return the id of the resulting model.

    Returns:
      With wait=False: the last "/"-separated segment of the training
      *operation* name. With wait=True: the last segment of the trained
      model's resource name.
    """
    from google.cloud import automl

    client = automl.AutoMlClient()

    # A resource that represents Google Cloud Platform location.
    project_location = f"projects/{project_id}/locations/us-central1"
    # Leave model unset to use the default base model provided by Google
    metadata = automl.TextClassificationModelMetadata()
    model = automl.Model(
        display_name=display_name,
        dataset_id=dataset_id,
        text_classification_model_metadata=metadata,
    )

    # Create a model with the model metadata in the region.
    response = client.create_model(parent=project_location, model=model)

    print(u"Training operation name: {}".format(response.operation.name))
    print("Training started...")

    if wait:
        # Training can take a long time; result() blocks until it is done and
        # yields the model, whose name carries the actual model id.
        trained_model = response.result()
        print("Model name: {}".format(trained_model.name))
        return trained_model.name.split("/")[-1]

    # This is the id of the training operation, not one read from a model
    # resource. NOTE(review): callers use it as the model id; confirm that the
    # two ids coincide for this service, or call with wait=True.
    return response.operation.name.split("/")[-1]

In [66]:
# Start training 'mymodel' on the imported dataset.
# NOTE(review): the value stored in model_id is taken from the training
# operation name; later cells use it as the model id - confirm they coincide.
model_id = create_model(project_id, dataset_id, 'mymodel')

Training operation name: projects/411150075841/locations/us-central1/operations/TCN2899544378718552064
Training started...


In the web console you should see the model creation progress:

<img src="./images/3_5_modelcreate.png"
     alt="AutoML Dataset"
     style="width:30%"
     />

#### Step 4 - Evaluate the model

After training, you can evaluate your model's readiness by reviewing its precision, recall, and F1 score.

The list_model_evaluations function takes the project ID and the model ID as parameters.

In [70]:
def list_model_evaluations(project_id, model_id):
    """Print every evaluation listed for an AutoML model.

    For each evaluation this prints its name, annotation spec id, creation
    time, evaluated example count and classification metrics.

    Args:
      project_id: Project that owns the model.
      model_id: Id of the model (in us-central1) whose evaluations are listed.

    Returns:
      None. Everything is printed.
    """
    from google.cloud import automl

    automl_client = automl.AutoMlClient()
    # Full resource path of the model.
    model_name = automl_client.model_path(project_id, "us-central1", model_id)

    print("List of model evaluations:")
    # An empty filter string is passed, as in the listing call's original form.
    for item in automl_client.list_model_evaluations(parent=model_name, filter=""):
        # (template, value) pairs, printed in this order for each evaluation.
        report_lines = (
            ("Model evaluation name: {}", item.name),
            ("Model annotation spec id: {}", item.annotation_spec_id),
            ("Create Time: {}", item.create_time),
            ("Evaluation example count: {}", item.evaluated_example_count),
            (
                "Classification model evaluation metrics: {}",
                item.classification_evaluation_metrics,
            ),
        )
        for template, value in report_lines:
            print(template.format(value))

In [None]:
# Print the evaluations for the model started above.
# NOTE(review): presumably only meaningful once training has finished - confirm.
list_model_evaluations(project_id, model_id)

#### Step 5 - Deploy the model

When your custom model meets your quality standards, you can deploy it and then make prediction requests.

In [76]:
def deploy_model(project_id, model_id):
    """Deploy an AutoML model and print the deployment result.

    Args:
      project_id: Project that owns the model.
      model_id: Id of the model (in us-central1) to deploy.

    Returns:
      None. The operation result is printed.
    """
    from google.cloud import automl

    automl_client = automl.AutoMlClient()
    model_name = automl_client.model_path(project_id, "us-central1", model_id)

    operation = automl_client.deploy_model(name=model_name)
    # result() is evaluated before the message is printed.
    outcome = operation.result()

    print(f"Model deployment finished. {outcome}")

In [None]:
# Deploy the model; deploy_model waits for the deployment operation's result
# before printing its completion message.
deploy_model(project_id, model_id)

#### Step 6 - Use the model to make a prediction

After you deploy your model, you can use it to classify novel content.

In [75]:
def predict(project_id, model_id, content):
    """Send a plain-text snippet to an AutoML model and print its predictions.

    Args:
      project_id: Project that owns the model.
      model_id: Id of the model (in us-central1) to query.
      content: Text to classify; it is sent with mime type "text/plain".

    Returns:
      None. The display name and classification score of every entry in the
      response payload are printed.
    """
    from google.cloud import automl

    service = automl.PredictionServiceClient()

    # Full resource path of the model.
    model_name = automl.AutoMlClient.model_path(project_id, "us-central1", model_id)

    # Supported mime_types: 'text/plain', 'text/html'
    # https://cloud.google.com/automl/docs/reference/rpc/google.cloud.automl.v1#textsnippet
    request_payload = automl.ExamplePayload(
        text_snippet=automl.TextSnippet(content=content, mime_type="text/plain")
    )

    prediction = service.predict(name=model_name, payload=request_payload)

    for result in prediction.payload:
        print(u"Predicted class name: {}".format(result.display_name))
        print(u"Predicted class score: {}".format(result.classification.score))

In [None]:
# Classify a new sentence; predict prints each returned class name with its score.
predict(project_id, model_id, 'I finally passed the exams.')