### Install Watson Libraries

You only need to install once for your CPU.  If you are provisioning a new CPU on the cloud, you may need to install again.

In [None]:
!pip install --upgrade "ibm-watson>=4.3.0"

Requirement already up-to-date: ibm-watson>=4.3.0 in /usr/local/lib/python3.6/dist-packages (4.4.1)


In [None]:
import json, os, csv
import pandas as pd
from ibm_watson import DiscoveryV1
from ibm_watson import AssistantV2
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

### Add Google Drive

You can store your data in Google Drives

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Configure Watson Discovery and Watson Assistant

1. Go to cloud.ibm.com
2. If you have not setup, create a new IBM ID using your uci.edu email address
3. Login using your IBM ID
4. Click create resource and then choose AI from left menu to find Watson Assistant and Discovery Services.  In both cases, you should create a lite instance, so you can work without paying any charges.  Make sure you select Dallas as your region.
5. Click Resource List in menu (4 horizontal bars on top left) to list all resources.  Your Watson Assistant and Discovery Services will show up under services.
6. In each service click the service, and click Service Credentials on left to get to credentials.  Create new credentials and click add.
7. Open credentials and copy apikey and url.  Also not date created in the credentials table,  this is the version date, written using format YYYY-mm-dd.

In [None]:
configuration_file_with_path = "/content/drive/My Drive/covid_data/credentials_json"
with open(configuration_file_with_path, 'r') as infile:
    configuration_json = json.loads(infile.read())
discovery_apikey = configuration_json['Discovery']['apikey']
discovery_url = configuration_json['Discovery']['url']
discovery_version = configuration_json['Discovery']['version']
discovery_authenticator = IAMAuthenticator(discovery_apikey)
discovery = DiscoveryV1(
    version= discovery_version,
    authenticator=discovery_authenticator
)

discovery.set_service_url(discovery_url)

In [None]:
assistant_apikey = configuration_json['Watson_Assistant']['apikey']
assistant_url = configuration_json['Watson_Assistant']['url']
assistant_version = configuration_json['Watson_Assistant']['version']
assistant_authenticator = IAMAuthenticator(assistant_apikey)
assistant = AssistantV2(
    version= assistant_version,
    authenticator=assistant_authenticator
)
assistant.set_service_url(assistant_url)


### Collect News using Watson Discovery

If you have other sources of news, you can skip this step.  Discovery provides you a consolidated set of news for the last 60 days.  The data is also available to end users using http://news-explorer.mybluemix.net/.  Provide a query for news API.  I have chosen "Covid 19" in my example.
Discovery gives you 50 news item each time.  If you need 200 items, you can use offset to call Discovery multiple times and get 200 news item.  In the example below, Discovery is called 4 times to get 200 items.

In [None]:
environments = discovery.list_environments().get_result()
response_1 = discovery.query('system', 'news-en', natural_language_query="Coronavirus", filter='country:"US",crawl_date:"2020-04"', passages = True, count=50,  offset = 0, deduplicate=True)
response_2 = discovery.query('system', 'news-en', natural_language_query="Coronavirus", filter='country:"US",crawl_date:"2020-04"', passages = True, count=50,  offset = 50, deduplicate=True)
response_3 = discovery.query('system', 'news-en', natural_language_query="Coronavirus", filter='country:"US",crawl_date:"2020-04"', passages = True, count=50,  offset = 100, deduplicate=True)
response_4 = discovery.query('system', 'news-en', natural_language_query="Coronavirus", filter='country:"US",crawl_date:"2020-04"', passages = True, count=50,  offset = 150, deduplicate=True)


### Create Training and Testing sets
Take the 200 news items and store first 100 in training set and the other 100 in test set.  Create training and test files with the data


In [None]:
training_news_list = []
test_news_list = []
news_list_1 = response_1.get_result()
news_list_2 = response_2.get_result()
news_list_3 = response_3.get_result()
news_list_4 = response_4.get_result()
for news in news_list_1['results']:
  training_news_list.append(news)
for news in news_list_2['results']:
  training_news_list.append(news)
for news in news_list_3['results']:
  test_news_list.append(news)
for news in news_list_4['results']:
  test_news_list.append(news)
print("No of training sentences: ", len(training_news_list))
print("No of test sentences: ", len(test_news_list))

No of training sentences:  100
No of test sentences:  100


In [None]:
output_training_sentence_file_with_path = '/content/drive/My Drive/covid_data/Corona_Virus_News_training_sentences.csv'
with open(output_training_sentence_file_with_path, 'w') as sentence_file:
    sentence_writer = csv.writer(sentence_file, delimiter=',')
    for news in training_news_list:
        sentence_text = news['text']
        if len(sentence_text) > 1024:
            sentence_text = sentence_text[0:2047]
        clean_sentence = [sentence_text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '), ""]   
        sentence_writer.writerow(clean_sentence)

In [None]:
output_test_sentence_file_with_path = '/content/drive/My Drive/covid_data/Corona_Virus_News_test_sentences.csv'
with open(output_test_sentence_file_with_path, 'w') as sentence_file:
    sentence_writer = csv.writer(sentence_file, delimiter=',')
    for news in test_news_list:
        sentence_text = news['text']
        if len(sentence_text) > 1024:
            sentence_text = sentence_text[0:2047]
        clean_sentence = [sentence_text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' '), ""]   
        sentence_writer.writerow(clean_sentence)

### Create Intents
Take the training file and label each news item with a category, using about 4-6 categories.  You may find some duplicates.  You can either remove them or tag them consistently.  If Watson discovers any duplicates, they will not be loaded and ignored.
Take the intent file and load them into Watson as follows:
1. Launch Assistant
2. Create skill.
3. Import intents

Create an assistant and attach the newly created skill to the assistant.  Note down assistant ID and replace assistant_id with the ID you created.

Now you are ready to test.

In [4]:
assistant_id = '8c3c7f2b-d9ca-4912-82e8-055c04bcf973'
response = assistant.create_session(assistant_id= assistant_id).get_result()
session_id = response['session_id']

NameError: name 'assistant' is not defined

In [3]:
pd_test_sentence = pd.read_csv('/content/drive/My Drive/covid_data/Corona_Virus_News_test_sentences.csv', header=None)
test_sentence_json_str = pd_test_sentence.to_json(orient='records')
test_sentence_json = json.loads(test_sentence_json_str)
test_sentence_json[0]

NameError: name 'pd' is not defined

In [2]:
test_results = []
counter_start = 0
counter_end = 100
counter = counter_start
for news in test_sentence_json:
  sentence_text = news['0']
  if len(sentence_text) > 2047:
    sentence_text = sentence_text[0:2047]
  clean_sentence = sentence_text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
  message_response = assistant.message(assistant_id = assistant_id, 
                                       session_id = session_id, 
                                      alternate_intents = True, 
                                      input ={'message_type': 'text',
                                              'text': clean_sentence}).get_result()
  intent_list = message_response['output']['intents']
  for intent in intent_list:
    print(clean_sentence)
    print(intent)
    test_results.append({'sentence': clean_sentence, 'intent': intent})
  counter = counter + 1
  if counter > counter_end:
    break


NameError: name 'test_sentence_json' is not defined

In [1]:
test_results = []
counter_start = 0
counter_end = 100
counter = counter_start
for news in test_sentence_json:
  sentence_text = news['0']
  if len(sentence_text) > 2047:
    sentence_text = sentence_text[0:2047]
  clean_sentence = sentence_text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')
  message_response = assistant.message(assistant_id = assistant_id, 
                                       session_id = session_id, 
                                      alternate_intents = True, 
                                      input ={'message_type': 'text',
                                              'text': clean_sentence}).get_result()

NameError: name 'test_sentence_json' is not defined

In [None]:
test_results_file_path = '/content/drive/My Drive/covid_data/Corona_Virus_News_test_results.csv'
pd_results = pd.DataFrame.from_dict(test_results)
pd_results.to_csv(test_results_file_path)