In [None]:
# Install/upgrade the Watson Developer Cloud Python SDK in the notebook environment.
# NOTE(review): no visible cell imports this package (NLU is called through the
# REST data source) — confirm the install is still needed.
!pip install --upgrade watson-developer-cloud

In [None]:
import json
import time

## Load data from DB2 Warehouse on the Cloud to Spark Dataframe
Specify the credentials for your DB2 Warehouse on the Cloud instance and read the table data into a Spark DataFrame. To do so:

- Click the Data icon (top right)
- Choose the Connections tab
- Select "Insert SparkSession DataFrame"
   - Select the correct schema
   - Choose Table DSX_CLOUDANT_SINGER_TWEETS

This should copy the required code into the active notebook cell; that code accesses your DB2 Warehouse on the Cloud instance and reads the table DSX_CLOUDANT_SINGER_TWEETS into a Spark DataFrame.

In [None]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by the rest of the notebook (getOrCreate reuses
# an already-running session when there is one).
spark = SparkSession.builder.getOrCreate()

In [None]:
# The code was removed by DSX for sharing.

## Data Exploration and Curation
Run some analysis and exploration of the data to verify it is as expected

In [None]:
# Bind the loaded table to brandTweetsDF, the name used for all further processing.
# data_df_1 is not defined in the visible cells; it is presumably created by the
# generated connection code that was removed from the cell above — verify the name
# matches what "Insert SparkSession DataFrame" produced.
brandTweetsDF = data_df_1

In [None]:
# Show the first 2 rows of the Spark DataFrame, converted to pandas for display
brandTweetsDF.limit(2).toPandas()

In [None]:
# Print the column names and types of the loaded data
brandTweetsDF.printSchema()

In [None]:
## Drop the _ID and _REV columns, which are not used in the rest of the visible notebook
brandTweetsDF = brandTweetsDF.drop('_ID','_REV')

In [None]:
import datetime
from datetime import date
from dateutil import parser
from pyspark.sql.functions import udf
from pyspark.sql.types import DateType


def getDay(date):
    """Return the calendar day of a timestamp value.

    Parameters
    ----------
    date : any
        Value whose ``str()`` form can be read by ``dateutil.parser.parse``,
        or ``None`` for a missing value.

    Returns
    -------
    datetime.date or None
        The date part (no hour/minute/second) of the parsed timestamp, or
        ``None`` when ``date`` is ``None``.

    Raises
    ------
    ValueError
        Propagated from ``dateutil`` when the value cannot be parsed.
    """
    # A null value would otherwise be turned into the string 'None', which
    # dateutil cannot parse, and the exception would fail the whole Spark job.
    if date is None:
        return None
    # No per-row print here: this function runs as a UDF on every record.
    return parser.parse(str(date)).date()

# Wrap getDay as a Spark UDF whose result column is of DateType
udfGetDay = udf(getDay, DateType())

# Add a DAY field holding the day the tweet was created (ignoring hour/minute/second),
# computed from the CREATED_AT column
brandTweetsDF = brandTweetsDF.withColumn('DAY',udfGetDay('CREATED_AT'))

# Verify added field is as expected
brandTweetsDF.select("DAY").limit(5).toPandas()

## Extract a Random Sample of Records
Next, we will extract a random sample of records to run NLU enrichment on. This is needed to make sure we don't exceed our limit of free NLU calls per day.

In [None]:
## Take a random sample of the data to send to Watson NLU.
## The target is about 500 records, to stay below the 1000 free NLU calls per day.
## DataFrame.sample() is probabilistic: the number of rows returned only
## approximates sample_num_records, so check the count printed at the end.
import random

num_records = brandTweetsDF.count()
sample_num_records = 500

# The sampling fraction must lie in [0, 1] when sampling without replacement:
#  - an empty table would otherwise cause a division by zero;
#  - a table smaller than the target would otherwise give a fraction above 1.
if num_records > 0:
    fraction = min(1.0, float(sample_num_records) / float(num_records))
else:
    fraction = 0.0

# The seed is drawn at random but printed, so a given sample can be reproduced.
seed = random.randint(1, 100)
print('Number of records: ', num_records, ' Sample size: ', sample_num_records, ' Fraction: ', fraction, ' Seed: ', seed)
brandTweetsSampleDF = brandTweetsDF.sample(False, fraction, seed)


## Alternative Stratified Sampling approach
## Returns RDD with length of 2, first col is the key (day) and second col is the original row for the key
## Take only the actual data (column 1)
## If you'd like to use this approach, uncomment the following 4 lines

#fractionList = brandTweetsDF.rdd.map(lambda x: x['DAY']).distinct().map(lambda x: (x,fraction)).collectAsMap()
#keybyday = brandTweetsDF.rdd.keyBy(lambda x: x['DAY'])
#brandTweetsDFrdd = keybyday.sampleByKey(False,fractionList).map(lambda x: x[1])
#brandTweetsSampleDF = spark.createDataFrame(brandTweetsDFrdd,brandTweetsDF.schema)


print('Number of records to send to NLU:', brandTweetsSampleDF.count())

In [None]:
# Tabulate the number of sampled tweets for each DAY value
from pyspark.sql import functions as F
brandTweetsSampleDFperDay = (
    brandTweetsSampleDF
    .groupBy('DAY')
    .agg(F.count('ID').alias('NUM_TWEETS_PER_DAY'))
)
brandTweetsSampleDFperDay.show()

Run a clean text function on all records to remove unwanted characters.

In [None]:
# Add a step to clean the text by removing certain characters such as \n, \r, &amp, ...
from pyspark.sql.types import StringType

def cleanText(text):
    """Remove line breaks, a few escape/markup sequences and double quotes from text.

    Parameters
    ----------
    text : str or None
        Raw text to clean.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when ``text`` is ``None``.
    """
    # A null text column would otherwise raise AttributeError inside the UDF
    # and fail the whole Spark job.
    if text is None:
        return None

    # Applied in order; each pair is (substring to find, replacement).
    replacements = (
        ('\n', ' '),
        ('//', ''),
        ('\\3', ''),
        ('\r', ''),
        ('&amp;', ' '),
        ('&lt;', ' '),
    )
    cleaned = text
    for target, substitute in replacements:
        cleaned = cleaned.replace(target, substitute)

    # Trim surrounding whitespace first, then drop double quotes (same order as before).
    # No per-row print here: this function runs as a UDF on every record.
    return cleaned.strip().replace('"', '')

# Wrap cleanText as a Spark UDF returning strings
udfcleanText = udf(cleanText, StringType())

# Apply the cleaning UDF to the TEXT_CLEAN column of the sample and store the
# result in a new column named textnew
brandTweetsCleanDF = brandTweetsSampleDF.withColumn('textnew',udfcleanText('TEXT_CLEAN'))

In [None]:
# Row count of the cleaned sample
brandTweetsCleanDF.count()

## Specify NLU Credentials
Next, you need to specify the credentials for your Watson Natural Language Understanding (NLU) service. If you don't have an NLU service, you can create one by following [these instructions](https://console.bluemix.net/docs/services/natural-language-understanding/getting-started.html#getting-started-tutorial) and obtaining the service credentials. You need to specify the URL, username, and password.

In [None]:
# Specify NLU credentials.
# The username and password are read from the NLU_USERNAME / NLU_PASSWORD
# environment variables so that secrets are not stored in the notebook.
# NOTE(review): the credentials that used to be embedded in this cell should be
# treated as exposed and rotated in the service console.
import os

credentials_json = {
    "nlu_url": "https://gateway.watsonplatform.net/natural-language-understanding/api",
    "nlu_username": os.environ.get("NLU_USERNAME", ""),
    "nlu_password": os.environ.get("NLU_PASSWORD", ""),
    "nlu_version": "2017-02-27"
}

# Report a missing setting here instead of failing later with an authentication error.
if not (credentials_json["nlu_username"] and credentials_json["nlu_password"]):
    print("NLU credentials are not set: define NLU_USERNAME and NLU_PASSWORD before calling the service.")

## NLU Enrichment using REST datasource extension
In this notebook, we leverage the REST data source extension for Apache Spark as explained in the [blog](https://medium.com/ibm-data-science-experience/using-spark-as-a-parallel-processing-framework-for-accessing-rest-based-data-services-cd4c98526784) and [github repository](https://github.com/sourav-mazumder/Data-Science-Extensions/tree/master/spark-datasource-rest). The REST data source extension allows us to leverage the distributed compute power of Spark in making REST API calls. As explained on the github repository under the "Using Rest Data Source in IBM Data Science Experience (DSx)" section, you need to have access to your Apache Spark as a Service and upload the jar for this REST data source extension to your Apache Spark instance.

In [None]:
# Build the complete NLU "analyze" endpoint from the base URL and the API version, e.g.
# https://gateway.watsonplatform.net/natural-language-understanding/api/v1/analyze?version=2017-02-27
nlu_uri = "".join([
    credentials_json['nlu_url'],
    "/v1/analyze?version=",
    credentials_json['nlu_version'],
])

# Service username and password taken from the credentials dictionary
nlu_username, nlu_password = (
    credentials_json[key] for key in ('nlu_username', 'nlu_password')
)

In [None]:
from pyspark.sql.functions import lit

# Keep only the cleaned text (renamed to `text`) and add a constant column titled
# `features` naming the NLU enrichments to request — here keywords and sentiment.
bd = (
    brandTweetsCleanDF
    .selectExpr("textnew as text")
    .withColumn('features', lit('keywords,sentiment'))
)
bd.head(2)

### REST Datasource 
For further details on these parameters, please consult the [following github repository](https://github.com/sourav-mazumder/Data-Science-Extensions/tree/master/spark-datasource-rest).


In [None]:
# Register the text/features columns as the temporary view "bdtbl"; that view name
# is what the REST data source options pass as 'input'.
bd.selectExpr("text","features").createOrReplaceTempView("bdtbl")

In [None]:

# Options handed to the REST data source (all values are strings); see the
# spark-datasource-rest README for the meaning of each option.
nluprms = dict(
    url=nlu_uri,
    input='bdtbl',
    method='GET',
    userId=nlu_username,
    userPassword=nlu_password,
    callStrictlyOnce='Y',
    partitions='10',
    connectionTimeout='2000',
    readTimeout='10000',
)

In [None]:
# Load the NLU results through the REST data source and report the wall-clock
# time taken by the load() call.
start_time = time.time()
brandtweetsNLUDF = (
    spark.read
    .format("org.apache.dsext.spark.datasource.rest.RestDataSource")
    .options(**nluprms)
    .load()
)
print("total run time for NLU enrichment using REST data source: ", time.time() - start_time)

In [None]:
# Inspect the structure of the DataFrame returned by the REST data source
brandtweetsNLUDF.printSchema()

In [None]:
# Look at the first 5 result rows as Row objects
brandtweetsNLUDF.head(5)

In [None]:
# Number of result rows returned; compare with the number of records sent to NLU
brandtweetsNLUDF.count()

In [None]:
# Show the first 5 result rows as a pandas table for easier reading
brandtweetsNLUDF.limit(5).toPandas()