<a href="https://colab.research.google.com/github/kareemrasheed89/DataQuestVisualization-Proj/blob/master/Natural_Language_Processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Import needed libraries
import json
import os

import requests
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.oauth2 import service_account

# Build language API client (requires service account key).
# The key file location is taken from the GOOGLE_APPLICATION_CREDENTIALS
# environment variable when it is set; otherwise the notebook's original
# hard-coded path is used, so existing runs behave as before.
SERVICE_ACCOUNT_KEY_PATH = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/content/image-classification-288410-72948dec7680.json',
)
client = language.LanguageServiceClient.from_service_account_json(SERVICE_ACCOUNT_KEY_PATH)

# Define functions
def pull_googlenlp(client, url, invalid_types=('OTHER',), **data):
    """Fetch a page and summarise it with the Cloud Natural Language API.

    The HTML returned by ``load_text_from_url`` is sent to ``client`` twice:
    once through ``annotate_text`` (document sentiment and entities) and once
    through ``classify_text`` (content categories).

    Args:
        client: Language service client exposing ``annotate_text``,
            ``classify_text`` and ``enums.Entity.Type``.
        url: Address of the page to analyse.
        invalid_types: Entity type names to leave out of the result.
            ``None`` (or an empty collection) keeps every entity.
        **data: Extra options forwarded unchanged to ``load_text_from_url``.

    Returns:
        ``None`` when no text could be loaded. Otherwise a dict with the keys
        ``'sentiment'``, ``'entities'`` and ``'categories'``, each holding a
        list of plain dicts.
    """
    html = load_text_from_url(url, **data)
    if not html:
        return None

    # Treat an explicit None as "filter nothing" instead of failing on `in`.
    excluded_types = () if invalid_types is None else invalid_types

    document = types.Document(
        content=html,
        type=language.enums.Document.Type.HTML,
    )

    # NOTE(review): syntax extraction is requested but its tokens are not
    # read anywhere below; categories come from the separate classify call.
    features = {
        'extract_syntax': True,
        'extract_entities': True,
        'extract_document_sentiment': True,
        'extract_entity_sentiment': True,
        'classify_text': False,
    }

    annotation = client.annotate_text(document=document, features=features)
    sentiment = annotation.document_sentiment
    entities = annotation.entities

    categories = client.classify_text(document).categories

    def get_type(type_value):
        # Map the numeric entity type to its enum name using the argument
        # itself rather than whatever `entity` the enclosing loop is on.
        return client.enums.Entity.Type(type_value).name

    result = {'sentiment': [], 'entities': [], 'categories': []}

    if sentiment:
        result['sentiment'] = [{'magnitude': sentiment.magnitude, 'score': sentiment.score}]

    for entity in entities:
        type_name = get_type(entity.type)  # resolved once per entity
        if type_name in excluded_types:
            continue
        result['entities'].append({
            'name': entity.name,
            'type': type_name,
            'salience': entity.salience,
            # '-' marks entities without a Wikipedia link in their metadata.
            'wikipedia_url': entity.metadata.get('wikipedia_url', '-'),
        })

    for category in categories:
        result['categories'].append({'name': category.name, 'confidence': category.confidence})

    return result


def load_text_from_url(url, **data):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET request.
        **data: Optional settings; only ``timeout`` (default 20) is read and
            passed on to ``requests.get``.

    Returns:
        The response text when the status code is 200 and the body is
        non-empty, otherwise ``None``. Any exception raised while fetching is
        reported on stdout (with its reason) and also yields ``None``.
    """
    timeout = data.get('timeout', 20)

    try:
        print("Extracting text from: {}".format(url))
        response = requests.get(url, timeout=timeout)

        text = response.text
        if response.status_code == 200 and text:
            return text
        return None
    except Exception as e:
        # Best-effort fetch: never propagate, but say why it failed so the
        # caller is not left with an unexplained None.
        print('Problem with url: {0}. Reason: {1!r}'.format(url, e))
        return None

In [17]:
# Analyse a sample article; the returned dict is shown as the cell output.
url = "https://opensource.com/article/19/6/how-ssh-running-container"
pull_googlenlp(client=client, url=url)

Extracting text from: https://opensource.com/article/19/6/how-ssh-running-container


{'categories': [{'confidence': 0.9399999976158142,
   'name': '/Computers & Electronics/Programming'},
  {'confidence': 0.9200000166893005, 'name': '/Science/Computer Science'},
  {'confidence': 0.5, 'name': '/Computers & Electronics/Software'}],
 'entities': [{'name': 'SSH',
   'salience': 0.03275073692202568,
   'type': 'CONSUMER_GOOD',
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Ssh_(Secure_Shell)'},
  {'name': 'Linux',
   'salience': 0.022114111110568047,
   'type': 'CONSUMER_GOOD',
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Linux'},
  {'name': 'comments',
   'salience': 0.014860434457659721,
   'type': 'WORK_OF_ART',
   'wikipedia_url': '-'},
  {'name': 'Seth Kenlon',
   'salience': 0.010031153447926044,
   'type': 'PERSON',
   'wikipedia_url': '-'},
  {'name': 'Python',
   'salience': 0.0074731698259711266,
   'type': 'ORGANIZATION',
   'wikipedia_url': 'https://en.wikipedia.org/wiki/Python_(programming_language)'},
  {'name': 'DevOps',
   'salience': 0.00720455404