In [71]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from newsapi import NewsApiClient
import re
import io
import os
import time
# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import nltk
import speech_recognition as sr
import pke
from gensim.summarization.summarizer import summarize

# Load the small English spaCy pipeline once at import time; the entity
# helpers further down (TestEntities, NERFromDoc, displayEntities) share it.
nlp = en_core_web_sm.load()
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer (SIA).
nltk.download('vader_lexicon')

# Instantiates a client
# NOTE(review): the service-account key path is an absolute path on one
# developer's machine -- confirm how credentials should be supplied elsewhere.
client = speech.SpeechClient.from_service_account_json('/home/kasun/Downloads/PinAlpha-b887aae6e63a.json')
# Detects speech in the audio file

def transcribe_file(speech_file):
    """Transcribe the given local audio file and print the result.

    For every recognised portion of the audio the most likely transcript and
    its confidence are printed. Nothing is returned.

    One specific path is special-cased for the demo: instead of calling the
    Speech API, a previously saved transcript is read from disk and printed
    together with a fixed confidence value.

    Args:
        speech_file: Path of the audio file. The API branch sends it as
            LINEAR16 at 48 kHz with language code 'en-US'.
    """
    if speech_file == "/home/kasun/Videos/Starhubhh.wav":
        # Demo shortcut: replay a stored transcript rather than calling the
        # API. The context manager closes the file even if reading fails.
        with open("/home/kasun/Starhub.txt", 'r') as f:
            message = f.read()
        # Short pause kept from the original demo flow.
        time.sleep(2)
        print(u'Transcript: {}'.format(message))
        print('Confidence: {}'.format(0.916329607))
        return

    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='en-US')

    response = client.recognize(config, audio)
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u'Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))

def transcribe_gcs(gcs_uri):
    """Transcribe audio referenced by ``gcs_uri`` with a long-running request.

    The audio is declared as FLAC at 48 kHz with language code 'en-US'. The
    function waits up to 90 seconds for the operation to finish, then prints
    the top transcript and its confidence for every result. Nothing is
    returned.
    """
    remote_audio = types.RecognitionAudio(uri=gcs_uri)
    recognition_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
        sample_rate_hertz=48000,
        language_code='en-US')

    # Uses the module-level Speech client.
    pending = client.long_running_recognize(recognition_config, remote_audio)

    print('Waiting for operation to complete...')
    finished = pending.result(timeout=90)

    # Results arrive in audio order, one per consecutive portion.
    for portion in finished.results:
        # Index 0 is the most likely alternative for this portion.
        print(u'Transcript: {}'.format(portion.alternatives[0].transcript))
        print('Confidence: {}'.format(portion.alternatives[0].confidence))


def transcribe_streaming(stream_file):
    """Send the given audio file through the streaming recognition API.

    The whole file is read into memory and sent as a single chunk (LINEAR16,
    48 kHz, 'en-US'). For every result in every response the finality flag,
    stability, and each alternative's confidence and transcript are printed.
    Nothing is returned.
    """
    with io.open(stream_file, 'rb') as audio_file:
        payload = audio_file.read()

    # A real stream would yield many chunks; here there is exactly one.
    chunks = [payload]
    requests = (types.StreamingRecognizeRequest(audio_content=piece)
                for piece in chunks)

    base_config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=48000,
        language_code='en-US')
    streaming_config = types.StreamingRecognitionConfig(config=base_config)

    # streaming_recognize returns a generator of responses.
    for reply in client.streaming_recognize(streaming_config, requests):
        # Once the transcription has settled, the first result carries
        # is_final; later results cover subsequent portions of the audio.
        for outcome in reply.results:
            print('Finished: {}'.format(outcome.is_final))
            print('Stability: {}'.format(outcome.stability))
            # Alternatives are ordered from most likely to least likely.
            for candidate in outcome.alternatives:
                print('Confidence: {}'.format(candidate.confidence))
                print(u'Transcript: {}'.format(candidate.transcript))
    
def RecogniseAudio():
    """Record one utterance from the microphone and return its transcript.

    After adjusting for ambient noise, a phrase is captured and passed to
    ``recognize_google``. The resulting text is written to
    /home/kasun/CustomAudio.txt (followed by a newline), printed, and
    returned.
    """
    recognizer = sr.Recognizer()
    microphone = sr.Microphone()

    print("I am listening, Say something: ")
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        captured = recognizer.listen(source)

    print("transcript: ")
    Text = recognizer.recognize_google(captured)

    # Persist the transcript so the file-based helpers can consume it.
    with open("/home/kasun/CustomAudio.txt", "w") as text_file:
        text_file.write("{}\n".format(Text))

    print(Text)
    return Text
    
def ExtractKeyPhrases(TextFile):
    """Print the 20 best TopicRank keyphrases found in a raw text file.

    Args:
        TextFile: Path of the plain-text document to analyse.
    """
    # Build the TopicRank extractor on the input document.
    topic_rank = pke.unsupervised.TopicRank(input_file=TextFile)

    # The document is read as raw text (a plain text file).
    topic_rank.read_document(format='raw')

    # Select keyphrase candidates, then weight them.
    topic_rank.candidate_selection()
    topic_rank.candidate_weighting()

    # Keep the 20 highest-scored candidates, unstemmed; only the first
    # element of each returned entry (the phrase) is printed.
    top_candidates = topic_rank.get_n_best(n=20, stemming=False)
    phrases = [entry[0] for entry in top_candidates]

    print("Extracted Keywords:")
    print(phrases)
    print("\n")
    
def ExtractSummary(TextFile):
    """Print a short extractive summary of a text file, then its sentiment.

    The file is summarised to roughly 50 words. If summarisation fails (for
    example because the text is too short), a notice and the full text are
    printed instead. In both cases the text is then passed to
    ``SentimentAnalysisNLTK``.

    Args:
        TextFile: Path of the plain-text document to summarise.
    """
    # The context manager closes the file even if reading raises.
    with open(TextFile, 'r') as f:
        message = f.read()

    try:
        key_points = summarize(message, word_count=50)
    except Exception:
        # Best-effort fallback kept from the original. Unlike a bare
        # ``except:``, this no longer swallows KeyboardInterrupt/SystemExit.
        print("Too short text to recognise key points.")
        print(message)
    else:
        print("Key Point(s): ", key_points)

    print("\n \n")
    SentimentAnalysisNLTK(message)
    
def SentimentAnalysisNLTK(Text):
    """Print the VADER compound polarity score of ``Text`` as "Impact Score".

    The text is lower-cased before it is scored.
    """
    analyzer = SIA()
    scores = analyzer.polarity_scores(Text.lower())
    print("Impact Score : ", scores['compound'])

def SentimentAnalysis(text):
    """Print the Google Cloud Natural Language sentiment for ``text``.

    Prints the text, then the document sentiment score and magnitude.

    Args:
        text: Plain text to analyse.
    """
    # Imported locally: the module never imports `language`, and the
    # module-level `enums`/`types` belong to google.cloud.speech, which has
    # no Document type. The local names shadow those only in this function.
    from google.cloud import language
    from google.cloud.language import enums, types

    # Named so it is not confused with the module-level Speech `client`.
    language_client = language.LanguageServiceClient()
    document = types.Document(
        content=text,
        type=enums.Document.Type.PLAIN_TEXT)
    # Detects the sentiment of the text
    sentiment = language_client.analyze_sentiment(
        document=document).document_sentiment
    print('Text: {}'.format(text))
    print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))


def TestEntities():
    """Run the spaCy pipeline on a fixed sentence and print its entities.

    The output is a list of (entity text, entity label) pairs.
    """
    sample = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
    parsed = nlp(sample)
    pairs = [(entity.text, entity.label_) for entity in parsed.ents]
    print(pairs)

    
def NERFromDoc(TextFile):
    """Read a text file and return it processed by the spaCy pipeline.

    Args:
        TextFile: Path of the plain-text document to process.

    Returns:
        The object produced by calling the module-level ``nlp`` on the
        file's full contents.
    """
    # The context manager closes the file even if reading raises.
    with open(TextFile, 'r') as f:
        message = f.read()
    return nlp(message)

def showCountsOfEntities(article):
    """Count how often each entity label occurs in ``article``.

    Args:
        article: Object exposing ``ents``, an iterable of items that each
            have a ``label_`` attribute.

    Returns:
        A ``Counter`` mapping each label to its number of occurrences. The
        original built this counter and discarded it, so nothing was ever
        shown; returning it lets a notebook cell display it.
    """
    return Counter(entity.label_ for entity in article.ents)
    
def showMostCommon(article, n):
    """Return the ``n`` most frequent entity texts in ``article``.

    Args:
        article: Object exposing ``ents``, an iterable of items that each
            have a ``text`` attribute.
        n: Maximum number of entries to return.

    Returns:
        A list of (entity text, count) tuples, most frequent first. The
        original computed this list and discarded it, so nothing was ever
        shown; returning it lets a notebook cell display it.
    """
    return Counter(entity.text for entity in article.ents).most_common(n)
    
def getSentences(article):
    """Return the items of ``article.sents`` as a list, in iteration order."""
    return list(article.sents)

def displayEntities(doc):
    """Render the named entities of ``doc`` inline in a Jupyter notebook.

    ``doc`` is converted to a string and run through the module-level
    pipeline again before being handed to displaCy's entity visualiser.
    """
    reparsed = nlp(str(doc))
    displacy.render(reparsed, jupyter=True, style='ent')

    
def getNewsAPIData(query):
    """Query NewsAPI and print the top headlines and the source list.

    Three requests are made in order: business top headlines matching
    ``query`` (printed), a fixed "everything" search (response discarded),
    and the list of sources (printed). Nothing is returned.

    Args:
        query: Search term for the top-headlines request.
    """
    # NOTE(review): the API key is hard-coded in source -- consider moving it
    # to configuration or an environment variable.
    api = NewsApiClient(api_key='00e887a151f345c68dc57a1c19526283')

    # /v2/top-headlines
    print(api.get_top_headlines(q=query,
                                category='business',
                                language='en'))

    # /v2/everything
    # NOTE(review): this request ignores `query` and its response is never
    # used; it is kept only so the sequence of API calls stays the same.
    api.get_everything(q='bitcoin',
                       sources='bbc-news,the-verge',
                       domains='bbc.co.uk,techcrunch.com',
                       from_param='2017-12-01',
                       to='2017-12-12',
                       language='en',
                       sort_by='relevancy',
                       page=2)

    # /v2/sources
    print(api.get_sources())
    
def getSentencesWithWords(word, article):
    """Return the period-terminated sentences of ``article`` containing ``word``.

    A sentence here is a run of non-period characters ending in a period, so
    any leading whitespace after the previous period is part of the match.
    Matching is case-sensitive and literal: ``word`` is escaped before being
    placed in the pattern.

    The original ignored ``word`` (the pattern hard-coded "knowledge") and
    discarded the matches; both are fixed here.

    Args:
        word: Text that must occur in a sentence.
        article: Text to search; it is converted with ``str()`` so that
            non-string document objects can be passed as well.

    Returns:
        A list of matching sentence strings, in order of appearance.
    """
    pattern = r"([^.]*?" + re.escape(word) + r"[^.]*\.)"
    return re.findall(pattern, str(article))
    

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/kasun/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Alibaba Earnings Call to Report - Short Demo

# Transcribe

In [72]:
# Send the local recording to the Speech API and print each recognised
# portion with its confidence.
transcribe_file('/home/kasun/Videos/babalong.wav')

Transcript: Global macro economic conditions have become more uncertain people wonder about Potential reverberations from the global economic slowdown the threat of rising interest rates and political and that turmoil in Europe in the case of china we see reports of decelerate in GDP growth week purchasing managers index and stress Equity markets I know that Alibaba investors have many questions about the operating environment in China so I want to give you a straightforward assessment from our vantage point
Confidence: 0.9614502787590027
Transcript: I'm trying to macro retail sales growth was 9% according to the latest July and August 8th at from the National Bureau of Statistics the ndf data shows weakness in large ticket items such as home appliances and Autos which is consistent with the view that consumers are cutting back on durable goods purchases
Confidence: 0.948798418045044


# Entity Recognition

In [73]:
# Read the saved transcript excerpt and run it through the spaCy pipeline.
article = NERFromDoc('/home/kasun/baba.txt')

In [74]:
# Render the entities inline; the document is converted back to a string and
# parsed again before rendering.
displacy.render(nlp(str(article)), jupyter=True, style='ent')

# Key Phrase and Bullet-point Report

In [76]:
# Print the TopicRank keyphrases, then the summary and VADER impact score
# for the same excerpt.
ExtractKeyPhrases("/home/kasun/baba.txt")
ExtractSummary("/home/kasun/baba.txt")

Extracted Keywords:
['china', 'global macro economic conditions', 'case', 'interest rates', 'political', 'statistics', 'reports', 'europe', 'turmoil', 'threat', 'many questions', 'autos', 'national bureau', 'operating environment', 'ndf data shows weakness', 'view', 'alibaba investors', 'decelerate', 'consistent', 'home appliances']


Key Point(s):  Global macro economic conditions have become more uncertain people wonder about Potential reverberations from the global economic slowdown the threat of rising interest rates and political and that turmoil in Europe.
In the case of china we see reports of decelerate in GDP growth week purchasing managers index and stress Equity markets.

 

Impact Score :  -0.6759


# Full Transcript Entity Recognition

In [77]:
# Same entity-recognition steps as before, applied to the full transcript
# file.
article = NERFromDoc('/home/kasun/babaTranscript.txt')
displacy.render(nlp(str(article)), jupyter=True, style='ent')

# Bullet-point Report

In [78]:
# Keyphrases, summary and impact score for the full transcript file.
ExtractKeyPhrases("/home/kasun/babaTranscript.txt")
ExtractSummary("/home/kasun/babaTranscript.txt")

Extracted Keywords:
['quarter', 'consumers', 'growth rate', 'china', 'rmb', 'alibaba group', 'new users', 'revenue', 'retail sector', 'digitization', 'cell phones due', 'business speaks', 'developments', 'greater content investments', 'marketplace core commerce', 'customers', 'increase', 'platform', 'losses', 'product recommendations']


Key Point(s):  We also see enhanced consumer engagement from our existing users that resulted in robust GMV results in which Tmall continues to expand market leadership in B2C e-commerce and Taobao recorded its third consecutive quarter of strong GMV growth.

 

Impact Score :  1.0
