In [36]:
import pandas as pd
import numpy as np
from newspaper import Article
import nltk
import pickle
from langdetect import detect
from gensim.summarization.summarizer import summarize as gensim_summarize 

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity 
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.stem.porter import PorterStemmer

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
### for summarization
from gensim.summarization.summarizer import summarize as gensim_summarize 

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from langdetect import detect

In [12]:
def get_text(url):
    """
    Func: 1. get raw text from url 2. get summary & keyword from text
        Input: url, a link to article
        Output: dictionary contains 3 keys, text, summary & keywords
    """
    try:
        article = Article(url)
        article.download()

        ### parse html file
        article.parse()
        text = article.text
    
        return text
    except:
        print(f'fail to download news from {url}')
        return None

In [18]:
def detect_lang(text):
    ### translate to english
    try:
        language = detect(text)
        print(f"language is {language}")
    except:
        print("fail to detect language")
        language = "other"
    return language

In [27]:
def get_text_translate(text):
    if 'TRANSLATOR_TEXT_KEY' in os.environ:
        subscriptionKey = os.environ['TRANSLATOR_TEXT_KEY']
    else:
        print('Environment variable for TRANSLATOR_TEXT_KEY is not set.')
        #exit()
    # If you want to set your subscription key as a string, uncomment the line
    # below and add your subscription key.
    subscriptionKey = "331f7ace25a849639d0d319181758dff"

    base_url = 'https://api.cognitive.microsofttranslator.com'
    path = '/translate?api-version=3.0'
    params = '&to=en'
    constructed_url = base_url + path + params

    headers = {
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Content-type': 'application/json',
        'X-ClientTraceId': str(uuid.uuid4())}
    return constructed_url, headers
    
def get_translated_text(text):
    constructed_url, headers = get_text_translate(text)
    body = [{'text': text}]
    request = requests.post(constructed_url, headers=headers, json=body)
    response = request.json()
    #return response
    return response[0]["translations"][0]["text"]

In [21]:
def summarize(string, **kwargs):
    """
    kwargs:
        1, ratio (float, optional) – Number between 0 and 1 that determines the proportion of the number of sentences 
           of the original text to be chosen for the summary.
        2, word_count (int or None, optional) – Determines how many words will the output contain. 
           If both parameters are provided, the ratio will be ignored.
        3, split (bool, optional) – If True, list of sentences will be returned. 
           Otherwise joined strings will bwe returned.
    """
    try:
        summarized = gensim_summarize(string,**kwargs)
    except:
        return string
    return summarized

In [43]:
def translation_clean(text):
    
    cidcompile1=re.compile(r'\(*\s*\(*\s*[cC]\s*i\s*\)*\s*d\s*\:*\s*\:*(?:\(*\s*[cC]\s*i\s*d\s*\:*)?\:*\s*[0-9]+\s*[0-9]{0,}\s*\)*\s*\)*')
    cidcompile2=re.compile(r'\(\s*[cC]\s*i\s*\)*\s*d\s*\:')
    cidcompile3=re.compile(r'\:\s*[0-9]+\s*\)')
    punctuation = re.compile(r',.:\'')

    t_str=re.sub(cidcompile1,' ',text)
    t_str=re.sub(cidcompile2,' ',t_str)
    t_str=re.sub(cidcompile3,' ',t_str)
    
    control = re.compile('\x00|\x01|\x02|\x03|\x04|\x05|\x06|\x07|\x08|\x09|\x0a|\x0b|\x0c|\x0d|\x0e|\x0f|\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f|\x7f|\xc2\x80|\xc2\x81|\xc2\x82|\xc2\x83|\xc2\x84|\xc2\x85|\xc2\x86|\xc2\x87|\xc2\x88|\xc2\x89|\xc2\x8a|\xc2\x8b|\xc2\x8c|\xc2\x8d|\xc2\x8e|\xc2\x8f|\xc2\x90|\xc2\x91|\xc2\x92|\xc2\x93|\xc2\x94|\xc2\x95|\xc2\x96|\xc2\x97|\xc2\x98|\xc2\x99|\xc2\x9a|\xc2\x9b|\xc2\x9c|\xc2\x9d|\xc2\x9e|\xc2\x9f')
    
    t_str = re.sub(control,' ',t_str)
    t_str = unicodedata.normalize("NFKD",t_str)
    t_str = re.sub('[\uE000-\uF8B6\uF8C1-\uF8E4]+',' ',t_str)
    t_str = html.unescape(t_str)
    
    return t_str

def pre_process(text,return_str=False):
    text = text.lower()
    # do not drop stop words, it may contain some info
    # Remove lemmatization
    text = list(map(lambda x:word_tokenize(x),text))
    # Remove stemmization
    stemmer = PorterStemmer()
    words = list(map(lambda x:stemmer.stem(x),text))
    print(words[:10])
    
    if return_str:
        return (' ').join(words)
    else:
        return words

In [5]:
raw = pd.read_csv("./data/bigquery_raw.csv")
raw.head()

Unnamed: 0.1,Unnamed: 0,DATE,THEMES,DocumentIdentifier
0,0,20190101060000,EDUCATION;SOC_POINTSOFINTEREST;SOC_POINTSOFINT...,https://www.daijiworld.com/chan/exclusiveDispl...
1,1,20190101061500,TAX_FNCACT;TAX_FNCACT_MAN;ARREST;SOC_GENERALCR...,https://caymannewsservice.com/2018/12/
2,2,20190101063000,TAX_FNCACT;TAX_FNCACT_LEADER;ENV_NUCLEARPOWER;...,https://www.vesti.bg/tehnologii/bil-gejts-sash...
3,3,20190101061500,ENV_GREEN;WB_507_ENERGY_AND_EXTRACTIVES;WB_525...,https://www.ajc.com/business/economy/georgia-p...
4,4,20190101061500,ENV_GREEN;WB_507_ENERGY_AND_EXTRACTIVES;WB_525...,https://pv-magazine-usa.com/2018/12/18/breakin...


In [10]:
# check null value
if raw.isnull().values.any():
    raw = raw.dropna()

In [15]:
# scrape news from website by calling get_text module
eg = get_text(raw.DocumentIdentifier[0])
eg[:100]


'January 1, 2019\n\nAbout a month and a half ago, I attended the Global Energy Forum at Stanford Univer'

In [25]:
lang = detect_lang(eg)
lang

language is en


'en'

In [23]:
# Text summarization
# give a summary of the article
summary = summarize(eg)
summary

'The forum addressed one of the most pressing issues of our lifetimes - global energy and climate change.\nIndia’s development will undoubtedly be fuelled by an increase in energy consumption, but this economic development belies a growing problem - climate change caused by CO2 emissions.\nThis includes increase in Earth’s mean surface temperature (also known as global warming), rise in sea level and acidification, extreme weather events, and so on.\nWhile the increase in global temperature and loss of polar ice has been strongly linked to anthropogenic activities (particularly CO2 emissions), there is no consensus among researchers about the link between extreme weather events such as forest fires, cyclones, droughts etc and anthropogenic causes.\nNow, concerning the timeline, it is expected that global temperatures will increase by over 2 degrees C by 2040 if emissions continue as before, well within our lifetimes for most of us reading this article.\nMillions of Indians live off the

In [26]:
# translate
if lang != 'en':
    eg = get_translated_text(eg)
    

In [49]:
# data preprocess
clean_text = pre_process(eg,return_str=False)
clean_text


['january', '1', '2019', 'about', 'a', 'month', 'and', 'a', 'half', 'ago']

In [None]:
# download whole news and events
texts = list(map(lambda x:get_text(x),raw.DocumentIdentifier))
# save as pickle
with open('data/news.pickle', 'wb') as handle:
    pickle.dump(texts, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Modularize

In [None]:
class ProcessPipeline:
    def __init__(self, texts):
        '''
        texts should be a list of texts
        '''
        self.texts = texts
    
    def get_text(self, url):
        """
        Func: 1. get raw text from url 2. get summary & keyword from text
            Input: url, a link to article
            Output: dictionary contains 3 keys, text, summary & keywords
        """
        try:
            article = Article(url)
            article.download()

            ### parse html file
            article.parse()
            text = article.text

            return text
        except:
            print(f'fail to download news from {url}')
            return None
    
    def detect_lang(self,text):
        ### translate to english
        try:
            language = detect(text)
            print("language is {}".format(language))
        except:
            print("fail to detect language")
            language = "other"
        return language

    def get_translated_text(self, text):
        def get_text_translate(text):
            if 'TRANSLATOR_TEXT_KEY' in os.environ:
                subscriptionKey = os.environ['TRANSLATOR_TEXT_KEY']
            else:
                print('Environment variable for TRANSLATOR_TEXT_KEY is not set.')
                #exit()
            # If you want to set your subscription key as a string, uncomment the line
            # below and add your subscription key.
            subscriptionKey = "331f7ace25a849639d0d319181758dff"

            base_url = 'https://api.cognitive.microsofttranslator.com'
            path = '/translate?api-version=3.0'
            params = '&to=en'
            constructed_url = base_url + path + params

            headers = {
                'Ocp-Apim-Subscription-Key': subscriptionKey,
                'Content-type': 'application/json',
                'X-ClientTraceId': str(uuid.uuid4())}
            return constructed_url, headers
        
        constructed_url, headers = get_text_translate(text)
        body = [{'text': text}]
        request = requests.post(constructed_url, headers=headers, json=body)
        response = request.json()
        #return response
        return response[0]["translations"][0]["text"]
    
    def summarize(self, string, **kwargs):
        """
        kwargs:
            1, ratio (float, optional) – Number between 0 and 1 that determines the proportion of the number of sentences 
               of the original text to be chosen for the summary.
            2, word_count (int or None, optional) – Determines how many words will the output contain. 
               If both parameters are provided, the ratio will be ignored.
            3, split (bool, optional) – If True, list of sentences will be returned. 
               Otherwise joined strings will bwe returned.
        """
        try:
            summarized = gensim_summarize(string,**kwargs)
        except:
            return string
        return summarized
    
    def pre_process(self, text, return_str=False):
        text = text.lower()
        # do not drop stop words, it may contain some info
        # Remove lemmatization
        text = list(map(lambda x:word_tokenize(x),text))
        # Remove stemmization
        stemmer = PorterStemmer()
        words = list(map(lambda x:stemmer.stem(x),text))
        print(words[:10])

        if return_str:
            return (' ').join(words)
        else:
            return words
    
    def process(self, text):
        eg = self.get_text(raw.DocumentIdentifier[0])
        lang = self.detect_lang(eg)
        if lang!='en':
            self.get_translated_text(eg)
        clean_text = pre_process(eg,return_str=False)
        return clean_text
        
    def run(self, return_str=False,workers=6):
        # multiprocess to speed up
        with ProcessPoolExecutor(max_workers=workers) as executor:
            if return_str:
                texts = executor.map(self.process, self.texts,[True]*len(self.texts))     
            else:
                texts = executor.map(self.process, self.texts)
        return list(texts)    


### done