In [151]:
# default_exp core

# WorldWideNewsMap

> Fetches the news all around the world

In [152]:
#hide
from nbdev.showdoc import *

# Article baseclass definition

In [153]:
#export
from pydantic.dataclasses import dataclass
from pydantic import HttpUrl
from datetime import datetime

@dataclass
class Article:
    title: str
    summary: str
    text:str
    publication_date: datetime
    source:str
    url: HttpUrl
    img_url: HttpUrl
    tags:list
    region:str
    language:str

In [154]:
import pandas as pd


In [155]:
from GoogleNews import GoogleNews

# Translating the keywords into all languages

In [156]:
import asyncio
from async_google_trans_new import AsyncTranslator
from unsync import unsync
import aiohttp
import backoff

@unsync
@backoff.on_exception(backoff.expo, aiohttp.ClientError, max_time=60)
async def translate_keywords(input_keywords:list,target_language:str,input_language:str):
    print('hi')
    translator = AsyncTranslator(url_suffix='com')
    tasks = [translator.translate(keyword,lang_tgt=target_language,lang_src=input_language) for keyword in input_keywords]
    return await asyncio.gather(*tasks)

 Translate some keywords to multiple languages

In [157]:
from pydantic import BaseModel,ComputedField,computed_field,PrivateAttr
from functools import lru_cache
from funcy import cached_property,cached_readonly


def list_to_tuple(function):
    def wrapper(*args):
        args = [tuple(x) if type(x) == list else x for x in args]
        result = function(*args)
        result = tuple(result) if type(result) == list else result
        return result
    return wrapper



class KeywordsPerMarket(BaseModel):
    input_keywords:list 
    region:str 
    language:str
    market = ComputedField(lambda self:f"{self.language}-{self.region}")
    input_language:str = 'en'
    _translated_keywords = PrivateAttr(None)
    
    @computed_field
    @property
    def keywords(self):
        self._translated_keywords = self._translated_keywords or translate_keywords(self.input_keywords,self.language,self.input_language).result()
        return self._translated_keywords

    @keywords.setter
    def keywords(self,keywords):
        self._translated_keywords = keywords




### document the error with cached_property and open issue on pydantic 

In [158]:
from pydantic import BaseModel,ComputedField,computed_field
from funcy import cached_property,cached_readonly


def list_to_tuple(function):
    def wrapper(*args):
        args = [tuple(x) if type(x) == list else x for x in args]
        result = function(*args)
        result = tuple(result) if type(result) == list else result
        return result
    return wrapper



class Translation(BaseModel):
    input:list 
    meta:str 
    
    @computed_field
    @cached_property
    def result(self):
        self._result = some_expensive_async_func(input=self.input).result()
        return self._result

## Fetch and cache the keywords, reuse for regions with the same language

In [159]:
keywords = ['aviation','airlines','flight','airplane']
languages = ['es','nl','de','fr']

# keywords_per_market = [KeywordsPerMarket(input_keywords=keywords,language=language,region='') for language in languages]

In [160]:
kpm = KeywordsPerMarket(input_keywords=keywords,language=languages[0],region='')

In [161]:
kpm.dict()

hi


{'input_keywords': ['aviation', 'airlines', 'flight', 'airplane'],
 'region': '',
 'language': 'es',
 'input_language': 'en',
 'market': 'es-',
 'keywords': ['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']}

In [162]:
kpm.keywords

['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']

In [163]:
kpm2 = KeywordsPerMarket(**kpm.dict())

In [164]:
kpm2.keywords = kpm.keywords

In [165]:
kpm2.keywords

['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']

In [166]:
kpm2.dict()

{'input_keywords': ['aviation', 'airlines', 'flight', 'airplane'],
 'region': '',
 'language': 'es',
 'input_language': 'en',
 'market': 'es-',
 'keywords': ['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']}

In [167]:
keywords_per_market

[KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='ET', language='am', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='EG', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='SA', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='DZ', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='IQ', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='SD', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airplane'], region='MA', language='ar', input_language='en'),
 KeywordsPerMarket(input_keywords=['aviation', 'airlines', 'flight', 'airpla

In [168]:
KeywordsPerMarket(**{'input_keywords': ['aviation', 'airlines', 'flight', 'airplane'],
  'region': '',
  'language': 'es',
  'input_language': 'en',
  'keywords': ['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']}).keywords

hi


['aviación ', 'aerolíneas ', 'vuelo ', 'avión ']

In [169]:
import pandas as pd
keywords = []
pd.DataFrame([KeywordsPerMarket(input_keywords=keywords,language=language,region='').dict() for language in languages])

hi
hi
hi
hi


Unnamed: 0,input_keywords,region,language,input_language,market,keywords
0,[],,es,en,es-,[]
1,[],,nl,en,nl-,[]
2,[],,de,en,de-,[]
3,[],,fr,en,fr-,[]


In [170]:
# filter on language which are spoken by at least million people

(pd.read_json(countries)
.pipe(lambda f:f.sort_values(by='Population',ascending=False))
.pipe(lambda df: df["Alpha_2"].apply(lambda iso_alpha_2:get_global("territory_languages").get(iso_alpha_2, {}).copy()))
)

NameError: name 'countries' is not defined

In [None]:
from babel.core import get_global
import pycountry
import pandas as pd 
import pkgutil
from async_google_trans_new.constant import LANGUAGES as google_trans_languages
language_mapper = {'zh':'zh-cn'}

def country_language_population(min_speakers=10e6):
    countries = pkgutil.get_data('pypopulation', 'resources/countries.json')
    df = pd.read_json(countries)
    df["Language"] = df["Alpha_2"].apply(lambda iso_alpha_2:get_global("territory_languages").get(iso_alpha_2, {}).copy())
    df['N_Speakers'] = df.apply(lambda row:[(v.get('population_percent',0)/100)*row.Population for k,v in row.Language.items()],axis=1)
    df = df.explode(['Language','N_Speakers']).sort_values(by='N_Speakers',ascending=False)
    df['Language'] = df['Language'].apply(lambda language :language_mapper.get(language) if language not in list(google_trans_languages.keys()) else language)
    df = df.dropna()
    df = df[df['N_Speakers'] > min_speakers]
    return df
    

def get_markets():
    df = country_language_population()
    return (df['Alpha_2']+'-'+df['Language']).tolist()

df = country_language_population()

In [None]:
df

Unnamed: 0,Country,Alpha_2,Alpha_3,Population,Language,N_Speakers
40,China,CN,CHN,1397715000,zh-cn,1257943500.0
87,India,IN,IND,1366417754,hi,560231279.14
205,United States,US,USA,328239523,en,315109942.08
87,India,IN,IND,1366417754,en,259619373.26
147,Pakistan,PK,PAK,216565318,ur,205737052.1
...,...,...,...,...,...,...
74,Greece,GR,GRC,10716322,el,10609158.78
179,South Africa,ZA,ZAF,58558270,xh,10540488.6
196,Tunisia,TN,TUN,11694719,ar,10525247.1
50,Czech Republic,CZ,CZE,10669709,cs,10456314.82


In [None]:
language_regions = df.groupby('Language').agg('Alpha_2').unique().apply(list).to_dict()

In [None]:
list(language_regions.keys())

In [None]:
keywords_per_market = []
for language,regions in language_regions.items():
    kpm = KeywordsPerMarket(
        input_keywords=keywords,
        region = '',
        language = language)
    for region in regions:
        _kpm = KeywordsPerMarket(**kpm.dict())
        _kpm.keywords = kpm.keywords
        _kpm.region = region
        keywords_per_market.append(_kpm)



In [None]:
keywords_per_market

In [None]:
keywords_per_market[0].dict()

{'input_keywords': ['aviation', 'airlines', 'flight', 'airplane'],
 'region': 'ET',
 'language': 'am',
 'input_language': 'en',
 'market': 'am-ET',
 'keywords': ['አቪዬሽን ', 'አየር መንገድ ', 'በረራ ', 'አውሮፕላን ']}

In [None]:
languages = df['Language'].unique().tolist()

## Use the KeywordsPerMarket as input for GoogleNews

In [None]:
kpm = keywords_per_market[1]

In [None]:
from datetime import datetime
class GoogleNewsArticle(BaseModel):
    title:str
    region:str
    language:str
    img:str
    link:str
    site:str
    date:str

In [None]:
from GoogleNews import GoogleNews
from typing import List

@unsync
def parse_article(result:dict,article_validator:BaseModel)->BaseModel:
    try: return article_validator(**result)
    except Exception as e:pass

def google_news_fetcher(keywords:KeywordsPerMarket)->List[GoogleNewsArticle]:
    googlenews = GoogleNews(lang=keywords.language,region=keywords.region)
    googlenews.get_news(' '.join(keywords.keywords))
    results = googlenews.results()
    if results !=[]:
        tasks = [parse_article({**result,**dict(language=keywords.language,region=keywords.region)},GoogleNewsArticle) for result in results]
        return [task.result() for task in tasks]


In [171]:
google_news_fetcher(kpm)

[GoogleNewsArticle(title='COVID-19: ¿Qué tan seguro es volar en plena pandemia?', region='', language='es', img='https://lh3.googleusercontent.com/proxy/CEpfu87DwIG-adytJRc9BMXI-Z5SHtVo1GHqjiqFU7lpENj7_utKy-6I-FAlMaBXSZuJvunG90VXFLfPMWzS_q0Dtf-ckjI=s0-w100-h100-p-df', link='news.google.com/./articles/CAIiEAcbUZy_Y7LCKtrz2mJpb6UqGQgEKhAIACoHCAowwfehCTDkrpcCMJKcvwM?hl=es-419&gl=US&ceid=US%3Aes-419', site='DW (Español)', date='22 abr.'),
 GoogleNewsArticle(title='Confirmado: Nuevo vuelo de Aerolíneas a Rusia en busca de vacunas Sputnik V', region='', language='es', img='https://lh3.googleusercontent.com/proxy/Q412QK1hRHZbjdTEtRrVegRRTM70KzVMiLCIuimmPl4y5rKvNYWfcGtnfhopa3k74FBjYDSymdf2utJysE1PDsgLl09alqoZiv3p0K4Kk1TVFBxzp5mLAZFqLdmj6B-iQnCVMebFftvUIaLMJMdTCcvOB-CQcYTo6dQ3fHPEI6hjTgg438zZRQom0jnjVDzkh_rhf2U=s0-w100-h100-p-df', link='news.google.com/./articles/CBMicGh0dHBzOi8vd3d3LmF2aWFjaW9ubGluZS5jb20vMjAyMS8wMy9jb25maXJtYWRvLW51ZXZvLXZ1ZWxvLWRlLWFlcm9saW5lYXMtYS1ydXNpYS1lbi1idXNjYS1kZS12Y

In [172]:
google_news_articles = [google_news_fetcher(kpm) for kpm in keywords_per_market]

In [None]:
google_news_articles

In [None]:
[article for article in google_news_articles]

import requests
news_df['link'] =  news_df['link'].apply(lambda url: requests.get(url).url)

## Process all the news articles

* Validate and resolve all the urls
* download and parse the complete article using newspaper3k
* parse the datetime
* translate the title and the full text to the target language (async)
* summarize the full text body

In [113]:
import numpy as np

google_news_article  = np.random.choice(google_news_articles)

In [148]:
import pandas as pd 

pd.DataFrame([article.dict() for article in google_news_articles if article is not None])

Unnamed: 0,title,region,language,img,link,site,date
0,81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية و...,EG,ar,https://lh3.googleusercontent.com/proxy/oPL7ET...,news.google.com/./articles/CBMi8wFodHRwczovL3d...,الاصلاح الدستوري,قبل يومين (2)
1,81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية و...,EG,ar,https://lh3.googleusercontent.com/proxy/oPL7ET...,news.google.com/./articles/CBMi8wFodHRwczovL3d...,الاصلاح الدستوري,قبل يومين (2)
2,81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية و...,EG,ar,https://lh3.googleusercontent.com/proxy/oPL7ET...,news.google.com/./articles/CBMi8wFodHRwczovL3d...,الاصلاح الدستوري,قبل يومين (2)
3,81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية و...,EG,ar,https://lh3.googleusercontent.com/proxy/oPL7ET...,news.google.com/./articles/CBMi8wFodHRwczovL3d...,الاصلاح الدستوري,قبل يومين (2)
4,81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية و...,EG,ar,https://lh3.googleusercontent.com/proxy/oPL7ET...,news.google.com/./articles/CBMi8wFodHRwczovL3d...,الاصلاح الدستوري,قبل يومين (2)
...,...,...,...,...,...,...,...
11344,ديور تطلق أحدث عطورها حصرياً لمطار حمد,EG,ar,https://lh3.googleusercontent.com/proxy/JiFrCK...,news.google.com/./articles/CBMi8AFodHRwczovL20...,الشرق,5 فبراير
11345,ديور تطلق أحدث عطورها حصرياً لمطار حمد,EG,ar,https://lh3.googleusercontent.com/proxy/JiFrCK...,news.google.com/./articles/CBMi8AFodHRwczovL20...,الشرق,5 فبراير
11346,ديور تطلق أحدث عطورها حصرياً لمطار حمد,EG,ar,https://lh3.googleusercontent.com/proxy/JiFrCK...,news.google.com/./articles/CBMi8AFodHRwczovL20...,الشرق,5 فبراير
11347,ديور تطلق أحدث عطورها حصرياً لمطار حمد,EG,ar,https://lh3.googleusercontent.com/proxy/JiFrCK...,news.google.com/./articles/CBMi8AFodHRwczovL20...,الشرق,5 فبراير


In [None]:
googlenews.results(sort=True)

In [70]:
googlenews.results()[0].keys()

dict_keys(['title', 'desc', 'date', 'datetime', 'link', 'img', 'media', 'site'])

In [69]:
googlenews.results()

[{'title': '81 رحلة مصر للطيران تعلن عدد رحلاتها الدولية والداخلية',
  'desc': 'bookmark_border',
  'date': 'قبل 56 دقيقة',
  'datetime': None,
  'link': 'news.google.com/./articles/CBMi8wFodHRwczovL3d3dy5yZWZlcmVuZHVtLXR1bmlzaWUub3JnL25ld3MvMTU0NzM0LzgxLSVEOCVCMSVEOCVBRCVEOSU4NCVEOCVBOS0lRDklODUlRDglQjUlRDglQjEtJUQ5JTg0JUQ5JTg0JUQ4JUI3JUQ5JThBJUQ4JUIxJUQ4JUE3JUQ5JTg2LSVEOCVBQSVEOCVCOSVEOSU4NCVEOSU4Ni0lRDglQjklRDglQUYlRDglQUYtJUQ4JUIxJUQ4JUFEJUQ5JTg0JUQ4JUE3JUQ4JUFBJUQ5JTg3JUQ4JUE3LSVEOCVBNyVEOSU4NCVEOCVBRi_SAQA?hl=ar&gl=EG&ceid=EG%3Aar',
  'img': 'https://lh4.googleusercontent.com/proxy/oPL7ETcfXN1pr9c4zFr2ULKDr1d7t1XLzt-a1AcbawiusDiypnRn0Up4yP3_1bN-CGmr8T_MuMZw-TgiNhGMx_nki6eQo1tRVsz7sPPCfK1NzmGOU-i5-8mI=-p-df-h100-w100',
  'media': None,
  'site': 'الاصلاح الدستوري'},
 {'title': 'القطرية تشغّل أول رحلة في العالم تحمل مسافرين وطاقم طيران تلقوا لقاح كورونا',
  'desc': 'bookmark_border',
  'date': '6 أبريل',
  'datetime': None,
  'link': 'news.google.com/./articles/CBMiqgNodHRwczovL20u

In [67]:
import pandas as pd
news_df = pd.DataFrame(googlenews.results(sort=True))
# news_df['link'] = 'http://'+news_df['link']
# news_df = news_df[['title','datetime','link','img','site']]
news_df.sample(5)

'<' not supported between instances of 'NoneType' and 'NoneType'


Unnamed: 0,title,desc,date,datetime,link,img,media,site
26,القطرية تزيد رحلاتها إلى مطار شيكاغو أوهير 3 ي...,bookmark_border,26 يونيو,,news.google.com/./articles/CAIiEL3wZsSI1PdPNx7...,https://lh4.googleusercontent.com/proxy/Yc7NJl...,,Al Sharq
29,القطرية: 140 وجهة حول العالم قبل نهاية يوليو,bookmark_border,20 يونيو,,news.google.com/./articles/CAIiEJ_PV26rKNjTWb-...,https://lh5.googleusercontent.com/proxy/o9JhcT...,,Al Sharq
75,القطرية تسير رحلة طيران استثنائية بطاقم نسائي ...,bookmark_border,18‏/10‏/2020,,news.google.com/./articles/CBMiswJodHRwczovL20...,https://lh4.googleusercontent.com/proxy/13ECdi...,,الشرق
12,القطرية: 1200 رحلة أسبوعياً إلى 140 وجهة بحلول...,bookmark_border,15 أبريل,,news.google.com/./articles/CBMisAJodHRwczovL20...,https://lh4.googleusercontent.com/proxy/zbjjR8...,,الشرق
58,هل سيتم تطبيقه عربياً؟ .. كل ما تريد أن تعرفه ...,bookmark_border,2 يونيو,,news.google.com/./articles/CBMizQJodHRwczovL20...,https://lh3.googleusercontent.com/proxy/AXTqIc...,,الشرق


In [110]:
import pandas as pd
keywords = ['aviation','airlines','flight','airplane']
Keywords = [KeywordsPerMarket(input_keywords=keywords,language=language,region='') for language in languages]
#keywords_df = pd.DataFrame([KeywordsPerMarket(input_keywords=keywords,language=language,region='').dict() for language in languages])

In [38]:

class WorldWideNews:
    """
    
    """
    def __init__(self,keywords:list[str]},regions:list,output_language:str='en') -> None:
        self.keywords = keywords
        self.regions = regions
        self.output_language = output_language

    def translate_keywords(self)-> List[KeywordsPerMarket]:
        
        

    def fetch_articles(self):


IndentationError: expected an indented block (2346431134.py, line 13)