<a href="https://colab.research.google.com/github/igseg/google_query/blob/master/google_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pygooglenews

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygooglenews
  Downloading pygooglenews-0.1.2-py3-none-any.whl (10 kB)
Collecting beautifulsoup4<5.0.0,>=4.9.1
  Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 4.7 MB/s 
[?25hCollecting requests<3.0.0,>=2.24.0
  Downloading requests-2.28.1-py3-none-any.whl (62 kB)
[K     |████████████████████████████████| 62 kB 1.5 MB/s 
[?25hCollecting dateparser<0.8.0,>=0.7.6
  Downloading dateparser-0.7.6-py2.py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 32.3 MB/s 
[?25hCollecting feedparser<6.0.0,>=5.2.1
  Downloading feedparser-5.2.1.zip (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 52.0 MB/s 
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)
Building wheels for collected packages: feedparser
  Building wheel for feedparser (setup.py) ... 

In [None]:
from pygooglenews import GoogleNews
import unicodedata
import re

In [None]:

def search(query, language = 'es', country = 'ES', date = '7d'):
  '''
  Run a Google News search and return its entries as plain dictionaries.

  query    -- the search terms; they are wrapped in single quotes inside the
              request string.
  language -- language code handed to GoogleNews as `lang` ('en' for English).
  country  -- country code handed to GoogleNews ('US' for United States).
  date     -- maximum age of an entry, appended to the request as a `when:`
              filter ('7d' keeps entries that are at most one week old).

  Returns a list with one dict per entry holding its 'title', 'link' and
  'summary', plus a 'score' key initialised to 0.
  '''
  # TODO (original author's note): add a parameter for the maximum number of
  # results; the note says the limit is currently 100.
  client = GoogleNews(lang = language, country = country)
  feed   = client.search(f" '{query}' when:{date} ")

  # Per the original notes, an entry also exposes 'title_detail', 'links',
  # 'id', 'guidislink', 'published', 'published_parsed', 'summary_detail',
  # 'source' and 'sub_articles'; only title, link and summary are kept.
  return [
      {
          'title'   : item.title,
          'link'    : item.link,
          'summary' : item.summary,
          'score'   : 0
      }
      for item in feed['entries']
  ]

def remove_accents(input_str):
    '''
    Strip accent marks from input_str.

    The text is NFKD-normalised and then encoded to ASCII, ignoring every
    character that cannot be encoded.

    Returns the result as bytes (not str); callers must decode it.
    '''
    return unicodedata.normalize('NFKD', input_str).encode('ASCII', 'ignore')

def string_treatment(string):
  '''
  Clean a text and split it into a list of words.

  Accent marks are removed, every run of non-word characters is collapsed
  into a single blank, the text is lowercased and finally split on blanks.

  Note: because the split is on single blanks, a text that starts or ends
  with a non-word character yields an empty-string token at that end.
  '''
  ascii_text = remove_accents(string).decode("utf-8")   # remove_accents returns bytes
  spaced     = re.sub(r'\W+', ' ', ascii_text)          # non-word runs become one blank
  return spaced.lower().split(' ')

def computeTF(wordDict, bagOfWords):
    '''
    Compute the term frequency of every word of one document.

    wordDict   -- mapping word -> number of occurrences in the document.
    bagOfWords -- the document's list of words; only its length is used,
                  as the denominator of every frequency.

    Returns a dict mapping each word of wordDict to count / len(bagOfWords).
    When bagOfWords is empty every frequency is 0.0 (instead of raising
    ZeroDivisionError).
    '''
    total = len(bagOfWords)
    if total == 0:
        # An empty document has no terms: report zero frequency everywhere.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(total) for word, count in wordDict.items()}

def computeIDF(documents):
  '''
  Compute the inverse document frequency of every word.

  documents -- list of dicts, one per document, mapping word -> number of
               occurrences of that word in the document.

  Returns a dict mapping each word to log(N / df), where N is the number of
  documents and df the number of documents whose count for the word is > 0.

  Edge cases:
    * an empty list of documents returns an empty dict;
    * a word that appears (count > 0) in no document gets an idf of 0.0,
      so it contributes nothing to a tf-idf score;
    * words are collected from every document, not only from the first one.
  '''
  import math  # kept local: the notebook's import cell does not import math

  N = len(documents)
  if N == 0:
    return {}

  # Document frequency: in how many documents each word has a positive count.
  docFrequency = {}
  for document in documents:
    for word, val in document.items():
      docFrequency[word] = docFrequency.get(word, 0) + (1 if val > 0 else 0)

  return {
    word: (math.log(N / float(df)) if df > 0 else 0.0)
    for word, df in docFrequency.items()
  }

def computeTFIDF(tfBagOfWords, idfs):
  '''
  Combine term frequencies with inverse document frequencies.

  tfBagOfWords -- mapping word -> term frequency for one document.
  idfs         -- mapping word -> inverse document frequency; it must contain
                  every word of tfBagOfWords (a missing word raises KeyError).

  Returns a dict mapping each word of tfBagOfWords to tf * idf.
  '''
  return {word: tf * idfs[word] for word, tf in tfBagOfWords.items()}

def scoring_function(keywords, tf_idf):
  '''
  Add up the tf-idf weights of the given keywords.

  keywords -- iterable of words to look up.
  tf_idf   -- mapping word -> tf-idf weight for one document.

  Keywords that are not present in tf_idf contribute nothing. Returns the
  sum of the weights found (0 when none is found).
  '''
  return sum(tf_idf[keyword] for keyword in keywords if keyword in tf_idf)

def calibrate_scores_tfidf(results,keywords):
  '''
  Score every result by the tf-idf weight of the keywords in its summary.

  results  -- list of dicts; each one needs a 'summary' string and gets its
              'score' key overwritten.
  keywords -- words whose tf-idf weights are summed to produce the score.

  Each summary is tokenised with string_treatment, tf-idf is calculated for
  every word of every summary over the vocabulary shared by all summaries,
  and the score of a result is the sum of the tf-idf of the keywords.

  Returns the same list object that was passed in; its dicts are updated in
  place. An empty list is returned unchanged.
  '''
  # A search may return no entries; there is nothing to score in that case.
  if not results:
    return results

  bag_of_words = [string_treatment(result['summary']) for result in results]

  # Vocabulary: every distinct token found in any summary.
  unique_words = set().union(*bag_of_words)

  # Per-document occurrence counts over the whole vocabulary.
  num_of_words = []
  for bag in bag_of_words:
    counts = dict.fromkeys(unique_words, 0)
    for word in bag:
      counts[word] += 1
    num_of_words.append(counts)

  tf   = [computeTF(counts, bag) for counts, bag in zip(num_of_words, bag_of_words)]
  idfs = computeIDF(num_of_words)

  for result, doc_tf in zip(results, tf):
    result['score'] = scoring_function(keywords, computeTFIDF(doc_tf, idfs))

  return results

In [None]:
# Keywords whose tf-idf weights are added up to score each result.
# This list has to be expanded through expert criteria.
# IMPORTANT: keywords in lowercase WITHOUT special characters
keywords = ['hidrogeno', 'verde', 'planta', 'proyecto']

# Example for just one query, implement loop for multiple searches
results = search('hidrogeno verde')

# Score every result, then order them from highest to lowest score.
scored_results = calibrate_scores_tfidf(results, keywords)
sorted_results = sorted(scored_results, key=lambda item: item['score'], reverse=True)

In [None]:
# Display the scored results, ordered from highest to lowest score.
sorted_results

[{'title': 'Haro presenta al programa Misiones Ciencia e Innovacion su proyecto para crear una planta de hidrógeno verde - Haro Digital',
  'link': 'https://harodigital.com/medioambiente/haro-presenta-al-programa-misiones-ciencia-e-innovacion-su-proyecto-para-crear-una-planta-de-hidrogeno-verde/',
  'summary': '<a href="https://harodigital.com/medioambiente/haro-presenta-al-programa-misiones-ciencia-e-innovacion-su-proyecto-para-crear-una-planta-de-hidrogeno-verde/" target="_blank">Haro presenta al programa Misiones Ciencia e Innovacion su proyecto para crear una planta de hidrógeno verde</a>&nbsp;&nbsp;<font color="#6f6f6f">Haro Digital</font>',
  'score': 0.2506119054097133},
 {'title': 'La Junta proyecta una planta de hidrógeno verde en Los Barrios (Cádiz) - El Periodico de la Energía',
  'link': 'https://elperiodicodelaenergia.com/junta-proyecta-planta-hidrogeno-verde-barrios-cadiz/',
  'summary': '<a href="https://elperiodicodelaenergia.com/junta-proyecta-planta-hidrogeno-verde-ba

In [None]:
# Tokenise the summary of every result.
bag_of_words = [string_treatment(item['summary']) for item in results]

# Vocabulary: start from the tokens of the first summary and add the rest.
unique_words = set(bag_of_words[0])
for tokens in bag_of_words[1:]:
  unique_words.update(tokens)

In [None]:
# Display the vocabulary: every distinct token found across all result summaries.
unique_words

{'',
 '0',
 '00031664984409461433521',
 '0003_202210g4p30991',
 '0004',
 '04',
 '05',
 '06',
 '07',
 '08',
 '1',
 '10',
 '104',
 '1045',
 '107074',
 '1087091',
 '11',
 '11981644',
 '11981685',
 '122',
 '13',
 '134',
 '134132923',
 '13474',
 '142883',
 '165gw',
 '180',
 '2',
 '200',
 '2000',
 '2022',
 '20221004',
 '20221006134558021895',
 '20221006162336',
 '20221007',
 '202210070333432264809',
 '20221010',
 '20221095047',
 '202290321',
 '2023',
 '2025',
 '2025_111665143857',
 '2030',
 '20minutos',
 '22',
 '2302031',
 '2302059',
 '250',
 '2534184',
 '2535379',
 '2874546',
 '2874723',
 '2874747',
 '30',
 '370859',
 '4',
 '5',
 '50',
 '5063793',
 '60',
 '606',
 '633ed899e4d4d839628b45e9',
 '65gw',
 '6f6f6f',
 '709679031_0',
 '76744951',
 '76994875',
 '83',
 '8561908',
 '9',
 '9012247',
 '944235',
 '945015',
 '_blank',
 'a',
 'abc',
 'abo',
 'abono',
 'acceso',
 'acelera',
 'aceleracion',
 'actual',
 'actualidad',
 'acuerda',
 'acuerdan',
 'acuerdos',
 'adapta',
 'adslzone',
 'aduanas',
 '

In [None]:
# Display the distinct tokens of the first result's summary.
set(bag_of_words[0])

{'',
 '10',
 '11981644',
 '22',
 '6f6f6f',
 '_blank',
 'a',
 'color',
 'de',
 'el',
 'eleconomista',
 'energia',
 'es',
 'font',
 'hidrogeno',
 'href',
 'html',
 'https',
 'javier',
 'nbsp',
 'noticias',
 'petroquimico',
 'polo',
 'repsol',
 'sancho',
 'target',
 'tarragona',
 'transformara',
 'verde',
 'www'}