In [1]:

import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def retrieve_docs_and_clean():

  r = requests.get('https://sports.ndtv.com/fifa-world-cup-2022/news')
  soup = BeautifulSoup(r.content, 'html.parser')

  #THE FOLLOWING CODE NEED TO BE MODIFIED TO SUITE FOR THE ABOVE URL
  link = []
  for i in soup.find('div', {'class':"lst-pg_hd"}).find_all('a', {'class':"lst-pg_ttl"}):
    i['href'] = "https://sports.ndtv.com/" + i['href'] + '?page=all'
    link.append(i['href'])

  

  # Retrieve Paragraphs
  documents = []
  for i in link:
      r = requests.get(i)
      soup = BeautifulSoup(r.content, 'html.parser')

      sen = []
      for i in soup.find('div', {'class':'sp-cn pg-str-com js-ad-section'}).find_all('p'):
          sen.append(i.text)
      documents.append(' '.join(sen))

  # Clean Paragraphs
  documents_clean = []
  for d in documents:
      document_test = re.sub(r'[^\x00-\x7F]+', ' ', d)
      document_test = re.sub(r'@\w+', '', document_test)
      document_test = document_test.lower()
      document_test = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', document_test)
      document_test = re.sub(r'[0-9]', '', document_test)
      document_test = re.sub(r'\s{2,}', ' ', document_test)
      documents_clean.append(document_test)

  return documents_clean

In [2]:
docs = retrieve_docs_and_clean()
# Create Term-Document Matrix with TF-IDF weighting
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Create a DataFrame
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
df.head()
     

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
ability,0.0,0.0,0.0,0.0,0.0,0.076148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063235,0.0,0.0,0.0
about,0.0,0.026346,0.0,0.0,0.0,0.043201,0.017578,0.0,0.0,0.0,0.0,0.0,0.0,0.04915,0.0,0.049172,0.0,0.0
above,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024829,0.0
absent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
accepting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:

def get_similar_articles(q, df):
  print("query:", q)
  print("The following are articles with the highest cosine similarity values: ")
  q = [q]
  q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
  sim = {}
  for i in range(10):
    sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
  
  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
  
  for k, v in sim_sorted:
    if v != 0.0:
      print("Similarity Values:", v)
      print(docs[k])
      print()


q1 = 'barcelona'
q2 = 'spain'
q3 = 'argentina'

get_similar_articles(q1, df)
print('-'*100)
get_similar_articles(q2, df)
print('-'*100)
get_similar_articles(q3, df)

query: barcelona
The following are articles with the highest cosine similarity values: 
----------------------------------------------------------------------------------------------------
query: spain
The following are articles with the highest cosine similarity values: 
Similarity Values: 0.14731142669842734

Similarity Values: 0.020450677778402003
neymar could make his return to the world cup stage on monday as brazil continue their bid to be crowned kings for a record extending sixth time against south korea superstar forward neymar has been absent for brazil since spraining his ankle in his team s opening group g win against serbia last month and their supporters have been sweating on his fitness ever since coach tite said neymar would be assessed in brazil s final pre game training session on sunday but gave a heavy hint that the paris saint germain attacker would start window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai a

In [4]:
from gensim.summarization.bm25 import BM25

def bm25_ranking(query):
  docs_tokens = [s.split() for s in docs]
  bm25 = BM25(docs_tokens)
  qry = query.split()
  scores = bm25.get_scores(qry, average_idf=20)

  best = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:3]
  for i,b in enumerate(best):
    print(f"Rank {i+1}: {docs[b]}")


In [5]:

bm25_ranking('argentina')

Rank 1: lionel messi produced a moment of trademark quality to score the opener in a win over australia and help send argentina into the quarter finals of the world cup on saturday messi s th goal in his th career appearance but first in the knockout rounds of the world cup helped set up a last eight clash with the unbeaten netherlands on friday julian alvarez scored the other in the th minute after some calamitous australian defending before enzo fernandez s th minute deflected own goal set up an unexpectedly nervy finale window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js after the game a tv presenter showed messi a video of his wife antonela roccuzzo and children thiago mateo and ciro celebrating his first half goal his reaction was priceless watch lionel messi s priceless reaction on seeing wife kids celebrate his goa

In [7]:

bm25_ranking('messi')
     

Rank 1: lionel messi produced a moment of trademark quality to score the opener in a win over australia and help send argentina into the quarter finals of the world cup on saturday messi s th goal in his th career appearance but first in the knockout rounds of the world cup helped set up a last eight clash with the unbeaten netherlands on friday julian alvarez scored the other in the th minute after some calamitous australian defending before enzo fernandez s th minute deflected own goal set up an unexpectedly nervy finale window rrcode window rrcode rrcode push function function v d o ai ai d createelement script ai defer true ai async true ai src v location protocol o d head appendchild ai window document a vdo ai core v ndtv vdo ai js after the game a tv presenter showed messi a video of his wife antonela roccuzzo and children thiago mateo and ciro celebrating his first half goal his reaction was priceless watch lionel messi s priceless reaction on seeing wife kids celebrate his goa