# 1. Setup

## Import and read dataset

In [1]:
# link: https://drive.google.com/file/d/1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl/view?usp=sharing
!gdown --id 1ydGNBdRVloX9rtxsKrMSnUNFG43Qv1sl
!unzip news_corpus.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: news_corpus/NEWS_069403.txt  
  inflating: news_corpus/NEWS_001402.txt  
  inflating: news_corpus/NEWS_105164.txt  
  inflating: news_corpus/NEWS_108900.txt  
  inflating: news_corpus/NEWS_095239.txt  
  inflating: news_corpus/NEWS_130689.txt  
  inflating: news_corpus/NEWS_178360.txt  
  inflating: news_corpus/NEWS_042044.txt  
  inflating: news_corpus/NEWS_148066.txt  
  inflating: news_corpus/NEWS_094609.txt  
  inflating: news_corpus/NEWS_151009.txt  
  inflating: news_corpus/NEWS_079889.txt  
  inflating: news_corpus/NEWS_111269.txt  
  inflating: news_corpus/NEWS_128100.txt  
  inflating: news_corpus/NEWS_088832.txt  
  inflating: news_corpus/NEWS_129622.txt  
  inflating: news_corpus/NEWS_029152.txt  
  inflating: news_corpus/NEWS_032313.txt  
  inflating: news_corpus/NEWS_145860.txt  
  inflating: news_corpus/NEWS_042270.txt  
  inflating: news_corpus/NEWS_004225.txt  
  inflating: news_corpus/NEWS_17

## Define normalize text function and create vocab

In [2]:
# download vietnamese stopwords: https://raw.githubusercontent.com/stopwords/vietnamese-stopwords/master/vietnamese-stopwords.txt
!gdown --id 1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
!unzip vn_stopwords.zip

Downloading...
From: https://drive.google.com/uc?id=1W9zVRz--bHlbBXbCSmoWHBO_2Cs4EhPY
To: /content/vn_stopwords.zip
100% 6.89k/6.89k [00:00<00:00, 5.39MB/s]
Archive:  vn_stopwords.zip
  inflating: vietnamese-stopwords.txt  


In [3]:
import string
import os
from tqdm import tqdm
import numpy as np

def remove_punctuations(text: str) -> str:
  return text.translate(str.maketrans('', '', string.punctuation))

with open('vietnamese-stopwords.txt', 'r', encoding='utf8') as f:
  vn_stopwords = f.readlines()
def remove_stopwords(text: str) -> str:
  new_text = text
  for w in vn_stopwords:
    if w in new_text:
      new_text = new_text.replace(w, '')

  return new_text

def normalize_text(text: str) -> str:
  normalized_text = text.lower()
  normalized_text = remove_punctuations(normalized_text)
  normalized_text = remove_stopwords(normalized_text)

  return normalized_text

## Create similiarity measurement function using cosine similarity

$cosine\_similarity(a, b) = \frac{a ⋅ b}{|a||b|} = \frac{\sum_{i = 1}^{N}a_ib_i}{\sqrt{\sum_{i = 1}^{N}a_i^2}\sqrt{\sum_{i = 1}^{N}b_i^2}}$

In [4]:
def distance(a: np.ndarray, b: np.ndarray) -> np.ndarray:
  numerator = np.dot(a, b)
  denominator = np.linalg.norm(a) * np.linalg.norm(b)
  
  return numerator / denominator

## Create vectorize function using binary bag-of-words on a provided vocab

In [5]:
def vectorize(text: str, vocab: list) -> np.ndarray:
  normalized_text = normalize_text(text)
  vec = []
  for word in vocab:
    if word in normalized_text:
      vec.append(1)
    else:
      vec.append(0)
  
  return np.array(vec)

# 2. Building Text Retrieval system using Vector Space Model



## 2.1. Create vocab

In [6]:
doc_lists = []
vocab = []
dataset_root_path = 'news_corpus'
filenames = os.listdir(dataset_root_path)
for i in tqdm(range(len(filenames) // 20)):
  filename = filenames[i]
  filepath = os.path.join(dataset_root_path, filename)
  with open(filepath, 'r', encoding='utf8') as f:
    lines = f.readlines()
    title = lines[0].strip()
    article = ' '.join(lines[1:]).strip()
    article = normalize_text(article)
    if (title, article) not in doc_lists:
      doc_lists.append((title, article)) 
    tokens = article.split(' ')
    for token in tokens:
      if token not in vocab:
        vocab.append(token)

100%|██████████| 9226/9226 [04:26<00:00, 34.67it/s]


## 2.2. Create document-term matrix

In [7]:
term_document_matrix = {}
for (title, article) in tqdm(doc_lists):
  vec = vectorize(article, vocab)
  term_document_matrix[(title, article)] = vec

100%|██████████| 9225/9225 [19:05<00:00,  8.06it/s]


## 2.3. Ranking

In [8]:
def ranking(query: str, term_document_matrix: dict, print_top_10: bool = True) -> list:
  query_vec = query_vec = vectorize(query, vocab)
  rankings = []
  i = 1
  for doc_info, vec in tqdm(term_document_matrix.items()):
    score = distance(query_vec, vec)
    rankings.append((score, (doc_info[0])))
    i += 1
  rankings.sort(reverse=True)

  if print_top_10 == True:
    for rank in rankings[:10]:
      print(rank)

  return rankings

In [9]:
query = "điểm thi đại học"
rankings = ranking(query, term_document_matrix, True)

100%|██████████| 9225/9225 [00:06<00:00, 1504.19it/s]

(0.5636018619766345, 'Lịch quan sát 3 hiện tượng thiên văn sắp xuất hiện tại Việt Nam')
(0.4583492485141056, 'Những phát minh kỳ lạ nhất trong lịch sử nhân loại, dù chưa chắc hữu dụng nhưng tính sáng tạo thì vô biên')
(0.43876345447627835, 'Bò ‘đi dạo’ trên những cánh đồng muối bỏ hoang ở Nghệ An')
(0.412514323662695, "Nỗ lực 'tẩy xanh' của gã khổng lồ thời trang Shein")
(0.4117647058823529, 'Nữ thần nhảy xa 17 tuổi của Nga')
(0.4117647058823529, 'Hoa khôi nhảy xa 15 tuổi của Ukraine')
(0.3796283011826483, 'Video: Dự án tiếp nước cải tạo sông Tích 11 năm chưa hoàn thành')
(0.3796283011826483, 'Video: Cận cảnh cầy mangut đánh nát đầu rắn hổ mang')
(0.3796283011826483, 'Video: Căn nhà 4 mặt tiền chình ình giữa đường vành đai nghìn tỷ ở Hà Nội')
(0.37573457465108967, 'Ô tô bất ngờ phát nổ rồi bốc cháy dữ dội khi để giữa trời năng tại Trung Quốc')



