# Plan thực hiện
# Mô tả dữ liệu: 
- mỗi d_123 là một document 
# Cấu trúc lưu trữ để dể tính toán cho các bước sau:
# Luồng thực hiện
- Xử lý dữ liệu văn bản từ file cô cho thành dạng: { doc_id, num, wd_count, sentence }

In [167]:
%pip install pd numpy nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [168]:
import os
import pandas as pd
import re

class DocReader:
    '''
    Tiền xử lý dữ liệu
    Biến đỗi các từ trong tập dữ liệu DUC sang CSV để tiện xử lý. ở các bước tiếp theo
    '''
    def __init__(self, file_name):
        self.file_name = file_name

    def parse_sentence(self, line):
        match = re.match(r'<s docid="([^"]+)" num="(\d+)" wdcount="(\d+)"> (.*)</s>', line)
        if match:
            return {
                'docid': match.group(1),
                'num': int(match.group(2)),
                'wdcount': int(match.group(3)),
                'sentence': match.group(4)
            }
        return None

    def to_df(self):
        '''
        Duyệt tất các file trong thư mục data - sau đó parse senctence.
        Dữ liệu sẽ được lưu vào DataFrame
        '''
        data = []
        file_path = os.path.join(self.file_name)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line.startswith('<s docid='):
                        parsed = self.parse_sentence(line)
                        if parsed:
                            data.append(parsed)

        return pd.DataFrame(data)

# reader = DocReader('/workspaces/py_env_research/NLP/DUC_TEXT/train/d061j')
# result = reader.to_df()
# result.head()


In [169]:
class TextPreProcessing:
    def __init__(self, sentences):
        self.sentences = sentences

    def clean_text(self):
        cleaned_sentences = [re.sub(r'[^\w\s]', '', sentence) for sentence in self.sentences]
        return cleaned_sentences

In [170]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class SimilarityProcess:
  def __init__(self, sentences):
    self.sentences = sentences
    self.tf_idf_matrix = None
    self.init_tf_idf()
    
  def init_tf_idf(self):
    vectorizer = TfidfVectorizer()
    self.tf_idf_matrix = vectorizer.fit_transform(self.sentences)
  
  def sim(self, doc_index_1, doc_index_2):
    vector_doc_1 = self.tf_idf_matrix[doc_index_1]
    vector_doc_2 = self.tf_idf_matrix[doc_index_2]
    
    cosine_sim = cosine_similarity(vector_doc_1, vector_doc_2)
    
    return cosine_sim[0][0]

sentences = [
  "wondered why the President of the Un",
  "wondered why the President of the Un",
  "committee that President Bush's transition.",
  "which Senate committees may assess presidential nominees"
]

sim_proc = SimilarityProcess(sentences)
sim = sim_proc.sim(0, 1)
print("Cosine Similarity between doc 0 and doc 1:", sim)


Cosine Similarity between doc 0 and doc 1: 1.0


# Algorithm: PageRank for Undirected Graph
1. Chuyen matran thanh matran kieu numpy de thuc hien cac buoc tinh toan de dang hon như shape, sum,....
2. Tim bac cua tung node.
4. Luu y:
- Co mot so cong thuc no dung ma tran chuyen vi graph.T may cai do dung cho do thi co huong thoi:
  + Vi sao?
5. Tai sao?
  - numpy.dot? Thay vi loop de kiem tra dieu kien - dinh co lien ket voi dinh khac de tinh pr - khi ham nay se nhan 2 ma tran lai voi nhau - cai nao co chi so (1) thi co value, cai nao khogn co chi so (0) thi khong co value
  - dot([[0, 1, 0],[1, 0, 1],[0, 1, 0]], [[0, 1/2, 0],[1/3, 0, 1/2],[0, 1/2, 0]])
  

In [171]:

'''
  Phần này là tính toan PageRank cho ma trận kề đã tạo ở bước trước.
  - Sử dụng ma trận kề đã tạo để tính toan PageRank.
  - Kết quả là ma trận PageRank cho từng node trong ma trận kề.
  - Test voi vi du cua cô trên lớp.
  NOTe:
  -
'''
class PageRankAlgorithm:
    def __init__(self, undirected_graph, damping_factor=0.85, max_loop=100, delta=1e-10):
      self.undirected_graph = undirected_graph
      self.damping_factor = damping_factor
      self.max_loop = max_loop
      self.delta = delta

    def compute(self):
      import numpy 
      undirected_graph = numpy.array(self.undirected_graph, dtype=float)

      total_node = undirected_graph.shape[0]
      degree_of_node = undirected_graph.sum(axis=1)
      
      base_matrix = numpy.ones(total_node)
      # [[1,1,1,1]]
      degree_of_node[degree_of_node == 0] = 1 # To avoid division by zero :)
      
      for _ in range(self.max_loop):
        new_page_rank_matrix = self.damping_factor * numpy.dot(undirected_graph, base_matrix/degree_of_node) + (1 - self.damping_factor) / total_node
        if sum(abs(new_page_rank_matrix - base_matrix)) < self.delta:
          break
        base_matrix = new_page_rank_matrix

      return base_matrix

# Example usage:
# data = [
#   [0, 1, 1, 1, 1, 0],
#   [1, 0, 1, 1, 1, 0],
#   [1, 1, 0, 1, 0, 0],
#   [1, 1, 1, 0, 0, 1],
#   [1, 1, 0, 0, 0, 1],
#   [0, 0, 0, 1, 1, 0]
# ]

# pg = PageRankAlgorithm(data)
# home = pg.compute()
# home

In [172]:
'''
    Sau khi tính xong rồi thì chuyển qua bước tóm nội dung:
'''

class Summarizer:
    def __init__(self, doc_df, page_rank_scores, n_top=0.15):
        self.doc_df = doc_df
        self.page_rank_scores = page_rank_scores
        self.n_top = n_top
    

    def map_pr_score(self):
        # Map PageRank scores to DataFrame
        df_clone = self.doc_df.copy()
        df_clone['pr_score'] = self.page_rank_scores
        return df_clone
    def summarize(self):
        '''
            b2: Map index to page rank score
            b3: Select top n_top sentences with highest page rank
            b4: Return a DataFrame of the selected sentences, ordered by their original order in the document
        '''
        df_pr = self.map_pr_score()
        # Sort by pr_score descending, select top n_top
        n_top = int(len(df_pr) * self.n_top)
        top_df = df_pr.sort_values('pr_score', ascending=False).head(n_top)
        # Order by docid and num to preserve original order
        top_df = top_df.sort_values(['docid', 'num'])
        return top_df.reset_index(drop=True)


In [173]:
class PageRankEvaluate:
  def __init__(self, summary_df, ref_df):
    self.summary_df = summary_df
    self.ref_df = ref_df

  def evaluate(self):
    # Convert to sets of (docid, num)
    pred_set = set(zip(self.summary_df['docid'], self.summary_df['num']))
    ref_set = set(zip(self.ref_df['docid'], self.ref_df['num']))

    correct = len(pred_set & ref_set)
    total_pred = len(pred_set)
    total_ref = len(ref_set)

    precision = correct / total_pred if total_pred > 0 else 0
    recall = correct / total_ref if total_ref > 0 else 0
    
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Main process

In [174]:
# Main process:
# Import lib
import numpy as np
# Read
# Process text
DOC_FILE_NAME = 'd105g'  
reader = DocReader(f'/workspaces/py_env_research/NLP/DUC_TEXT/train/{DOC_FILE_NAME}')
reader_sum = DocReader(f'/workspaces/py_env_research/NLP/DUC_SUM/{DOC_FILE_NAME}')
df = reader.to_df()
df_sum = reader_sum.to_df()

df['cleaned_sentence'] = TextPreProcessing(df['sentence']).clean_text()

# Caculate similarity
sim = SimilarityProcess(df['cleaned_sentence'])

In [175]:
similarity_matrix = np.zeros((len(df), len(df)))
SIMILARITY_THRESHOLD = 0.1
for i in range(len(df)):
    for j in range(i + 1, len(df)):
        sim_value = sim.sim(i, j)
        if sim_value > SIMILARITY_THRESHOLD:
            similarity_matrix[i, j] = 1
# Compute the similarity matrix in one call
similarity_matrix


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [176]:
pr = PageRankAlgorithm(similarity_matrix)
pr_score = pr.compute()
pr_score

array([0.00405458, 0.02745402, 0.00728025, 0.01227122, 0.01915563,
       0.00300221, 0.00672749, 0.01139009, 0.01003673, 0.00328956,
       0.00668319, 0.03400125, 0.02730662, 0.00776519, 0.00546825,
       0.00313601, 0.00834947, 0.04479379, 0.0207701 , 0.00316023,
       0.00612923, 0.00514792, 0.01843306, 0.01738212, 0.00625341,
       0.00769964, 0.00694046, 0.01516974, 0.00172421, 0.00499734,
       0.02302352, 0.01573131, 0.01420591, 0.01100888, 0.00860848,
       0.01356287, 0.00799957, 0.00689638, 0.01151268, 0.00113104,
       0.00229965, 0.00848637, 0.00719214, 0.02325949, 0.00567786,
       0.01299275, 0.01274724, 0.00473249, 0.00842097, 0.00295163,
       0.00880597, 0.00533768, 0.01481781, 0.02378161, 0.00710464,
       0.00120869, 0.00592093, 0.0101029 , 0.01195013, 0.00800255,
       0.00904249, 0.01556157, 0.00868952, 0.00408473, 0.01465256,
       0.00855417, 0.00517858, 0.0105441 , 0.00322055, 0.0049207 ,
       0.00379283, 0.00441098, 0.00818439, 0.00320745, 0.01918

In [186]:
# Sumarize
summarizer = Summarizer(df, pr_score, n_top=0.28)
predict = summarizer.summarize()
predict

Unnamed: 0,docid,num,wdcount,sentence,cleaned_sentence,pr_score
0,AP880729-0155,10,35,The Soviet leader told a meeting of the Commun...,The Soviet leader told a meeting of the Commun...,0.027454
1,AP880729-0155,12,31,State-run media also indicated Friday that a c...,Staterun media also indicated Friday that a cr...,0.012271
2,AP880729-0155,13,24,The six-month conflict between the two republi...,The sixmonth conflict between the two republic...,0.019156
3,AP880729-0155,16,24,The conflict over Nagorno-Karabakh is one of t...,The conflict over NagornoKarabakh is one of th...,0.011390
4,AP880729-0155,17,24,He indicated Friday he wants to defuse tension...,He indicated Friday he wants to defuse tension...,0.010037
...,...,...,...,...,...,...
69,FT934-2439,34,41,Although the Conference for Security and Co-op...,Although the Conference for Security and Coope...,0.008805
70,FT934-2439,38,27,One of the ways in which we can help prevent c...,One of the ways in which we can help prevent c...,0.011328
71,LA070289-0167,10,44,Appealing for calm and for the cool considerat...,Appealing for calm and for the cool considerat...,0.010998
72,LA070289-0167,15,69,With serious ethnic clashes in Soviet Central ...,With serious ethnic clashes in Soviet Central ...,0.013947


In [187]:

eval = PageRankEvaluate(
  summary_df=df_sum,
  ref_df=predict 
)
precision, recall, f1 = eval.evaluate()
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

Precision: 0.6364, Recall: 0.0946, F1 Score: 0.1647


# Evaluate
1. Base of result use some metrix to evalue algorithm to optimize
2. Lam sao de doan tom tat tro len muot ma, tu nhien, luu loat

# Recall: The ratio of correctly predicted positive observations to the all observations in actual class.
1. P: So ket qua dung / so mo hinh trich xuat ra
2. R: So ket qua dung / cai chung ta ky vong.
3. F1: 2*(P*R)/(P+R)

In [179]:
'''
Bước này vẽ biểu đồ:
1. Mục tiêu của bước thể hiện được tổng quan accurracy & số lương document đạt accuracy ây
2. Đọc biểu đồ theo kiểu: Cố 20 câu đạt accurray là 0.2
3. Trung bình trên toàn bộ dữ liệu là 0.18
'''
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(acc_df['f1'], bins=20, color='skyblue', edgecolor='black')
plt.xlabel('Accuracy')
plt.ylabel('Number of Documents')
plt.title('Distribution of Accuracy Scores per Document')

median_f1 = acc_df['f1'].median()

plt.axvline(median_f1, color='red', linestyle='dashed', linewidth=2, label=f'Median: {median_f1:.2f}')

plt.legend()
plt.grid(True)
plt.show()


NameError: name 'acc_df' is not defined

<Figure size 1000x600 with 0 Axes>