## 1. Importing the required libraries



In [None]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup



---



## 2. Loading the data into the data frame.

In [None]:
# Read the CSV of article links into the DataFrame that every later cell works on
df = pd.read_csv("/content/new_data_law.csv", encoding = 'utf-8')
# To display the top 5 rows
df.head(5)

Unnamed: 0,Links,Aspect
0,https://baotainguyenmoitruong.vn/gop-y-2-du-th...,Luật sửa đổi
1,https://baotainguyenmoitruong.vn/som-hoan-thie...,Luật sửa đổi
2,https://baotainguyenmoitruong.vn/tap-trung-hoa...,Luật sửa đổi
3,https://baotainguyenmoitruong.vn/xay-dung-luat...,Luật sửa đổi
4,https://baotainguyenmoitruong.vn/gop-y-2-du-th...,Luật sửa đổi




---



### Checking the types of data

In [None]:
# Show the dtype pandas inferred for each column
df.dtypes

Links     object
Aspect    object
dtype: object

### Dropping the duplicate rows

In [None]:
# (number of rows, number of columns) before any cleaning
df.shape

(47, 2)

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence is not flagged)
duplicate_rows_df = df[df.duplicated()]
# Prints the shape tuple (rows, columns); the first element is the number of duplicate rows
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (3, 2)


In [None]:
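# NOTE(review): de-duplication is disabled here, so the duplicate rows counted above remain in df — confirm this is intentional.
# df.drop_duplicates(inplace=True)
# df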


In [None]:
df.count()      # Non-null values per column (equals the row count only when nothing is missing)

Links     47
Aspect    47
dtype: int64



---



### Dropping the missing or null values.

In [None]:
# Number of missing values in each column
print(df.isnull().sum())
df = df.dropna()    # Dropping the missing values.
# Non-null values per column after the drop
df.count()

Links     0
Aspect    0
dtype: int64


Links     47
Aspect    47
dtype: int64



---



### Pre-process Linh_vuc, Chuyen_muc, Muc_do and Links


In [None]:
# Normalise the spelling variants of the 'KTTV-BĐKH' label in column 'Linh_vuc'
# to its full name. All three variants end up as the same full label, exactly
# as the original chain of three replacements did.
#
# The column is optional: indexing it unconditionally raised
# KeyError: 'Linh_vuc' on a frame that only has 'Links' and 'Aspect'.
if 'Linh_vuc' in df.columns:
    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column, which is not guaranteed to write through to df.
    df['Linh_vuc'] = df['Linh_vuc'].replace({
        'KTTV -BĐKH': 'Khí tượng thủy văn - Biến đổi khí hậu',
        'KTTV - BĐKH': 'Khí tượng thủy văn - Biến đổi khí hậu',
        'KTTV-BĐKH': 'Khí tượng thủy văn - Biến đổi khí hậu',
    })
else:
    print("Column 'Linh_vuc' not found; skipping label normalisation.")

KeyError: 'Linh_vuc'

In [None]:
# import re
# df['Chuyen_muc'] = df['Chuyen_muc'].apply(lambda x: re.sub(r'\s+', ' ', x.strip()) if 'thông tin khác' in x else x)

In [None]:
# Ánh xạ các giá trị của cột 'Muc_do' sử dụng từ điển
# mapping = {1: "Rất tiêu cực", 2: "Tiêu cực", 3: "Tích cực", 4: "Trung tính"}
# df['Muc_do_ten'] = df['Muc_do'].map(mapping)


In [None]:
def remove_links_from_column(df, column_name, links_to_remove):
    """
    Drop every row whose value in ``column_name`` contains one of the given links.

    Each entry of ``links_to_remove`` is matched as a plain substring, not as a
    regular expression, so the "." in a domain such as "cafef.vn" only matches
    a literal dot. Rows whose value is missing never match and are kept.

    Args:
    - df (DataFrame): The DataFrame to filter. It is not modified.
    - column_name (str): The name of the string column to check for links.
    - links_to_remove (list): Substrings (typically domains) marking rows to drop.

    Returns:
    - DataFrame: A new DataFrame without the matching rows; the index labels of
      the remaining rows are unchanged.
    """
    column = df[column_name]

    # One positional boolean flag per row: True means "drop this row".
    unwanted = np.zeros(len(df), dtype=bool)
    for link in links_to_remove:
        # regex=False -> literal match; na=False -> missing values count as "no match"
        hits = column.str.contains(link, regex=False, na=False)
        unwanted |= hits.to_numpy(dtype=bool)

    return df[~unwanted]

# List of links to be removed.
# Every entry must end with a comma: adjacent string literals without one are
# silently concatenated by Python into a single entry that matches nothing.
links_to_remove = [
    # sites with too few articles
    "antt.vn",
    "dangcongsan.vn",
    "dttc.sggp.org.vn",
    "kinhtemoitruong.vn",
    "lifestyle.zingnews.vn",
    "phunuvietnam.vn",
    "quochoitv.vn",
    "thanglong.chinhphu.vn",
    "www.phapluatplus.vn",
    "quanly.moitruongvadothi.vn",
    "amp.vtc.vn",
    "baohaiduong.vn",
    "baonghean.vn",
    "baotayninh.vn",
    "baovephapluat.vn",
    "bnews.vn",
    "cafef.vn",
    "daklak24h.com.vn",
    "etv.quochoi.vn",
    "giaoduc.net.vn",
    "hcmcpv.org.vn",
    "meeyland.com",
    "nangluongvietnam.vn",
    "news.baodansinh.vn",
    "petrotimes.vn",
    "taichinhdoanhnghiep.net.vn",
    "theleader.vn",
    "thethaovanhoa.vn",
    "thoibaotaichinhvietnam.vn",
    "tintucvietnam.vn",
    "vovgiaothong.vn",
    "www.doisongphapluat.com",
    "www.youtube.com",

    # 3 sites that currently fail, plus the National Assembly site that blocks crawling
    "nongnghiep.vn",
    "baotintuc.vn",
    "vietnamnet.vn",
    "quochoi.vn",
    # NOTE(review): presumably filters out local file paths rather than URLs — confirm
    "Downloads",
]

# Name of the column to check for links
column_name = 'Links'  # the column of df whose values are matched against links_to_remove

# Filter df; the result is bound to df_cleaned and the name df is not reassigned here
df_cleaned = remove_links_from_column(df, column_name, links_to_remove)


In [None]:
# Renumber the remaining rows from 0 (drop=True discards the old index instead of keeping it as a column)
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned

Unnamed: 0,Links,Aspect
0,https://baotainguyenmoitruong.vn/gop-y-2-du-th...,Luật sửa đổi
1,https://baotainguyenmoitruong.vn/som-hoan-thie...,Luật sửa đổi
2,https://baotainguyenmoitruong.vn/tap-trung-hoa...,Luật sửa đổi
3,https://baotainguyenmoitruong.vn/xay-dung-luat...,Luật sửa đổi
4,https://baotainguyenmoitruong.vn/gop-y-2-du-th...,Luật sửa đổi
5,https://baotainguyenmoitruong.vn/uy-ban-thuong...,Luật sửa đổi
6,https://baotainguyenmoitruong.vn/dua-cac-tieu-...,Luật sửa đổi
7,https://baotainguyenmoitruong.vn/khao-sat-tham...,Luật sửa đổi
8,https://baotainguyenmoitruong.vn/tang-cuong-co...,Luật sửa đổi
9,https://baotainguyenmoitruong.vn/ho-so-luat-di...,Luật sửa đổi


# Fetch Data from URL Link

## process text

In [None]:
import re

def preprocess_text(text):
    """Collapse each run of whitespace into one space and trim both ends of the string."""
    return re.sub(r'\s+', ' ', text).strip()



## Get Title

In [None]:
def get_title(url, timeout=30):
    """Fetch a page and return the text of its headline element.

    The lookups are tried in this order and the first element found wins:
      1. an <h1> whose class contains "title" (case-insensitive),
      2. any element with the attribute data-role="title",
      3. a <div> whose class contains "title" (case-insensitive).

    Args:
        url (str): Address of the article page.
        timeout (float): Seconds to wait for the server. Without a timeout,
            requests waits indefinitely on a stalled connection.

    Returns:
        str | None: The stripped text of the first matching element, or None
        when the response status is not 200 or no element matches.

    Raises:
        requests.RequestException: Propagated from requests.get (connection
            errors, timeouts, malformed URLs).
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # The page bytes are decoded as UTF-8, regardless of what the page declares.
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    title_class = re.compile('title', re.IGNORECASE)

    # Lazy lookups so a later selector only runs when the earlier ones found nothing.
    lookups = (
        lambda: soup.find('h1', class_=title_class),
        lambda: soup.find(attrs={"data-role": "title"}),
        lambda: soup.find('div', class_=title_class),
    )
    for lookup in lookups:
        element = lookup()
        if element:
            return element.get_text().strip()
    return None

## Get Description


In [None]:
def get_description_from_meta(url, timeout=30):
    """Return the page description taken from its <meta> tags.

    <meta name="description"> is tried first; when it is absent or its content
    is empty, <meta property="og:description"> is tried next.

    Args:
        url (str): Address of the article page.
        timeout (float): Seconds to wait for the server. Without a timeout,
            requests waits indefinitely on a stalled connection.

    Returns:
        str | None: The first non-empty content value, or None when the
        response status is not 200 or neither tag has content.

    Raises:
        requests.RequestException: Propagated from requests.get.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Candidate tags in priority order.
    for meta_attrs in ({'name': 'description'}, {'property': 'og:description'}):
        tag = soup.find('meta', attrs=meta_attrs)
        if tag:
            content = tag.get('content', '')
            if content:
                return content

    return None

In [None]:
def get_description(url, timeout=30):
    """Return the page description taken from its <meta> tags, or None.

    <meta name="description"> is tried first; when it is absent or has no
    content, <meta property="og:description"> is tried next.

    Failures are reported as None rather than as message strings such as
    "Description meta tag not found." or "An error occurred: ...". A caller
    that stores the return value would otherwise save those messages as if
    they were real descriptions.

    Args:
        url (str): Address of the article page.
        timeout (float): Seconds to wait for the server. Without a timeout,
            requests waits indefinitely on a stalled connection.

    Returns:
        str | None: The first non-empty content value, or None when the page
        cannot be fetched (network error, 4xx/5xx status) or has no usable
        description tag.
    """
    try:
        # Fetch the webpage
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx/5xx status codes
    except requests.RequestException as error:
        # Best effort: report the reason here instead of returning it as data.
        print(f"Could not fetch description from {url}: {error}")
        return None

    # Parse HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Candidate tags in priority order.
    for meta_attrs in ({'name': 'description'}, {'property': 'og:description'}):
        tag = soup.find('meta', attrs=meta_attrs)
        content = tag.get('content') if tag else None
        if content:
            return content

    return None


## Get paragraphs from different URLs

In [None]:
# Ordered extraction rules: (substring looked for in the URL, tag name, attribute filter).
# The first rule whose substring occurs in the URL is used, so the order matters
# (for example 'daibieunhandan.vn' must come before 'nhandan.vn').
# In the attribute filter, 'class_' is BeautifulSoup's keyword for the HTML class attribute.
_PARAGRAPH_RULES = (
    ('laodong.vn', 'div', {'itemprop': 'articleBody'}),
    ('www.vietnamplus.vn', 'div', {'itemprop': 'articleBody'}),
    ('tienphong.vn', 'div', {'itemprop': 'articleBody'}),
    ('tuoitre.vn', 'div', {'itemprop': 'articleBody'}),
    ('baoxaydung.com.vn', 'div', {'class_': 'item-content'}),
    ('plo.vn', 'div', {'itemprop': 'articleBody'}),
    ('dantri.com.vn', 'div', {'class_': 'singular-content'}),
    ('thanhnien.vn', 'div', {'itemprop': 'articleBody'}),
    # Author's note: access to this site is currently denied.
    ('baotintuc.vn', 'div', {'itemprop': 'articleBody'}),
    ('daibieunhandan.vn', 'div', {'class_': 'detail-content-body'}),
    ('vov.vn', 'article', {'itemprop': 'articleBody'}),
    ('vtv.vn', 'div', {'id': 'entry-body'}),
    ('www.sggp.org.vn', 'div', {'itemprop': 'articleBody'}),
    ('kinhtedothi.vn', 'div', {'itemprop': 'articleBody'}),
    ('daidoanket.vn', 'div', {'class_': 'b-maincontent'}),
    ('baotainguyenmoitruong.vn', 'div', {'class_': 'entry'}),
    ('nld.com.vn', 'div', {'itemprop': 'articleBody'}),
    ('www.moitruongvadothi.vn', 'article', {'id': 'main-detail'}),
    ('baochinhphu.vn', 'div', {'class_': 'main'}),
    ('hanoimoi.vn', 'div', {'class_': 'entry'}),
    ('baodautu.vn', 'div', {'id': 'content_detail_news'}),
    # Author's note: the text from this site currently comes out with broken Vietnamese characters.
    ('vietnamnet.vn', 'div', {'id': 'maincontent'}),
    ('congthuong.vn', 'div', {'itemprop': 'articleBody'}),
    ('danviet.vn', 'div', {'class_': 'entry-body'}),
    ('thanhtra.com.vn', 'div', {'itemprop': 'articleBody'}),
    ('vnexpress.net', 'article', {'class_': 'fck_detail'}),
    ('nhandan.vn', 'div', {'itemprop': 'articleBody'}),
    ('www.baogiaothong.vn', 'div', {'itemprop': 'articleBody'}),
    ('baophapluat.vn', 'div', {'itemprop': 'articleBody'}),
    ('congly.vn', 'div', {'class_': 'entry'}),
    ('vneconomy.vn', 'div', {'class_': 'detail__content'}),
    ('www.nguoiduatin.vn', 'article', {'class_': 'article-content'}),
    ('www.congluan.vn', 'div', {'class_': 'content-detail'}),
    ('diendandoanhnghiep.vn', 'article', {'id': 'detail-content'}),
    ('vtc.vn', 'div', {'itemprop': 'articleBody'}),
    ('1thegioi.vn', 'div', {'class_': 'c-news-detail'}),
    # Author's note: currently failing, no response is received, e.g.
    # https://nongnghiep.vn/vu-can-bo-lua-dan-ban-dat-ai-lam-sai-thi-bo-tien-ra-ma-den-d369867.html
    ('nongnghiep.vn', 'div', {'class_': 'content'}),
    ('www.qdnd.vn', 'div', {'itemprop': 'articleBody'}),
    ('kinhtenongthon.vn', 'div', {'class_': 'detainew'}),
    ('reatimes.vn', 'div', {'itemprop': 'articleBody'}),
    ('cand.com.vn', 'div', {'class_': 'detail-content-body'}),
    ('giaoducthoidai.vn', 'div', {'itemprop': 'articleBody'}),
    ('suckhoedoisong.vn', 'div', {'itemprop': 'articleBody'}),
    ('baoquangnam.vn', 'div', {'class_': 'entry'}),
    # Author's note: this site cannot be crawled (it seems to block automated tools).
    # The key is deliberately misspelled, so this rule never matches a real URL.
    ('quochoi__.vn', 'div', {'itemprop': 'articleBody'}),
    ('thesaigontimes.vn', 'div', {'class_': 'td-post-content'}),
    ('vnmedia.vn', 'div', {'class_': 'td-post-content'}),
    ('www.phunuonline.com.vn', 'div', {'id': 'newscontents'}),
    ('www.quochoitv.vn', 'div', {'class_': 'uk-article'}),
)


def _fetch_paragraph_text(url, tag_name, attr_filter, timeout):
    """Download url and join the text of every <p> inside the first matching container."""
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # Use UTF-8 when parsing the HTML content.
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    container = soup.find(tag_name, **attr_filter)
    if not container:
        return None

    paragraphs = container.find_all('p')
    if not paragraphs:
        return None
    return '\n'.join(p.get_text() for p in paragraphs)


def get_paragraphs(url, timeout=30):
    """Return the article body of a news page as newline-joined paragraph text.

    The container holding the article differs per site, so the URL is checked
    against _PARAGRAPH_RULES in order and the first rule whose substring occurs
    in the URL selects the container. URLs that match no rule fall back to
    <div itemprop="articleBody">.

    Args:
        url (str): Address of the article page.
        timeout (float): Seconds to wait for the server. Without a timeout,
            requests waits indefinitely on a stalled connection.

    Returns:
        str | None: The text of all <p> elements inside the container, joined
        with newlines; None when the response status is not 200, the container
        is missing, or it contains no <p> elements.

    Raises:
        requests.RequestException: Propagated from requests.get.
    """
    for url_marker, tag_name, attr_filter in _PARAGRAPH_RULES:
        if url_marker in url:
            return _fetch_paragraph_text(url, tag_name, attr_filter, timeout)

    # Fallback for sites without a dedicated rule.
    return _fetch_paragraph_text(url, 'div', {'itemprop': 'articleBody'}, timeout)


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (needs the google.colab package)
drive.mount('/content/drive')

Mounted at /content/drive


# Processing Article

In [None]:
import csv

# Set a limit for the number of records to process at a time
# NOTE(review): batch_limit and records_processed are never read by the loop below.
batch_limit = 100

# Count the number of records processed
records_processed = 0

# (output column, scraper function, label used in the log message), run in this order for every link.
FIELD_SCRAPERS = (
    ('Tieu_de', get_title, 'title'),
    ('Description', get_description, 'description'),
    ('Noi_dung_tin_bai', get_paragraphs, 'paragraph'),
)

# Iterate through each link in the 'Links' column and fetch the article text.
# items() yields the real index label of each row, so writing with .at does not
# depend on the index being exactly 0..n-1.
for position, (row_label, link_tin_bai) in enumerate(df_cleaned['Links'].items(), start=1):
    print(f"Processing article {position}/{len(df_cleaned)}")

    for column, scraper, label in FIELD_SCRAPERS:
        text = ""
        try:
            raw_text = scraper(link_tin_bai)
            # A scraper returns None when it found nothing; only clean real text.
            if raw_text:
                text = preprocess_text(raw_text)
        except Exception as error:
            # Exception (not a bare except) so that Ctrl-C / kernel interrupt still stops the loop.
            print(f"Error while fetching {label}: {error!r}")

        if text:
            df_cleaned.at[row_label, column] = text
        else:
            # The cell is left unset (NaN) so the row can be dropped later.
            print(f"No {label} in link: " + link_tin_bai)


Processing article 1/47
Processing article 2/47
Processing article 3/47
Processing article 4/47
Processing article 5/47
Processing article 6/47
Processing article 7/47
Processing article 8/47
Processing article 9/47
Processing article 10/47
Processing article 11/47
Processing article 12/47
Processing article 13/47
Processing article 14/47
Processing article 15/47
Processing article 16/47
Processing article 17/47
Processing article 18/47
Processing article 19/47
Processing article 20/47
Processing article 21/47
Processing article 22/47
Processing article 23/47
Processing article 24/47
Processing article 25/47
Processing article 26/47
Processing article 27/47
Processing article 28/47
Processing article 29/47
Processing article 30/47
No paragraph in link: https://baotainguyenmoitruong.vn/bo-truong-bo-tn-mt-dang-quoc-khanh-bao-cao-quoc-hoi-ve-du-an-luat-dat-dai-sua-doi-359508.html
Processing article 31/47
Processing article 32/47
Processing article 33/47
Processing article 34/47
Processing

In [None]:
# Save the DataFrame to a new CSV file and to an Excel workbook with the same base name
folder_path = '/content/drive/MyDrive/UET_LAB_DSKT/BaoChi/'
name_csv =  f"{folder_path}TNMT_law.csv"
name_excel =  f"{folder_path}TNMT_law.xlsx"

# index=False keeps the row index out of both files
df_cleaned.to_csv(name_csv, index=False, encoding='utf-8')
df_cleaned.to_excel(name_excel, index = False)

In [None]:
# Replace empty-string values "" with NaN
df_cleaned.replace("", np.nan, inplace=True)

# Drop every row that has at least one NaN cell
df_cleaned = df_cleaned.dropna(how='any')

print(df_cleaned.count())  # Recount the non-null values per column after the drop

Links               45
Aspect              45
Tieu_de             45
Description         45
Noi_dung_tin_bai    45
dtype: int64


In [None]:
# Save the filtered DataFrame to a new CSV file and to an Excel workbook ("_clean" file names)
folder_path = '/content/drive/MyDrive/UET_LAB_DSKT/BaoChi/'
name_csv =  f"{folder_path}TNMT_law_clean.csv"
name_excel =  f"{folder_path}TNMT_law_clean.xlsx"

# index=False keeps the row index out of both files
df_cleaned.to_csv(name_csv, index=False, encoding='utf-8')
df_cleaned.to_excel(name_excel, index = False)