## 1. Importing the required libraries

In [42]:
# Mount Google Drive at /content/drive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# Prefix for every exported file. The export cell concatenates this string
# directly with the file name (no separator is inserted), so end it with '/'
# if the files should land inside a folder.
path_output_folder = '/content/....'

In [44]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup



---



## 2. Loading the data into the data frame.

In [45]:
!git clone https://github.com/huongntt309/BaoChi.git

fatal: destination path 'BaoChi' already exists and is not an empty directory.


In [46]:
# Read the article table from the CSV file inside the cloned BaoChi folder.
df = pd.read_csv("/content/BaoChi/TNMT_link.csv")
# To display the top 5 rows
df.head(5)

Unnamed: 0,Thoigian,Linh_vuc,Chuyen_muc,Dia_diem,Muc_do,Nguon_tin,Tin_bai,Link_tin_bai,Tom_tat_noi_dung
0,1/12/2023,KTTV -BĐKH,"chính sách, quản lý",toàn quốc,4,"Chính phủ, Tiền phong, Tin tức, Nhân Dân, Lao ...","Thủ tướng Phạm Minh Chính tới Dubai, bắt đầu t...",https://baochinhphu.vn/thu-tuong-pham-minh-chi...,Sau khi kết thúc tốt đẹp chuyến thăm chính thứ...
1,1/12/2023,KTTV -BĐKH,"chính sách, quản lý",toàn quốc,4,VOV,Việt Nam kỳ vọng COP28 sẽ đạt những bước tiến ...,https://vov.vn/xa-hoi/viet-nam-ky-vong-cop28-s...,Thứ trưởng Lê Công Thành cho biết.
2,1/12/2023,KTTV -BĐKH,"chính sách, quản lý",toàn quốc,4,Một Thế giới,Việt Nam tham gia hàng loạt sáng kiến ứng phó ...,https://1thegioi.vn/viet-nam-tham-gia-hang-loa...,Thứ trưởng Bộ Tài nguyên và Môi trường Lê Công...
3,1/12/2023,KTTV -BĐKH,"chính sách, quản lý",toàn quốc,4,VietNamPlus,Tham dự COP28: Việt Nam quyết tâm thúc đẩy giả...,https://www.vietnamplus.vn/tham-du-cop28-viet-...,"Trao đổi với báo chí, Phó Cục trưởng Cục Biến ..."
4,1/12/2023,KTTV -BĐKH,"chính sách, quản lý",toàn quốc,4,VOV,Hội nghị COP28: Khẳng định cam kết ứng phó với...,https://vov.vn/xa-hoi/hoi-nghi-cop28-khang-din...,"theo Bộ Tài nguyên và Môi trường, COP 28 là Hộ..."




---



## Checking the types of data

In [47]:
df.dtypes

Thoigian            object
Linh_vuc            object
Chuyen_muc          object
Dia_diem            object
Muc_do               int64
Nguon_tin           object
Tin_bai             object
Link_tin_bai        object
Tom_tat_noi_dung    object
dtype: object

## Dropping the duplicate rows

In [48]:
df.shape

(1086, 9)

In [49]:
# Select the rows that repeat an earlier row in every column.
duplicate_rows_df = df[df.duplicated()]
# NOTE: .shape is (rows, columns), so the first number printed is the duplicate count.
print("number of duplicate rows: ", duplicate_rows_df.shape)

number of duplicate rows:  (0, 9)


In [50]:
# df.drop_duplicates(inplace=True)
# df

In [51]:
df.count()      # Non-null values per column; a lower number than the others means missing entries

Thoigian            1086
Linh_vuc            1086
Chuyen_muc          1086
Dia_diem            1086
Muc_do              1086
Nguon_tin           1086
Tin_bai             1086
Link_tin_bai        1081
Tom_tat_noi_dung    1086
dtype: int64



---



## Dropping the missing or null values.

In [52]:
# Show how many values are missing in each column before removing anything.
print(df.isnull().sum())
df = df.dropna()    # Dropping the missing values (any row with at least one null is removed).
# Per-column non-null counts after the drop.
df.count()

Thoigian            0
Linh_vuc            0
Chuyen_muc          0
Dia_diem            0
Muc_do              0
Nguon_tin           0
Tin_bai             0
Link_tin_bai        5
Tom_tat_noi_dung    0
dtype: int64


Thoigian            1081
Linh_vuc            1081
Chuyen_muc          1081
Dia_diem            1081
Muc_do              1081
Nguon_tin           1081
Tin_bai             1081
Link_tin_bai        1081
Tom_tat_noi_dung    1081
dtype: int64



---



## Pre-process Linh_vuc, Chuyen_muc, Muc_do and Link_tin_bai


In [53]:
# Merge the differently spaced spellings of the same 'Linh_vuc' label.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['Linh_vuc']: that chained in-place form
# acts on a temporary selection, which triggers warnings on recent pandas and
# leaves df unchanged when Copy-on-Write is enabled.
df['Linh_vuc'] = df['Linh_vuc'].replace({
    'KTTV -BĐKH': 'KTTV-BĐKH',
    'KTTV - BĐKH': 'KTTV-BĐKH',
})

In [54]:
import re
def _tidy_other_info_label(label):
    """Trim and collapse whitespace, but only for labels containing 'thông tin khác'."""
    if 'thông tin khác' in label:
        return re.sub(r'\s+', ' ', label.strip())
    return label


df['Chuyen_muc'] = df['Chuyen_muc'].apply(_tidy_other_info_label)

In [55]:
# Map the numeric codes in the 'Muc_do' column to label names using a dictionary.
# Codes that are not keys of the dictionary become NaN in 'Muc_do_ten'.
mapping = {1: "Rất tiêu cực", 2: "Tiêu cực", 3: "Tích cực", 4: "Trung tính"}
df['Muc_do_ten'] = df['Muc_do'].map(mapping)


In [56]:
def remove_links_from_column(df, column_name, links_to_remove):
    """
    Drop the rows whose value in one column contains any of the given links.

    Each entry of ``links_to_remove`` is matched as a literal substring, so the
    dots in a domain name only match dots (the pandas default would treat the
    entry as a regular expression, where '.' matches any character).

    Args:
    - df (DataFrame): The DataFrame to filter. It is not modified.
    - column_name (str): The name of the string column to check for links.
    - links_to_remove (list): Substrings (e.g. domain names) that mark a row
      for removal. Empty strings are ignored because they would match every row.

    Returns:
    - DataFrame: The rows of ``df`` that contain none of the links. Rows whose
      cell is missing (NaN/None) are kept.
    """
    for link in links_to_remove:
        if not link:
            continue
        # regex=False -> plain substring test; na=False -> missing cells count
        # as "no match" so the mask stays purely boolean and can be inverted.
        mask = df[column_name].str.contains(link, regex=False, na=False)
        df = df[~mask]

    return df

# List of links to be removed
links_to_remove = [
    # sources with too few articles
    "antt.vn",
    "dangcongsan.vn",
    "dttc.sggp.org.vn",
    "kinhtemoitruong.vn",
    "lifestyle.zingnews.vn",
    "phunuvietnam.vn",
    "quochoitv.vn",
    "thanglong.chinhphu.vn",
    "www.phapluatplus.vn",
    "quanly.moitruongvadothi.vn",
    "amp.vtc.vn",
    "baohaiduong.vn",
    "baonghean.vn",
    "baotayninh.vn",
    "baovephapluat.vn",
    "bnews.vn",
    "cafef.vn",
    "daklak24h.com.vn",
    "etv.quochoi.vn",
    "giaoduc.net.vn",
    "hcmcpv.org.vn",
    "meeyland.com",
    "nangluongvietnam.vn",
    "news.baodansinh.vn",
    "petrotimes.vn",
    "taichinhdoanhnghiep.net.vn",
    "theleader.vn",
    "thethaovanhoa.vn",
    "thoibaotaichinhvietnam.vn",
    "tintucvietnam.vn",
    "vovgiaothong.vn",
    "www.doisongphapluat.com",
    "www.youtube.com",

    # three sources that currently fail, plus the National Assembly site,
    # which does not allow crawling
    "nongnghiep.vn",
    "baotintuc.vn",
    # The commas after the next entries matter: without them Python joins the
    # adjacent string literals into one long string that matches no link.
    "vietnamnet.vn",
    "quochoi.vn",
    "Downloads",
]

# Name of the column to check for links
column_name = 'Link_tin_bai'

# Filter the rows; the result is a new DataFrame and df itself is left as it was
df_cleaned = remove_links_from_column(df, column_name, links_to_remove)


In [None]:
# Renumber the remaining rows 0..n-1 after filtering; drop=True discards the
# old labels instead of keeping them as an extra column.
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned

# Fetch Data from URL Link

## Process text

In [58]:
def preprocess_text(text):
    """Reduce ``text`` to its word tokens separated by single spaces.

    Leading/trailing whitespace is trimmed, every whitespace run is first
    turned into a '.', and then every character that is neither a word
    character nor whitespace (including those dots) becomes a space. The
    remaining pieces are re-joined with exactly one space between them.
    """
    dotted = re.sub(r'\s+', '.', text.strip())
    blanked = re.sub(r'[^\w\s]', ' ', dotted)
    return ' '.join(blanked.split())

# Function to fetch text content from a URL
def get_article_text(url, timeout=30):
    """Download ``url`` and return all text found in the page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The page text as produced by ``BeautifulSoup.get_text()``, or ``None``
        when the status code is not 200 or any error occurs (a message is
        printed in both cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch data from {url}")
            return None
        soup = BeautifulSoup(response.text, 'html.parser')
        # Whole-page text; site-specific extraction would need a narrower selector
        return soup.get_text()
    except Exception as e:
        # Best effort: report the problem and let the caller carry on
        print(f"Error fetching data from {url}: {e}")
        return None


def get_description_from_meta(url, timeout=30):
    """Return the page description stored in the HTML ``<meta>`` tags of ``url``.

    The ``<meta name="description">`` tag is tried first, then
    ``<meta property="og:description">``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot block the notebook indefinitely.

    Returns:
        The first non-empty ``content`` value found, or ``None`` when the
        status code is not 200 or neither tag carries a description.

    Raises:
        requests.RequestException: Network errors and timeouts are not caught
            here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    # Lookups in priority order; each dict is passed to soup.find as keyword arguments
    lookups = (
        {'attrs': {'name': 'description'}},
        {'property': 'og:description'},
    )
    for lookup in lookups:
        tag = soup.find('meta', **lookup)
        if tag:
            description = tag.get('content', '')
            if description:
                return description

    return None


## Get paragraphs from different URLs

In [59]:
# Keyword filter shared by every site that marks its body with itemprop="articleBody".
_ARTICLE_BODY = {'itemprop': 'articleBody'}

# Ordered routing table: (URL fragment, tag name, keyword filters for soup.find).
# The URL is tested against the fragments by plain substring search and the
# FIRST hit wins, so the order is significant (for example 'daibieunhandan.vn'
# has to be tested before 'nhandan.vn', which it contains).
_SITE_SELECTORS = [
    ('laodong.vn', 'div', _ARTICLE_BODY),                               # 1
    ('www.vietnamplus.vn', 'div', _ARTICLE_BODY),                       # 2
    ('tienphong.vn', 'div', _ARTICLE_BODY),                             # 3
    ('tuoitre.vn', 'div', _ARTICLE_BODY),                               # 4
    ('baoxaydung.com.vn', 'div', {'class_': 'item-content'}),           # 5
    ('plo.vn', 'div', _ARTICLE_BODY),                                   # 6
    ('dantri.com.vn', 'div', {'class_': 'singular-content'}),           # 7
    ('thanhnien.vn', 'div', _ARTICLE_BODY),                             # 8
    # 9: access to this site is currently denied
    ('baotintuc.vn', 'div', _ARTICLE_BODY),
    ('daibieunhandan.vn', 'div', {'class_': 'detail-content-body'}),    # 10
    ('vov.vn', 'article', _ARTICLE_BODY),                               # 11
    ('vtv.vn', 'div', {'id': 'entry-body'}),                            # 12
    ('www.sggp.org.vn', 'div', _ARTICLE_BODY),                          # 13
    ('kinhtedothi.vn', 'div', _ARTICLE_BODY),                           # 14
    ('daidoanket.vn', 'div', {'class_': 'b-maincontent'}),              # 15
    ('baotainguyenmoitruong.vn', 'div', {'class_': 'entry'}),           # 16
    ('nld.com.vn', 'div', _ARTICLE_BODY),                               # 17
    ('www.moitruongvadothi.vn', 'article', {'id': 'main-detail'}),      # 18
    ('baochinhphu.vn', 'div', {'class_': 'main'}),                      # 19
    ('hanoimoi.vn', 'div', {'class_': 'entry'}),                        # 20
    ('baodautu.vn', 'div', {'id': 'content_detail_news'}),              # 21
    # 22: the text from this site currently comes out with broken characters
    ('vietnamnet.vn', 'div', {'id': 'maincontent'}),
    ('congthuong.vn', 'div', _ARTICLE_BODY),                            # 23
    ('danviet.vn', 'div', {'class_': 'entry-body'}),                    # 24
    ('thanhtra.com.vn', 'div', _ARTICLE_BODY),                          # 25
    ('vnexpress.net', 'article', {'class_': 'fck_detail'}),             # 26
    ('nhandan.vn', 'div', _ARTICLE_BODY),                               # 27
    ('www.baogiaothong.vn', 'div', _ARTICLE_BODY),                      # 28
    ('baophapluat.vn', 'div', _ARTICLE_BODY),                           # 29
    ('congly.vn', 'div', {'class_': 'entry'}),                          # 30
    ('vneconomy.vn', 'div', {'class_': 'detail__content'}),             # 31
    ('www.nguoiduatin.vn', 'article', {'class_': 'article-content'}),   # 32
    ('www.congluan.vn', 'div', {'class_': 'content-detail'}),           # 33
    ('diendandoanhnghiep.vn', 'article', {'id': 'detail-content'}),     # 34
    ('vtc.vn', 'div', _ARTICLE_BODY),                                   # 35
    ('1thegioi.vn', 'div', {'class_': 'c-news-detail'}),                # 36
    # 37: currently no response is received from this site, e.g.
    # https://nongnghiep.vn/vu-can-bo-lua-dan-ban-dat-ai-lam-sai-thi-bo-tien-ra-ma-den-d369867.html
    ('nongnghiep.vn', 'div', {'class_': 'content'}),
    ('www.qdnd.vn', 'div', _ARTICLE_BODY),                              # 38
    ('kinhtenongthon.vn', 'div', {'class_': 'detainew'}),               # 39
    ('reatimes.vn', 'div', _ARTICLE_BODY),                              # 40
    ('cand.com.vn', 'div', {'class_': 'detail-content-body'}),          # 41
    ('giaoducthoidai.vn', 'div', _ARTICLE_BODY),                        # 42
    ('suckhoedoisong.vn', 'div', _ARTICLE_BODY),                        # 43
    ('baoquangnam.vn', 'div', {'class_': 'entry'}),                     # 44
    # 45: this site cannot be crawled (it probably blocks automated tools).
    # NOTE(review): the doubled underscore looks like a deliberate way to keep
    # this rule from matching real URLs - confirm before "fixing" the fragment.
    ('quochoi__.vn', 'div', _ARTICLE_BODY),
    ('thesaigontimes.vn', 'div', {'class_': 'td-post-content'}),        # 46
    ('vnmedia.vn', 'div', {'class_': 'td-post-content'}),               # 47
    ('www.phunuonline.com.vn', 'div', {'id': 'newscontents'}),          # 48
    ('www.quochoitv.vn', 'div', {'class_': 'uk-article'}),              # 49
]

# Used for every URL that matches none of the fragments above.
_DEFAULT_SELECTOR = ('div', _ARTICLE_BODY)


def _extract_paragraphs(url, tag, selector, timeout):
    """Fetch ``url`` and join the text of all <p> elements inside one container.

    The container is the first ``tag`` element matching the ``selector``
    keyword filters. Returns ``None`` when the status code is not 200, the
    container is missing, or it holds no <p> element.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # Parse the raw bytes as UTF-8 rather than relying on encoding detection
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    container = soup.find(tag, **selector)
    if not container:
        return None

    paragraphs = container.find_all('p')
    if not paragraphs:
        return None
    return '\n'.join(p.get_text() for p in paragraphs)


def get_paragraphs(url, timeout=30):
    """Return the article body of ``url`` as newline-separated paragraphs.

    The HTML container holding the article differs between news sites, so the
    URL is matched against ``_SITE_SELECTORS`` (first matching fragment wins)
    to decide which element to read. URLs of unknown sites fall back to
    ``<div itemprop="articleBody">``.

    Args:
        url: Address of the article.
        timeout: Seconds to wait for the server before giving up, so a stalled
            server cannot block the whole scraping run.

    Returns:
        The paragraph texts joined with '\\n', or ``None`` when the page could
        not be read or the expected container/paragraphs are absent.

    Raises:
        requests.RequestException: Network errors and timeouts are not caught
            here and propagate to the caller.
    """
    for fragment, tag, selector in _SITE_SELECTORS:
        if fragment in url:
            return _extract_paragraphs(url, tag, selector, timeout)

    default_tag, default_selector = _DEFAULT_SELECTOR
    return _extract_paragraphs(url, default_tag, default_selector, timeout)


In [None]:
import csv


# Write a checkpoint CSV every time this many records have been processed
batch_limit = 100

# Count the number of records processed
records_processed = 0

total_records = len(df_cleaned)

# Walk through the 'Link_tin_bai' column and fetch the article text of each link.
# .items() yields the real index label of each row, so the text is written to
# the correct row even if the index is not a clean 0..n-1 range.
for position, (row_label, link_tin_bai) in enumerate(df_cleaned['Link_tin_bai'].items(), start=1):
    print(f"Processing article {position}/{total_records}")

    try:
        paragraph = get_paragraphs(link_tin_bai)
    except Exception as error:
        # Best effort: one failing site must not stop the run. Only Exception is
        # caught so that interrupting the kernel (KeyboardInterrupt) still works.
        print(f"Error fetching paragraphs from {link_tin_bai}: {error}")
        paragraph = None

    if not paragraph:
        print("No paragraph in link: " + link_tin_bai)
        article_text = ""
    else:
        article_text = preprocess_text(paragraph)

    if article_text != '':
        # Assign the fetched text content to the corresponding row in the new column
        df_cleaned.at[row_label, 'Noi_dung_tin_bai'] = article_text

    # Increment the count of records processed
    records_processed += 1

    # Check if the batch limit is reached
    if records_processed % batch_limit == 0:
        name_csv = f"{path_output_folder}{records_processed}_noidung_tinbai.csv"
        df_cleaned.to_csv(name_csv, index=False, encoding='utf-8')
        print(f"{name_csv} has been created.")

# Save the complete DataFrame. The name is rebuilt from the final count: this
# keeps the CSV and Excel names in sync, avoids overwriting the last checkpoint
# under a misleading name, and works when fewer than batch_limit records were
# processed (no checkpoint name exists in that case).
name_csv = f"{path_output_folder}{records_processed}_noidung_tinbai.csv"
df_cleaned.to_csv(name_csv, index=False, encoding='utf-8')
df_cleaned.to_excel(f"{path_output_folder}{records_processed}_noidung_tinbai.xlsx", index = False)