# 1. Đọc dataframe

In [None]:
import os
import json
import pandas as pd

folder_paths = {
    'data/raw/fake': 'Fake',
    'data/raw/real': 'Real'
    }

# One DataFrame per successfully parsed JSON file; they are concatenated after the loop.
dfs = []

for relative_path, label in folder_paths.items():
    folder_path = os.path.join(os.getcwd(), relative_path)  # Folder holding this label's files
    json_files = [f for f in os.listdir(folder_path) if f.endswith('.json')]  # Keep only JSON files

    for file in json_files:
        file_path = os.path.join(folder_path, file)

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            if isinstance(data, dict):  # A single JSON object becomes a one-row frame
                data = [data]
            df = pd.DataFrame(data)

            if df.shape[0] == 0:  # Nothing to load from this file
                print(f"File {file} don't have data.")
                continue

            # Build the ID from the last two "_"-separated parts of the file name (Fake_68, Real_15, ...)
            file_parts = file.split("_")
            file_id = "_".join(file_parts[-2:]).replace(".json", "")

            # Unify "text" and "maintext" into a single "maintext" column.
            if "text" in df.columns and "maintext" in df.columns:
                # A plain rename here would leave two columns that are both called "maintext";
                # keep "maintext" and fall back to "text" only where it is missing.
                df["maintext"] = df["maintext"].fillna(df["text"])
                df = df.drop(columns=["text"])
            elif "text" in df.columns:
                df = df.rename(columns={"text": "maintext"})
            elif "maintext" not in df.columns:
                df["maintext"] = ""

            # Flatten the author list into one comma-separated string ("" when empty or not a list)
            if "authors" in df.columns:
                df["authors"] = df["authors"].apply(lambda x: ", ".join(x) if isinstance(x, list) and len(x) > 0 else "")

            # Add id and target (Fake/Real) to the dataframe
            df['id'] = file_id
            df['target'] = label

            dfs.append(df)

        except Exception as e:
            # Best effort: report the file and keep loading the others
            print(f"Error when reading file {file}: {e}")

# Fail with an explicit message instead of pandas' generic "No objects to concatenate"
if not dfs:
    raise ValueError("No JSON data could be loaded from: " + ", ".join(folder_paths))

# Concatenate everything; ignore_index=True already yields a clean 0..n-1 index.
# Columns missing in some files are filled with empty strings.
df = pd.concat(dfs, ignore_index=True, join='outer').fillna('')


In [None]:
# Keep only the columns used in the rest of the notebook, in a fixed order
selected_columns = ['id', 'authors', 'source_domain', 'language', 'title', 'description', 'maintext', 'target']
df = df[selected_columns].reset_index(drop=True)
df.head()


In [None]:
# (number of rows, number of columns) of the loaded dataset
df.shape


# 2. Tiền xử lí dữ liệu

In [None]:
# Per-column summary of the dataset
df.describe()


The feature `language` has only 3 values (vi, en, empty) in comparison with the other features, so it doesn't play any crucial role in the Decision tree (see the Rule-based system section below).

In [None]:
# Show every row that has at least one twin once the `id` column is ignored
non_id_columns = df.columns.difference(['id'])
df[df.duplicated(subset=non_id_columns, keep=False)]


In [None]:
# Remove the row labelled 105 and renumber the index.
# NOTE(review): 105 is hard-coded — presumably taken from the duplicate check; verify
# against the current data. Because the index is renumbered afterwards, running this
# cell a second time removes whichever row is labelled 105 at that point.
df = df.drop(index=105).reset_index(drop=True)


In [None]:
# Count the rows per class to check for imbalance.
# Original author's observation: Fake and Real counts are nearly equal, so no resampling is needed.
df['target'].value_counts()  


# 3. Tìm rule-based system 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def model_tree(depth):
    """Return an unfitted decision tree that splits on entropy.

    Args:
        depth: Value forwarded as ``max_depth`` to ``DecisionTreeClassifier``.

    Returns:
        A ``DecisionTreeClassifier`` with ``criterion='entropy'`` and ``random_state=42``.
    """
    return DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=42)


def calculate_acc(dataframe, x_features, y_feature, model=model_tree):
    """Fit ``model`` on an 80/20 split and report its accuracy on the held-out part.

    Args:
        dataframe: DataFrame that contains the target column ``y_feature``.
        x_features: Feature matrix passed to ``train_test_split`` together with the target values.
        y_feature: Name of the target column in ``dataframe``.
        model: An estimator exposing ``fit``/``predict``. It is fitted in place, so the
            caller can inspect it afterwards. If the object has no ``fit`` method it is
            treated as a factory (such as the default ``model_tree``) and called with
            ``None`` as its depth to build the estimator.

    Returns:
        A string of the form ``"Accuracy of Decision Tree: 87.50%"``.
    """
    # The default value is the `model_tree` factory, not an estimator: calling `.fit`
    # on it would fail, so build the estimator first.
    if not hasattr(model, "fit"):
        model = model(None)

    X = x_features
    y = dataframe[y_feature].values

    # 80% of the rows for training, 20% for testing (fixed seed for a repeatable split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)

    # Accuracy on the held-out 20%
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return f"Accuracy of Decision Tree: {accuracy:.2%}"


def plot_decision_tree(model, x_features):
    """Draw a fitted tree on a 12x8 inch figure and show it.

    Args:
        model: Fitted tree passed to ``plot_tree``.
        x_features: Forwarded to ``plot_tree`` as ``feature_names``.

    The classes are labelled "Fake" and "Real"; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(12, 8))
    plot_tree(
        model,
        feature_names=x_features,
        class_names=["Fake", "Real"],
        filled=True,
        ax=ax,
    )
    plt.show()


## 3.1 Tìm RB dựa trên `source_domain` và `authors`

In [None]:
# Work on an independent copy: new columns are added to df1 later, and assigning to a
# plain slice of `df` triggers pandas' SettingWithCopyWarning.
df1 = df[['authors', 'source_domain', 'target']].copy()
df1.shape


In [None]:
# Encode source_domain and authors using LabelEncoder
from sklearn.preprocessing import LabelEncoder

# One encoder per column so each mapping can be inspected independently later
encoder_source = LabelEncoder()
encoder_authors = LabelEncoder()

# Append the integer codes as two new columns
df1 = df1.assign(
    source_domain_encoded=encoder_source.fit_transform(df1["source_domain"]),
    authors_encoded=encoder_authors.fit_transform(df1["authors"]),
)


In [None]:
tree_model1 = model_tree(depth=4)

# Feature matrix: one column per encoded feature, in the order listed here
feature_cols = ["source_domain_encoded", "authors_encoded"]
X_le = df1[feature_cols].to_numpy()

print(calculate_acc(df1, X_le, 'target', model=tree_model1))

# Pass the feature *names* (not the data matrix) so the tree nodes are labelled correctly.
# The plot function returns nothing, so it is not wrapped in print().
plot_decision_tree(tree_model1, feature_cols)


In [None]:
# Lookup table: encoded source_domain value -> original domain string
source_pairs = pd.DataFrame({
    "source_domain_encoded": df1["source_domain_encoded"],
    "source_domain_original": df["source_domain"],
})
df_mapping_source = source_pairs.drop_duplicates().sort_values(by="source_domain_encoded", ascending=True)
df_mapping_source


In [None]:
# Save the mapping table for source_domain as a txt file.
# The encoding is explicit (as for the authors mapping) so the file does not depend on
# the platform default and cannot fail on non-ASCII domain names.
with open("mapping_source.txt", "w", encoding="utf-8") as f:
    f.write(df_mapping_source.to_string(index=False))


In [None]:
# df[df['source_domain'].str.contains('giadinhtiepthi.com', case=False, na=False)]


In [None]:
# Lookup table: encoded authors value -> original authors string
author_pairs = pd.DataFrame({
    "authors_encoded": df1["authors_encoded"],
    "authors_original": df["authors"],
})
df_mapping_authors = author_pairs.drop_duplicates().sort_values(by="authors_encoded", ascending=True)
df_mapping_authors


In [None]:
# Write the authors lookup table to a UTF-8 text file (row index omitted)
authors_table_text = df_mapping_authors.to_string(index=False)
with open("mapping_authors.txt", "w", encoding="utf-8") as f:
    f.write(authors_table_text)
    

## 3.2 Tìm RB dựa trên `title`

In [None]:
# Title text plus the label: the only columns needed for the title-based rules
df2 = df.loc[:, ['title', 'target']]
df2.shape


In [None]:
def read_ngrams(file_path):
    """Load n-grams from a text file, one n-gram per line.

    Args:
        file_path: Path to a UTF-8 text file (with or without a BOM).

    Returns:
        A list of the non-empty lines with surrounding whitespace removed, in file order.
        Blank lines are skipped so that no empty-string n-gram ends up in the vocabulary.
    """
    # "utf-8-sig" reads plain UTF-8 too and drops a leading BOM if one is present
    with open(file_path, "r", encoding="utf-8-sig") as f:
        stripped = (line.strip() for line in f)
        return [ngram for ngram in stripped if ngram]

# Load the bi-, tri- and four-gram dictionaries from the "dictionaries" folder
dictionaries_path = os.path.join(os.getcwd(), "dictionaries")
bi_grams, tri_grams, four_grams = (
    read_ngrams(os.path.join(dictionaries_path, file_name))
    for file_name in ("bi_gram.txt", "tri_gram.txt", "four_gram.txt")
)

# Merge the three lists into one de-duplicated vocabulary
all_ngrams = {*bi_grams, *tri_grams, *four_grams}

print("The number of n-grams:", len(all_ngrams))


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Titles as plain Python strings
titles = df2["title"].astype(str).tolist()

# TF-IDF restricted to the n-grams loaded from the dictionary files
vectorizer = TfidfVectorizer(ngram_range=(2, 4), vocabulary=all_ngrams)
X_tfidf = vectorizer.fit_transform(titles)

# Mean TF-IDF weight of each n-gram over all titles
feature_names = vectorizer.get_feature_names_out()
tfidf_scores = np.asarray(X_tfidf.mean(axis=0)).ravel()

# One row per n-gram, highest mean weight first
df2_tfidf = (
    pd.DataFrame({'N-gram': feature_names, 'TF-IDF Score': tfidf_scores})
    .sort_values(by="TF-IDF Score", ascending=False)
)

df2_tfidf.head(12)


In [None]:
# Select the 300 n-grams with the highest mean TF-IDF weight
top_ngrams = df2_tfidf.nlargest(300, 'TF-IDF Score')["N-gram"].tolist()


# Build a TF-IDF feature matrix from df2['title'] restricted to those n-grams
vectorizer_top = TfidfVectorizer(ngram_range=(2, 4), vocabulary=top_ngrams)
# Dense array with one row per title and one column per selected n-gram
X_tfidf_top = vectorizer_top.fit_transform(df2["title"]).toarray()


# Fit a tree (max_depth=50) on the title features and print its held-out accuracy
tree_model2 = model_tree(depth=50)
print(calculate_acc(df2, X_tfidf_top, 'target', model=tree_model2))
# Optional graphical view of the tree (disabled):
# print(plot_decision_tree(tree_model2, vectorizer_top.get_feature_names_out()))

# Print the learned rules as text, with the n-grams as feature names
feature_names = vectorizer_top.get_feature_names_out().tolist()
print(export_text(tree_model2, feature_names=feature_names))
# Alternative without converting the names to a list (disabled):
# print(export_text(tree_model2, feature_names=vectorizer_top.get_feature_names_out()))

# 4. Tổng hợp rule-based system

In [None]:
import string
import math  

def rule_based_classification(source, author, text_title):
    """Score a news item as fake/real with hand-written rules.

    Two partial scores are added together and the sum is capped at 1:

    * source/author: 0.9 when both the source and the author are on the fake lists,
      0.45 when only one of them is, 0 otherwise;
    * title: ``1 - 0.75 ** n`` where ``n`` is the number of violations, i.e. keywords
      whose TF-IDF weight in the title exceeds their threshold plus the number of
      distinct punctuation marks (other than ``.`` and ``,``) found in the title.

    Args:
        source: Source domain of the article, compared exactly against the fake list.
        author: Author string; a non-string value is treated as an empty author.
        text_title: Article title; a non-string value is treated as an empty title.

    Returns:
        ``{'fake_score': float, 'real_score': float}``, both rounded to 2 decimals,
        with ``real_score == 1 - fake_score`` and both inside [0, 1].
    """
    # List of fake news sources
    fake_sources = {
        '2sao.vn', 'anninh247.xyz', 'autoxe.net', 'baoangiang.com.vn', 'baonuocmy.com',
        'baophapluat.online', 'binhluan.biz', 'blogxcy.wordpress.com', 'cand.com.vn', 'congtintuc24gio.com',
        'thoibao.de', 'thoibao.today', 'tingame.info', 'tintuconline.com.vn', 'tintucqpvn.net',
        'tinvn.info', 'trumpandq.blogspot.com', 'tuvanannam.com', 'vietgiaitri.com', 'vinaexpress.com.vn'}

    # List of suspicious authors
    # NOTE(review): matching is by substring, so the one-letter entry 'J' matches any
    # author string that contains a capital J — confirm this is intended.
    fake_author_keywords = {'An Bình', 'CÔNG TRUNG', 'Cùng Tác Giả', 'D.KIM THOA', 'Gioi Tre Viet',
                            'Hiếu Công', 'Hoàng Vy Thế Giới Trẻ', 'Hạ Huyền', 'J', 'Daisy'}

    # Keywords and TF-IDF threshold: Above this means fake
    keywords = {'sự thật': 0.17,
                'con gái': 0.22,
                'kinh dị': 0.25,
                'tài xế': 0.24,
                'bí mật': 0.25,
                'đàn ông': 0.43,
                'tất cả': 0.18,
                'báo mộng': 0.25,
                'lý do': 0.26,
                'cải cách': 0.28,
                'cảnh tượng': 0.20}

    # List of punctuation marks to check (excluding period . and comma ,)
    punctuation_to_check = set(string.punctuation) - {'.', ','}

    # Missing metadata (None, NaN, ...) is treated as empty text instead of crashing
    author = author if isinstance(author, str) else ""
    text_title = text_title if isinstance(text_title, str) else ""

    # Rule 1: Determine the Fake ratio based on source and author
    fake_source = source in fake_sources
    fake_author = any(keyword in author for keyword in fake_author_keywords)

    if fake_source and fake_author:
        fake_score_source_author = 1 * 0.9  # Fake both source and author
    elif fake_source or fake_author:
        fake_score_source_author = 0.5 * 0.9  # Fake just source or author
    else:
        fake_score_source_author = 0  # No Fake in both

    # Rule 2: Calculate TF-IDF to check keywords.
    # The keywords are multi-word phrases, so the vectorizer must generate n-grams of
    # the same lengths; with the default unigram range none of them could ever match.
    ngram_sizes = [len(keyword.split()) for keyword in keywords]
    vectorizer = TfidfVectorizer(vocabulary=list(keywords),
                                 ngram_range=(min(ngram_sizes), max(ngram_sizes)))
    tfidf_matrix = vectorizer.fit_transform([text_title])

    fake_score_keywords = 0
    count_violations = 0

    feature_names = vectorizer.get_feature_names_out()
    for idx, word in enumerate(feature_names):
        if tfidf_matrix[0, idx] > keywords[word]:
            count_violations += 1

    # Check punctuation directly in text_title (each distinct mark counts once)
    count_punctuation_violations = sum(1 for p in punctuation_to_check if p in text_title)
    count_violations += count_punctuation_violations

    # Calculate the Fake score based on the number of violations
    if count_violations > 0:
        fake_score_keywords = 1 - math.pow(0.75, count_violations)

    # Combine both partial scores; cap at 1 so real_score can never become negative
    fake_score = min(1.0, fake_score_source_author + fake_score_keywords)
    real_score = 1 - fake_score

    return {'fake_score': round(fake_score, 2), 'real_score': round(real_score, 2)}


In [None]:
# Try the rule-based classifier on one example: a source from the fake list,
# no author, and a title that contains quotation marks
source = 'thoibao.today'
author = ''
text_title = 'Khám phá sửng sốt về trí nhớ \"như thần\" của động vật' 

result = rule_based_classification(source, author, text_title)
result
# result['fake_score']  # Show only the fake score

# 5. Fine-tune LLM 

Dùng PhoBERT để huấn luyện trên bộ tin tức tiếng Việt

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint used for both the tokenizer and the model
MODEL_NAME = "vinai/phobert-base-v2"

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model configured with a single output label
bert = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)


## 5.1 Tạo ngẫu nhiên giá trị cho cột fake_score trong df

Tin tức có target là 'Real' sẽ được gán ngẫu nhiên giá trị fake_score trong khoảng từ 0 đến 0.1, còn tin 'Fake' sẽ được gán giá trị trong khoảng từ 0.11 đến 1.0.

In [None]:
def assign_fake_score(target):
    """Draw a random fake score for one row from its class label.

    Args:
        target: Class label; "Fake" selects the upper range, anything else the lower one.

    Returns:
        A value drawn with ``np.random.uniform`` from (0.11, 1.0) for "Fake"
        and from (0.0, 0.1) otherwise.
    """
    low, high = (0.11, 1.0) if target == "Fake" else (0.0, 0.1)
    return np.random.uniform(low, high)

# Seed NumPy's global generator first: the train/validation/test splits are seeded,
# so the randomly drawn regression targets must be repeatable too; otherwise every
# run of the notebook trains on different labels.
np.random.seed(42)

# Apply function to create fake_score column
df["fake_score"] = df["target"].apply(assign_fake_score)

df.head()


## 5.2 Chia dataset ra tập train, validation và test

Chia tập dataset Train-Validation-Test theo ratio 70:15:15

In [None]:
# Step 1: keep 70% of the rows for training and put the other 30% in a hold-out pool
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    df['maintext'], df['fake_score'],
    test_size=0.3,
    random_state=2018,
)

# Step 2: cut the hold-out pool in half -> 15% validation and 15% test
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels,
    test_size=0.5,
    random_state=2018,
)

## 5.3 Tokenize dataset

In [None]:
%pip install torch
import torch

In [None]:
MAX_LENGTH = 218


def _encode_texts(texts):
    """Tokenize a Series of texts, padded/truncated to exactly MAX_LENGTH tokens."""
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
    )


# Tokenize and encode the three splits with identical settings
tokens_train = _encode_texts(train_texts)
tokens_val = _encode_texts(val_texts)
tokens_test = _encode_texts(test_texts)

# Token ids and attention masks as tensors
train_seq = torch.tensor(tokens_train['input_ids'])
train_mask = torch.tensor(tokens_train['attention_mask'])
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])

# Targets reshaped to (num_samples, 1)
train_y = torch.tensor(train_labels.tolist()).view(-1, 1)
val_y = torch.tensor(val_labels.tolist()).view(-1, 1)
test_y = torch.tensor(test_labels.tolist()).view(-1, 1)

# Flat tensor versions of the label objects (separate tensors from the *_y ones)
train_labels = torch.tensor(train_labels.tolist())
val_labels = torch.tensor(val_labels.tolist())
test_labels = torch.tensor(test_labels.tolist())

## 5.4 Dựng Dataloader

In [None]:
# Data Loader structure definition
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
batch_size = 32  # number of samples per batch for all three loaders

# Wrap (token ids, attention mask, target) tensors into datasets
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
test_data = TensorDataset(test_seq, test_mask, test_y)

# Training batches are drawn in random order; validation and test keep dataset order
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

# One loader per split
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

## 5.5 Dựng model

Đảm bảo là gradient vẫn sẽ được tính qua các layer

In [None]:
# Mark every parameter of the pretrained model as trainable
for weight in bert.parameters():
    weight.requires_grad_(True)

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW

class PhoBERTRegressor(nn.Module):
    """Regression wrapper that squashes the wrapped model's logits into (0, 1).

    Args:
        bert: Callable model invoked as ``bert(input_ids, attention_mask=...)`` whose
            result exposes a ``logits`` attribute.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.sigmoid = nn.Sigmoid()  # maps the raw logits into the open interval (0, 1)

    def forward(self, input_ids, attention_mask):
        """Return ``sigmoid(logits)`` of the wrapped model for one batch."""
        raw_scores = self.bert(input_ids, attention_mask=attention_mask).logits
        return self.sigmoid(raw_scores)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Wrap the pretrained model and move it to the chosen device
model = PhoBERTRegressor(bert)
model.to(device)

# AdamW over all model parameters, mean squared error as the regression loss
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss().to(device)

# Number of passes over the training set
epochs = 2

## 5.6 Dựng training loop

In [None]:
def train():
    """Run one pass over ``train_dataloader`` and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn`` and ``device``.
    """
    model.train()
    running_loss = 0
    n_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):
        # Report progress every 50 batches (but not before the first one)
        if step and step % 50 == 0:
            print(f'  Batch {step:,} of {n_batches:,}.')

        # Everything in the batch goes to the model's device; targets must be float
        sent_id, mask, scores = (t.to(device) for t in batch)
        scores = scores.float()

        # Reset gradients left over from the previous step
        model.zero_grad()

        # Forward pass and loss
        loss = loss_fn(model(input_ids=sent_id, attention_mask=mask), scores)
        running_loss += loss.item()

        # Backward pass, gradient-norm clipping at 1.0, then the parameter update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # Mean loss per batch
    return running_loss / n_batches


def evaluate():
    """Compute the mean batch loss on ``val_dataloader`` without updating the model.

    Uses the notebook-level ``model``, ``loss_fn`` and ``device``.
    """
    print("\nEvaluating...")
    model.eval()
    running_loss = 0
    n_batches = len(val_dataloader)

    # No gradients are needed for validation
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):
            # Report progress every 50 batches (but not before the first one)
            if step and step % 50 == 0:
                print(f'  Batch {step:,} of {n_batches:,}.')

            # Everything in the batch goes to the model's device; targets must be float
            sent_id, mask, scores = (t.to(device) for t in batch)
            scores = scores.float()

            # Forward pass and loss
            running_loss += loss_fn(model(sent_id, mask), scores).item()

    # Mean loss per batch
    return running_loss / n_batches


## 5.7 Huấn luyện model

In [None]:
best_valid_loss = float('inf')
train_losses, valid_losses = [], []   # per-epoch loss history

for epoch in range(epochs):
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # One training pass followed by one validation pass
    train_loss = train()
    valid_loss = evaluate()
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Keep the weights of the epoch with the lowest validation loss so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'model_weights.pt')

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


## 5.8 Chạy bộ test data với model

In [None]:
def predict_from_dataloader(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect its outputs.

    Args:
        model: Object with an ``eval()`` method that is callable as
            ``model(input_ids, attention_mask)`` and returns a tensor.
        dataloader: Iterable of batches. The first two items of each batch are the
            input ids and the attention mask; any further items (e.g. labels) are ignored,
            so batches with or without labels are both accepted.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        A flat list with one predicted score per sample, in iteration order.
    """
    model.eval()  # Set model to evaluation mode
    predictions = []

    with torch.no_grad():  # No gradient calculation needed
        for batch in dataloader:
            # Index instead of unpacking so that batches without labels work too
            input_ids, attention_mask = batch[0], batch[1]

            # Move inputs to the same device as the model
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Flatten the batch output to one score per sample
            outputs = model(input_ids, attention_mask)
            scores = outputs.cpu().numpy().flatten()

            predictions.extend(scores)

    return predictions
test_predictions = predict_from_dataloader(model, test_dataloader, device)

# Pair each test text with its prediction. The model is trained on `fake_score`,
# so its output is a fake score (higher = more likely fake), not a "realness" score;
# the column is named accordingly.
test_df = pd.DataFrame({"text": test_texts, "predicted_fake_score": test_predictions})

# Save results to CSV
test_df.to_csv("test_predictions.csv", index=False)

# Display first few rows
print(test_df.head())


# 6. Kết hợp Rule-based và LLM

Kết hợp fake-score của Rule-based và của LLM để cho ra nhận định cuối là tin thật hay giả