In [1]:
import re
import pandas as pd
import argparse
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
import numpy as np
import scipy
import gc
import os  


# Load the tokenizer and the masked-language-model head for the
# "hfl/chinese-roberta-wwm-ext" checkpoint. Both objects are notebook-level
# globals that get_prediction() reads.
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
model = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("chinese-roberta model loaded on %s"%device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


chinese-roberta model loaded on cpu


In [None]:
# Define the target terms
terms = ['他', '她', '它']

def get_prediction(original_sent, masked_sent):
    """Return the model's probabilities for each target term at the first [MASK].

    Args:
        original_sent: The unmasked sentence. Kept for backward compatibility
            with existing callers but not used: the earlier version ran it
            through the model and discarded the output, which cannot influence
            the separate forward pass on ``masked_sent`` and only doubled the
            inference cost.
        masked_sent: The sentence with an entity replaced by the literal
            string ``[MASK]``.

    Returns:
        A NumPy array of length ``len(terms)`` holding the softmax probability
        of each term at the first mask position. All zeros when ``masked_sent``
        contains no ``[MASK]`` (either as text or after tokenization).

    Raises:
        ValueError: If any target term is missing from the tokenizer vocabulary.
    """
    if '[MASK]' not in masked_sent:
        return np.zeros((len(terms),))

    # Build the vocabulary dict once instead of once per term.
    vocab = tokenizer.get_vocab()
    target_inds = [vocab.get(x, -1) for x in terms]
    if -1 in target_inds:
        raise ValueError("Some terms are not in the tokenizer vocabulary!")

    masked_input_ids = tokenizer.encode(masked_sent, return_tensors='pt').to(device)

    # Positions of the mask token inside the single encoded sequence.
    mask_positions = (masked_input_ids[0] == tokenizer.mask_token_id).nonzero().flatten()
    if mask_positions.numel() == 0:
        return np.zeros((len(terms),))

    # One forward pass is enough: the input does not change between mask positions.
    with torch.no_grad():
        logits = model(masked_input_ids).logits

    # Only the first mask position is reported, matching the previous return value.
    first_mask_logits = logits[0, int(mask_positions[0])]
    vocab_probs = torch.softmax(first_mask_logits, dim=0)

    return vocab_probs[target_inds].cpu().numpy()

In [None]:
def parse_sentences_from_file(input_filename, entities_file, text_column_name, id_column_name, output_filename, total_sentences_filename):
    """Split texts into sentences and write one masked sentence per entity hit.

    Args:
        input_filename: CSV file holding the texts.
        entities_file: UTF-8 text file with one entity per line. Blank lines
            and repeated entities are ignored.
        text_column_name: Column of ``input_filename`` containing the text.
        id_column_name: Column of ``input_filename`` containing the text id.
        output_filename: CSV written with columns ``sentence``,
            ``masked_sentence``, ``text_id`` and ``original_term``; one row per
            entity occurrence, each with exactly that occurrence replaced by
            ``[MASK]``.
        total_sentences_filename: CSV written with columns ``text_id`` and
            ``sentence``; one row per parsed sentence.

    Rows whose text cell is not a string (for example an empty CSV cell, which
    pandas reads as NaN) are skipped.
    """
    with open(entities_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        # Drop blank lines: an empty entity yields a pattern that matches at
        # every position and floods the output with junk [MASK] rows.
        # dict.fromkeys removes repeated entities while keeping file order.
        entities = sorted(dict.fromkeys(e for e in stripped_lines if e), key=len, reverse=True)

    # Compile each entity pattern once rather than once per sentence.
    entity_patterns = []
    for entity in entities:
        if re.search(r'[\u4e00-\u9fff]', entity):  # For Chinese terms
            pattern = r'(' + re.escape(entity) + r')'
        else:  # For English terms: refuse matches glued to letters, digits or dots
            pattern = r'(?<![a-zA-Z0-9.])(' + re.escape(entity) + r')(?![a-zA-Z0-9.])'
        entity_patterns.append((entity, re.compile(pattern)))

    # A sentence runs up to a newline or a terminating 。/！, plus a closing 」
    # that directly follows the terminator. The previous two-branch pattern
    # could never reach its ！ branch because the 。 branch always matched
    # first (possibly as an empty match), so ！ never ended a sentence.
    sentence_pattern = re.compile(r'[^。！\n]*[。！]?」?(?=\n|$|[^」])')

    def custom_sentence_split(text):
        """Return the non-empty, whitespace-stripped sentences of ``text``."""
        pieces = (match.group(0).strip() for match in sentence_pattern.finditer(text))
        return [piece for piece in pieces if piece]

    df = pd.read_csv(input_filename)

    results = []
    total_sentences_data = []
    total_sentences = 0

    for _, row in df.iterrows():
        text = row[text_column_name]
        text_id = row[id_column_name]

        # Empty cells come back from pandas as float NaN; regexes need a string.
        if not isinstance(text, str):
            continue

        sentences = custom_sentence_split(text)
        total_sentences += len(sentences)
        total_sentences_data.extend({"text_id": text_id, "sentence": s} for s in sentences)

        for sentence in sentences:
            positions = [
                (match.start(), match.end(), entity)
                for entity, compiled in entity_patterns
                for match in compiled.finditer(sentence)
            ]

            # Sort by the position of entity occurrences
            positions.sort(key=lambda x: x[0])

            # Mask based on the original sentence, masking only one entity at a time
            for start, end, entity in positions:
                results.append({
                    "sentence": sentence,
                    "masked_sentence": sentence[:start] + "[MASK]" + sentence[end:],
                    "text_id": text_id,
                    "original_term": entity
                })

    print(f"Total sentences parsed: {total_sentences}")

    # Explicit column lists keep the CSV header present even when nothing was
    # found, so the files stay readable by pd.read_csv.
    total_sentences_df = pd.DataFrame(total_sentences_data, columns=["text_id", "sentence"])
    total_sentences_df.to_csv(total_sentences_filename, index=False, encoding='utf-8')
    print(f"Total sentences saved to {total_sentences_filename}")

    output_df = pd.DataFrame(results, columns=["sentence", "masked_sentence", "text_id", "original_term"])
    output_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f"{len(output_df)} sentences containing entities processed and saved to {output_filename}")


In [None]:
# Run the sentence parser. The file names below are placeholders to be replaced
# with real paths; "content" and "id" are the text and id columns read from the
# input CSV.
parse_sentences_from_file(
    input_filename="Dataset_filename",
    entities_file="entities.txt",
    text_column_name="content",
    id_column_name="id",
    output_filename="Output_filename",
    total_sentences_filename="Total_output_filename"
)

In [None]:
# Show floats with seven decimals when pandas renders them.
pd.options.display.float_format = '{:.7f}'.format

def get_probs(filename):
    """Score every masked sentence in a CSV and save the pronoun probabilities.

    Reads ``filename`` (expects ``sentence`` and ``masked_sentence`` columns),
    adds one column per pronoun initialised to 0.0, fills each row with the
    values returned by ``get_prediction`` and writes the result to
    ``'updated_' + filename``. Rows whose scoring raises are reported on stdout
    and keep their 0.0 values.
    """
    pronouns = ['他', '她', '它']
    scored = pd.read_csv(filename)

    # Every pronoun column starts at 0.0 so failed rows still carry a value.
    for pronoun in pronouns:
        scored[pronoun] = 0.0

    for row_label, record in scored.iterrows():
        try:
            # The full sentence is passed along with the masked one.
            probabilities = get_prediction(record['sentence'], record['masked_sentence'])

            if len(probabilities) != len(pronouns):
                print(f"Warning: Unexpected scores length for sentence '{record['masked_sentence']}'")
                continue

            for pronoun, probability in zip(pronouns, probabilities):
                scored.at[row_label, pronoun] = probability

        except Exception as e:
            print(f"Error processing sentence '{record['masked_sentence']}': {e}")

    # The prefix is applied to the whole path string, so a directory component
    # in `filename` becomes a differently named directory that may need creating.
    output_filename = 'updated_' + filename
    parent_dir = os.path.dirname(output_filename)
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    scored.to_csv(output_filename, float_format='%.7f', index=False)


In [None]:
# Score the parser output; get_probs writes its result to 'updated_Output_filename'.
get_probs('Output_filename')

In [None]:
## Remove duplicates
# Keep the first row for every distinct masked sentence and report how many
# rows were dropped.
raw_df = pd.read_csv('updated_Output_filename')
original_len = len(raw_df)
# Every row whose masked sentence occurs more than once (all copies kept).
# NOTE(review): duplicate_rows / selected_columns are not used again in this
# cell — presumably left over from manual inspection; confirm before removing.
duplicate_rows = raw_df[raw_df['masked_sentence'].duplicated(keep=False)]
selected_columns = duplicate_rows[['masked_sentence', 'text_id']]

raw_df_unique = raw_df.drop_duplicates(subset='masked_sentence', keep='first')
unique_len = len(raw_df_unique)
print("Number of unique entries after dropping duplicates:", unique_len)
dropped_rows = original_len - unique_len
print("Number of dropped rows:", dropped_rows)
# The de-duplicated rows are written to 'unique_dataset' (no file extension).
raw_df_unique.to_csv('unique_dataset', index=False)

In [None]:
# threshold filtering

# NOTE(review): this reads 'updated_Output_filename', i.e. the scores before
# de-duplication, not the 'unique_dataset' file written by the de-duplication
# cell — confirm which input is intended.
df = pd.read_csv('updated_Output_filename')
original_len = len(df) 
print(f'原始資料筆數: {original_len}')

# Probability columns that the threshold is applied to.
columns_to_check = ['他', '她', '它']
# Base threshold. NOTE(review): 21128 is presumably the tokenizer vocabulary
# size, which would make this the uniform-chance probability — TODO confirm.
thresh = 1 / 21128

def filter_any_column_in_range(df, columns, ranges):
    """Return the rows of ``df`` where any listed column lies in any range.

    Args:
        df: DataFrame to filter.
        columns: Column names to test.
        ranges: Iterable of ``(min_val, max_val)`` pairs; both bounds inclusive.

    Returns:
        The subset of ``df`` (original index kept) where at least one column
        value falls inside at least one range. Empty when ``columns`` or
        ``ranges`` is empty.
    """
    # One boolean mask per (range, column) pair; between() is inclusive on both ends.
    pair_masks = [
        df[name].between(lower, upper)
        for lower, upper in ranges
        for name in columns
    ]

    # Start from "no row selected" and OR the individual masks together.
    selected = pd.Series(False, index=df.index)
    for pair_mask in pair_masks:
        selected = selected | pair_mask

    return df[selected]

# Per-column count from the previous factor; every column starts at the full row count.
previous_counts = dict.fromkeys(columns_to_check, original_len)

# Sweep the threshold from 1x to 30x the base value and report, per column,
# how many rows still pass and how many were lost since the previous factor.
for factor in range(1, 31):
    ranges_to_check = [(factor * thresh, 1.0)]
    filtered_df = filter_any_column_in_range(df, columns_to_check, ranges_to_check)

    print(f'倍數 * {factor}')
    print(f'篩選後的筆數: {len(filtered_df)}')

    for col in columns_to_check:
        # Rows in which this particular column is inside the inclusive range.
        col_condition = filtered_df[col].between(factor * thresh, 1.0)
        current_count = col_condition.sum()

        # Drop relative to the previous factor for the same column.
        rows_deducted = previous_counts[col] - current_count

        if factor != 1:
            print(f'{col} 符合條件的筆數: {current_count} (減 {rows_deducted} 筆) ({round(rows_deducted/original_len * 100,4)}%)')
        else:
            # The first round has nothing to compare against, so only the count is shown.
            print(f'{col} 符合條件的筆數: {current_count}')

        previous_counts[col] = current_count

    print()

In [None]:
# Saving factor 14 data
# Only the chosen factor is filtered: the earlier loop recomputed all 30
# factors just to save one, and left `filtered_df` holding the factor-30 rows.
round_to_save = 14

ranges_to_check = [(round_to_save * thresh, 1.0)]
filtered_df = filter_any_column_in_range(df, columns_to_check, ranges_to_check)

saved_filename = f'f{round_to_save}dataset.csv'
filtered_df.to_csv(saved_filename, index=False)
# Report the name that was actually written (the old message said round_<n>.csv).
print(f'File saved as {saved_filename}')

In [None]:
# Calculate anthroscores A(sx)

def get_anthroscores(sentence_filename):
    """Compute a per-row anthroscore and save it under ``updated_Dataset/``.

    The score is ``log(P(他) + P(她)) - log(P(它))``. Probabilities that are
    not strictly positive are replaced by 1e-8 before taking the logarithm.

    Args:
        sentence_filename: CSV with the probability columns ``他``, ``她`` and
            ``它``.

    The result is written to ``updated_Dataset/anthro_<basename>``; the
    directory is created when it does not exist. The probability and score-input
    columns are stored as text with six decimals, while ``anthroscore`` keeps
    full precision.
    """
    df = pd.read_csv(sentence_filename)

    # Floor keeps log() finite when a probability is zero.
    floor = 1e-8
    human_total = df['他'] + df['她']
    nonhuman_total = df['它']
    df['human_scores'] = np.where(human_total > 0, human_total, floor)
    df['nonhuman_scores'] = np.where(nonhuman_total > 0, nonhuman_total, floor)

    df['anthroscore'] = np.log(df['human_scores']) - np.log(df['nonhuman_scores'])

    # Format after the score is computed so rounding cannot affect it.
    for col in ['他', '她', '它', 'human_scores', 'nonhuman_scores']:
        df[col] = df[col].map('{:.6f}'.format)

    # Create the target directory up front; to_csv does not create it.
    output_dir = 'updated_Dataset'
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, 'anthro_' + os.path.basename(sentence_filename))
    df.to_csv(output_filename, index=False)

In [None]:
# The save cell writes the factor-14 subset as f'f{factor}dataset.csv', i.e.
# 'f14dataset.csv'; the previous argument '14dataset.csv' named a file that
# no cell in this notebook creates.
get_anthroscores('f14dataset.csv')

In [None]:
## Compute Average Score
def compute_average_scores(input_file, output_file, text_id_name):
    """Average the anthroscore per text id and keep one row per id.

    Args:
        input_file: Path of a CSV (name must end with ``csv``) that has an
            ``anthroscore`` column and the id column named by ``text_id_name``.
        output_file: Path the one-row-per-id CSV is written to.
        text_id_name: Name of the column identifying the source text.

    Returns:
        A DataFrame holding the first row of every text id plus an
        ``Average_anthroscore`` column. For ids that occur more than once this
        is the sum of their anthroscores divided by their row count; ids that
        occur once keep their own anthroscore.

    Raises:
        ValueError: If ``input_file`` does not end with ``csv``. Previously
            this case fell through to an unbound-variable error.
    """
    if not input_file.endswith('csv'):
        raise ValueError(f"Unsupported input file (expected a CSV): {input_file!r}")

    unique_df = pd.read_csv(input_file)

    # Rows per id, broadcast back onto every row of that id.
    id_counts = unique_df[text_id_name].value_counts()
    rows_per_id = unique_df[text_id_name].map(id_counts)

    # One grouped pass replaces the per-id loop that rescanned the whole frame.
    group_sums = unique_df.groupby(text_id_name)['anthroscore'].transform('sum')
    group_average = group_sums / rows_per_id

    # Only ids that repeat get the average; everything else keeps its own score.
    unique_df['Average_anthroscore'] = group_average.where(rows_per_id > 1, unique_df['anthroscore'])

    # Keep only one row per text_id
    unique_df = unique_df.drop_duplicates(subset=[text_id_name]).copy()
    unique_df.to_csv(output_file, index=False)

    return unique_df

In [None]:
# Usage
# NOTE(review): get_anthroscores writes to 'updated_Dataset/anthro_<basename>',
# which does not match the input name below — presumably this file comes from a
# separate run; confirm the path before executing.
input_file = 'anthro_f14_dataset08.csv'
output_file = 'avg_anthro_f14_dataset08.csv'
# Column that identifies which source text each sentence row belongs to.
text_id_name = 'text_id'  
unique_df = compute_average_scores(input_file, output_file, text_id_name)