In [9]:
import pandas as pd
import dask.dataframe as dd

from dask import delayed, compute
from dask.distributed import Client

import re
import sys
import csv
import time
import string
from pathlib import Path
from nltk.tokenize import word_tokenize
from nltk.translate import bleu_score
from rapidfuzz import fuzz
from rouge_score import rouge_scorer

In [10]:
# __init__
def _normalize_(x: str, remove_latex_flag: bool = True) -> str:
    REMOVE_PUNCT = str.maketrans("", "", string.punctuation)

    if remove_latex_flag:
        x = _remove_latex_(x)

    x = x.translate(REMOVE_PUNCT)
    x = re.sub(r"\s+", " ", x)
    x = x.lower()
    x = x.strip()

    return x

def _extract_latex_(text):
    if pd.isna(text):
        return []

    latex_pattern = re.compile(r'(\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\)|\\begin\{.*?\}.*?\\end\{.*?\})', re.DOTALL)

    latex_expressions = latex_pattern.findall(text)

    stripped_text = latex_pattern.sub('', text).strip()

    return latex_expressions

def _remove_latex_(text: str) -> str:
    if pd.isna(text):
        return ''
    try:
        text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)
        text = re.sub(r'\$(.*?)\$', r'\1', text)
        text = re.sub(r'\$\$(.*?)\$\$', r'\1', text)
        text = re.sub(r'\\\[(.*?)\\\]', r'\1', text)
        text = re.sub(r'\\[a-zA-Z]+', '', text)
        text = re.sub(r'\{|\}', '', text)
    except:
        text = ''

    return text

In [11]:
# tokenize()
def safe_word_tokenize(text):
    """Tokenize `text` with NLTK's `word_tokenize`; missing values give an empty list."""
    return [] if pd.isna(text) else word_tokenize(text)

def parallel_tokenize(series):
    """
    Map `safe_word_tokenize` over every element of a Dask Series.

    The result is declared to Dask as an object-dtype series named 'token'.
    """
    token_meta = ('token', 'object')
    return series.map(safe_word_tokenize, meta=token_meta)

def extract_parts(token_list):
    """
    Split a token list into its beginning, middle and end excerpts.

    The beginning and end excerpts each hold `max(1, len // 10)` tokens; the
    middle excerpt spans `max(1, len // 20)` tokens on either side of the
    midpoint. Returns the tuple `(beginning, middle, end)`.
    """
    n_tokens = len(token_list)
    edge_width = max(1, n_tokens // 10)
    half_window = max(1, n_tokens // 20)
    centre = n_tokens // 2

    head = token_list[:edge_width]
    core = token_list[centre - half_window:centre + half_window]
    tail = token_list[-edge_width:]

    return head, core, tail

In [12]:
# compute_metrics
# - BLEU
def compute_bleu(row, reference_col, candidate_col):
    """
    Sentence-level BLEU for one row.

    `row[reference_col]` is passed as the single reference and
    `row[candidate_col]` as the hypothesis to NLTK's `sentence_bleu`,
    smoothed with `SmoothingFunction().method1`.
    """
    references = [row[reference_col]]
    hypothesis = row[candidate_col]
    smoother = bleu_score.SmoothingFunction().method1
    return bleu_score.sentence_bleu(references, hypothesis, smoothing_function=smoother)

def parallel_bleu(df, reference_col, candidate_col):
    """Row-wise `compute_bleu` over `df`; the result is declared as a float64 series named 'x'."""
    def _bleu_for_row(row):
        return compute_bleu(row, reference_col, candidate_col)

    return df.apply(_bleu_for_row, axis=1, meta=('x', 'f8'))


In [13]:
# - CAR (Character Accuracy Rate)
def compute_approx_car(row, reference_col, candidate_col):
    '''
    Approximate character accuracy rate (CAR), i.e. the complement of the
    character error rate (CER): rapidfuzz's `fuzz.ratio` of the two row
    values, divided by 100.
    '''
    reference, candidate = row[reference_col], row[candidate_col]
    return fuzz.ratio(reference, candidate) / 100.0

# Compute CAR scores in parallel for the entire dataframe
def parallel_car(df, reference_col, candidate_col):
    """Row-wise `compute_approx_car` over `df`; declared as a float64 series named 'car_score'."""
    def _car_for_row(row):
        return compute_approx_car(row, reference_col, candidate_col)

    return df.apply(_car_for_row, axis=1, meta=('car_score', 'f8'))


In [14]:
# - ROUGE (NOTE: despite the `rouge1` in their names, these helpers are used to report the ROUGE-L F-measure)
def compute_rouge1(row, reference_col, candidate_col, scorer, rouge_type='rougeL'):
    """
    ROUGE F-measure for one row.

    Parameters:
        row: mapping-like row providing `reference_col` and `candidate_col`.
        reference_col, candidate_col: names of the two text columns.
        scorer: object whose `score(reference, candidate)` returns a mapping
            from ROUGE type to a result exposing `.fmeasure`.
        rouge_type: key to read from the scorer result. Defaults to 'rougeL',
            which is what this function has always returned despite its name;
            the scorer must have been created with this type.

    Null values (per `pd.notnull`) are scored as empty strings; everything
    else is converted with `str()`.
    """
    reference_text = str(row[reference_col]) if pd.notnull(row[reference_col]) else ""
    candidate_text = str(row[candidate_col]) if pd.notnull(row[candidate_col]) else ""
    scores = scorer.score(reference_text, candidate_text)
    return scores[rouge_type].fmeasure

def parallel_rouge1(df, reference_col, candidate_col, scorer):
    """Row-wise `compute_rouge1` over `df` with a shared `scorer`; declared as a float64 series named 'x'."""
    def _rouge_for_row(row):
        return compute_rouge1(row, reference_col, candidate_col, scorer)

    return df.apply(_rouge_for_row, axis=1, meta=('x', 'f8'))


In [15]:
class DaskResponseTable:
    """
    Dask-backed table of parser outputs scored against the `html` ground-truth column.

    Typical use: construct (reads the '|'-separated CSV lazily, adds `*_norm`
    and `*_latex` columns, starts a Dask client), then call `tokenize()`,
    `compute_metrics()` and `save()`. All column assignments are lazy; work is
    executed when the frame is computed or written.
    """

    def __init__(self,
                 db_src_filename: str,
                 db_dst_filename: str,
                 chunk_index: int = -1,
                 chunk_size: int = -1,
                 num_cores: int = 100,
                 overwrite_flag: bool = False,
                 parser_columns: list[str] = ['pymupdf', 'nougat', 'grobid'],
                 root_dir: Path = Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database')) -> None:
        """
        Assume `html` is ground truth text

        Parameters:
            db_src_filename: source CSV file name, relative to `root_dir`.
            db_dst_filename: destination CSV file name, relative to `root_dir`.
            chunk_index, chunk_size: when both differ from -1, only rows
                `[chunk_index * chunk_size, (chunk_index + 1) * chunk_size)`
                (positions after NaN removal) are kept and the destination
                file name gets a `_<chunk_index>-<n_rows // chunk_size>` suffix.
            num_cores: number of workers of the Dask client that is started.
            overwrite_flag: when False, an existing destination file is an error.
            parser_columns: names of the parser text columns to compare with `html`.
            root_dir: directory containing the source/destination files.

        Raises:
            AssertionError: invalid directory/file paths or chunk arguments.
            ValueError: the requested chunk starts beyond the last row.
        """

        # Raise the csv module's field size limit as far as the platform allows
        max_int = sys.maxsize
        while True:
            try:
                csv.field_size_limit(max_int)
                break
            except OverflowError:
                max_int = max_int // 10

        # Validate root directory
        root_dir = Path(root_dir)
        assert root_dir.is_dir(), f"Path `root_dir` does not exist or is not directory. Invalid directory: {root_dir}"

        # Paths
        self.db_src_path = Path(root_dir) / db_src_filename
        self.db_dst_path = Path(root_dir) / db_dst_filename
        self.overwrite_flag = overwrite_flag
        # Copy so the shared default list (or the caller's list) is never aliased
        self.parser_columns = list(parser_columns)
        self.chunk_index = chunk_index
        self.chunk_size = chunk_size
        self.num_cores = num_cores

        # Validate file paths
        assert self.db_src_path.is_file(), f"Source CSV path invalid. No such path: {self.db_src_path}"
        if not self.overwrite_flag:
            assert not self.db_dst_path.is_file(), f"Destination Path invalid. File already exists at path: {self.db_dst_path}"

        # Load DataFrame with Dask
        df = dd.read_csv(self.db_src_path, sep='|', sample=5000000, sample_rows=10, on_bad_lines='skip', engine='python')

        # Fill NaNs in text columns other than `html`/`path` ...
        for col in df.columns:
            if df[col].dtype == 'object' and col not in {'html', 'path'}:
                df[col] = df[col].fillna('')

        # ... then drop the rows that still contain NaNs. The result must be kept:
        # the original code discarded it, so no rows were ever removed.
        df = df.dropna()

        # Row count is needed several times below; every `len()` on a Dask frame
        # is a full pass over the data, so compute it exactly once.
        n_rows = len(df)
        n_selected = n_rows

        # Subset DataFrame
        if self.chunk_index != -1 and self.chunk_size != -1:
            # Check validity
            assert self.chunk_index >= 0, f"`chunk_index` should be non-negative (or -1) but is {self.chunk_index}"
            assert self.chunk_size > 0, f"`chunk_size` should be positive (or -1) but is {self.chunk_size}"

            # Overwrite self.db_dst_path
            dst = Path(db_dst_filename)
            db_dst_filename = dst.stem + f'_{self.chunk_index}-{n_rows // self.chunk_size}' + dst.suffix
            self.db_dst_path = Path(root_dir) / db_dst_filename

            # Subset DataFrame
            i_start = self.chunk_index * self.chunk_size
            i_end = min((self.chunk_index + 1) * self.chunk_size, n_rows)
            if i_start >= n_rows:
                raise ValueError(f'i_start index exceeds length of Dataframe: i_start={i_start} for len(df_proc)={n_rows}')

            # `.loc[i_start:i_end]` is label-based (and end-inclusive), and the index
            # produced by `read_csv` restarts in every partition, so it cannot select a
            # positional chunk. Build global 0-based row positions with a cumulative sum
            # and filter on the half-open range [i_start, i_end) instead.
            row_pos = df.assign(_row_pos=1)['_row_pos'].cumsum() - 1
            df = df[(row_pos >= i_start) & (row_pos < i_end)]
            n_selected = i_end - i_start

            # DEBUG
            print(f'len(df_proc): {n_selected}, i_start/i_end: ', i_start, i_end)

        # Status
        print(f'DF loaded, nrows : {n_selected} after removing NANs & subsetting')

        # Normalized text columns. `Series.map` with the function itself is used
        # instead of a row-wise lambda: the lambdas closed over the loop variable,
        # and since Dask evaluates lazily they would all have read the LAST column.
        for parser_col in ['html'] + self.parser_columns:
            df[f'{parser_col}_norm'] = df[parser_col].map(_normalize_, meta=(f'{parser_col}_norm', 'object'))

        # LaTeX text columns
        for parser_col in ['html'] + self.parser_columns:
            df[f'{parser_col}_latex'] = df[parser_col].map(_extract_latex_, meta=(f'{parser_col}_latex', 'object'))

        self.df = df

        # Initialize Dask client for parallel processing
        self.client = Client(n_workers=self.num_cores)

    def _assign_token_parts(self, series, col_prefix):
        """Add `<col_prefix>Beg_token` / `Mid_token` / `End_token` excerpts of a token-list series."""
        self.df[f'{col_prefix}Beg_token'] = series.map(
            lambda x: x[:max(1, len(x) // 10)], meta=('token', 'object'))
        self.df[f'{col_prefix}Mid_token'] = series.map(
            lambda x: x[len(x) // 2 - max(1, len(x) // 20):len(x) // 2 + max(1, len(x) // 20)], meta=('token', 'object'))
        self.df[f'{col_prefix}End_token'] = series.map(
            lambda x: x[-max(1, len(x) // 10):], meta=('token', 'object'))

    def tokenize(self,):
        """
        Tokenize html/parser text columns in the DataFrame.

        Adds `<col>_token` and `<col>_norm_token` for `html` and every parser
        column, plus beginning/middle/end excerpts of each token list.
        """
        for parser_col in ['html'] + self.parser_columns:
            print(f'Tokenizing {parser_col} ... ')
            self.df[f'{parser_col}_token'] = parallel_tokenize(self.df[parser_col])
            self.df[f'{parser_col}_norm_token'] = parallel_tokenize(self.df[f'{parser_col}_norm'])
            print('... completed!\n')

        # Extract and assign beginning, middle, and end parts
        for parser_col in ['html'] + self.parser_columns:
            self._assign_token_parts(self.df[f'{parser_col}_token'], parser_col)
            self._assign_token_parts(self.df[f'{parser_col}_norm_token'], f'{parser_col}_norm')

    def compute_metrics(self):
        """
        Add BLEU, CAR and ROUGE-L columns for every parser column.

        `html` (the ground truth) is always passed as the reference and the
        parser output as the candidate. Requires `tokenize()` to have been
        called first, because BLEU and CAR read the `*_token` columns.
        """

        # BLEU (not symmetric: the reference must be the ground truth)
        for parser_col in self.parser_columns:
            print(f'Computing BLEU for {parser_col} ... ')
            self.df[f'{parser_col}_bleu'] = parallel_bleu(self.df, 'html_token', f'{parser_col}_token')
            self.df[f'{parser_col}_norm_bleu'] = parallel_bleu(self.df, 'html_norm_token', f'{parser_col}_norm_token')
            print(f'... completed!\n')

        # CAR
        # NOTE(review): this compares the token-list columns, not the raw strings — confirm intended.
        for parser_col in self.parser_columns:
            print(f'Computing CAR for {parser_col} ... ')
            self.df[f'{parser_col}_car'] = parallel_car(self.df, 'html_token', f'{parser_col}_token')
            self.df[f'{parser_col}_norm_car'] = parallel_car(self.df, 'html_norm_token', f'{parser_col}_norm_token')
            print(f'... completed!\n')

        # ROUGE-L (the scorer is created with 'rougeL' only)
        scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
        for parser_col in self.parser_columns:
            print(f'Computing ROUGE-L for {parser_col} ... ')
            self.df[f'{parser_col}_rougeL'] = parallel_rouge1(self.df, 'html', f'{parser_col}', scorer)
            self.df[f'{parser_col}_norm_rougeL'] = parallel_rouge1(self.df, 'html_norm', f'{parser_col}_norm', scorer)
            print(f'... completed!\n')

    def save(self, output_filename='out_metrics.csv'):
        """
        Save the computed metrics DataFrame to a CSV file with '|' as the separator.

        NOTE(review): `self.db_dst_path` is not used here, and Dask may write a
        path without `*` as a directory of per-partition files — confirm.
        """
        self.df.to_csv(output_filename, sep='|', index=False)
        print(f"Metrics saved to {output_filename}.")

In [None]:
# Build the Dask-backed table for chunk 0 (chunk size 4564) of `parser_output_raw.csv`
# in the shared project directory; the constructor also starts a Dask client with 8 workers.
t = DaskResponseTable(root_dir='/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database',
                      chunk_index=0,
                      chunk_size=4564,
                      num_cores=8,
                      db_src_filename='parser_output_raw.csv', 
                      db_dst_filename='out_df_500.csv')

In [None]:
# Materialize the lazy Dask frame; the resulting frame is shown as the cell output.
t.df.compute()

In [None]:
# works
#pandas_df = t.df.compute()
#pandas_df.head()

# Register the token columns (and their beginning/middle/end excerpts) on t.df.
t.tokenize()

In [None]:
# Blank line to separate this cell's progress messages, then register the
# BLEU / CAR / ROUGE columns on t.df (requires `t.tokenize()` to have run).
print()
t.compute_metrics()

In [None]:
# Print wall-clock timestamps (seconds since the epoch) before and after the save;
# elapsed time is `end - start`.
print('start: ', time.time())
t.save('./tmp/random.csv')
print('end: ', time.time())

In [None]:
# 16 cores, tokenization only
#1725230038.5646489 - 1725229906.095144
# 8 cores, tokenization only
#1725230700.3915436 - 1725230568.392145
# 8 cores with CAR only
#1725231052.6019797 - 1725230791.3853154
# 8 cores with BLEU only
#1725231464.9768543 - 1725231148.9887695
# 8 cores with ROUGE2 only BUT ONLY 8 ROWS!
#1725238226.0218575 - 1725238104.1046085
# 8 cores with ROUGE2 (proper)
#1725239619.280274 - 1725239053.972454
# 8 cores with ROUGE1 (proper) -> 565
#1725243417.534024 - 1725242852.1266394

In [None]:
# METEOR
def compute_meteor(row, reference_col, candidate_col):
    """
    METEOR score for one row.

    `row[reference_col]` is passed as the single reference and
    `row[candidate_col]` as the hypothesis to NLTK's `meteor_score`.
    """
    # Imported here because the notebook's import cell only brings in
    # `bleu_score` from `nltk.translate`; the bare name `translate` was undefined.
    from nltk.translate import meteor_score

    return meteor_score.meteor_score(
        [row[reference_col]],
        row[candidate_col]
    )

def parallel_meteor(df, reference_col, candidate_col):
    """Row-wise `compute_meteor` over `df`; the result is declared as a float64 series named 'x'."""
    # `compute_meteor` is a module-level function; there is no `self` in this scope.
    return df.apply(lambda row: compute_meteor(row, reference_col, candidate_col), axis=1, meta=('x', 'f8'))

# CAR (Character Accuracy Rate)
def compute_approx_car(row, reference_col, candidate_col):
    '''
    Approximate character accuracy rate (CAR), the complement of the character
    error rate (CER): rapidfuzz's `fuzz.ratio` of the two row values, divided
    by 100.
    '''
    reference, candidate = row[reference_col], row[candidate_col]
    return fuzz.ratio(reference, candidate) / 100.0

def parallel_car(df, reference_col, candidate_col):
    """Row-wise `compute_approx_car` over `df`; declared as a float64 series named 'x'."""
    def _car_for_row(row):
        return compute_approx_car(row, reference_col, candidate_col)

    return df.apply(_car_for_row, axis=1, meta=('x', 'f8'))

# ROUGE scores
def compute_rouge(row, reference_col, candidate_col, scorer):
    """
    ROUGE F-measures for one row, as the list `[rouge1, rouge2, rougeL]`.

    `scorer.score(reference, candidate)` must return a mapping containing
    those three keys, each exposing `.fmeasure`.
    """
    result = scorer.score(row[reference_col], row[candidate_col])
    return [result[rouge_type].fmeasure for rouge_type in ('rouge1', 'rouge2', 'rougeL')]

def parallel_rouge(df, reference_col, candidate_col, scorer):
    """
    Row-wise `compute_rouge` over `df` with a shared `scorer`.

    Every element of the result is the list `[rouge1, rouge2, rougeL]`, so the
    series is declared with dtype 'object' (declaring 'f8' would misreport a
    list-valued column as float64 to Dask).
    """
    return df.apply(lambda row: compute_rouge(row, reference_col, candidate_col, scorer), axis=1, meta=('x', 'object'))


class FastResponseTable:
    """
    Table of parser outputs scored against the `html` ground-truth column.

    The CSV is loaded and pre-processed with pandas (NaN removal, optional
    chunking, normalized and LaTeX columns) and then converted to a Dask
    DataFrame. `tokenize_columns()` and `compute_metrics()` add lazy columns;
    `compute_metrics()` finally materializes everything into `self.df_metrics`.
    """

    def __init__(self,
                 db_src_filename:str,
                 db_dst_filename:str,
                 chunk_index:int=-1,
                 chunk_size:int=-1,
                 num_cores:int=100,
                 overwrite_flag:bool = False,
                 parser_columns:list[str]=['pymupdf', 'nougat', 'grobid'],
                 root_dir:Path=Path('/lus/eagle/projects/argonne_tpc/siebenschuh/aurora_gpt/database'),
                 n:int=-1) -> None:
        """
        Assume `html` is ground-truth text

        Parameters:
            db_src_filename: source CSV file name, relative to `root_dir`.
            db_dst_filename: destination CSV file name, relative to `root_dir`.
            chunk_index, chunk_size: when both differ from -1, only rows
                `[chunk_index * chunk_size, (chunk_index + 1) * chunk_size)`
                (positions after NaN removal) are kept and the destination
                file name gets a `_<chunk_index>-<n_rows // chunk_size>` suffix.
            num_cores: number of Dask partitions and client workers.
            overwrite_flag: when False, an existing destination file is an error.
            parser_columns: names of the parser text columns to compare with `html`.
            root_dir: directory containing the source/destination files.
            n: when positive, keep only the first `round(n)` rows.

        Raises:
            AssertionError: invalid directory/file paths or chunk arguments.
            ValueError: the requested chunk starts beyond the last row.
        """

        # validate
        root_dir = Path(root_dir)
        assert root_dir.is_dir(), f"Path`root_dir` does not exist or is not directory. Invalid directory: {root_dir}"

        # paths
        self.db_src_path = Path(root_dir) / db_src_filename
        self.db_dst_path = Path(root_dir) / db_dst_filename
        self.overwrite_flag = overwrite_flag
        self.n = n
        # copy so the shared default list (or the caller's list) is never aliased
        self.parser_columns = list(parser_columns)
        self.chunk_index = chunk_index
        self.chunk_size = chunk_size
        self.num_cores = num_cores

        # raw data
        assert self.db_src_path.is_file(), f"Source CSV path invalid. No such path: {self.db_src_path}"
        if not self.overwrite_flag:
            assert not self.db_dst_path.is_file(), f"Destination Path invalid. File already exists at path: {self.db_dst_path}"

        # load df
        df_raw = pd.read_csv(self.db_src_path, sep='|')

        # drop NA rows
        df_proc = df_raw.dropna()

        # subset DataFrame
        if self.chunk_index != -1 and self.chunk_size != -1:
            # check validity
            assert self.chunk_index >= 0, f"`chunk_index` should be non-negative (or -1) but is {chunk_index}"
            assert self.chunk_size > 0, f"`chunk_size` should be positive (or -1) but is {chunk_size}"

            # overwrite self.db_dst_path
            db_dst_filename = Path(db_dst_filename).stem + f'_{self.chunk_index}-{len(df_proc) // self.chunk_size}' + Path(db_dst_filename).suffix
            self.db_dst_path = Path(root_dir) / db_dst_filename

            # subset dataframe
            i_start, i_end = chunk_index * chunk_size, min((chunk_index+1) * chunk_size, len(df_proc))
            if i_start >= len(df_proc):
                raise ValueError(f'i_start index exceeds length of Dataframe: i_start={i_start} for len(df_proc)={len(df_proc)}')
            df_proc = df_proc.iloc[i_start:i_end]

            #  DEBUG
            print(f'len(df_proc): {len(df_proc)}, i_start/i_end: ', i_start, i_end)

        # status
        print(f'DF loaded...\n{len(df_raw)} rows\n... {len(df_proc)} after removing NANs & subsetting')

        # subset
        if n > 0:
            df_proc = df_proc.iloc[:round(n)]
            print(f'n={n} ... Only use first n rows.')

        # Work on an independent copy: `df_proc` may be a slice of `df_raw`, and
        # adding columns to a slice triggers pandas' SettingWithCopy behaviour.
        df_proc = df_proc.copy()

        text_columns = ['html'] + self.parser_columns

        # normalized text columns
        for parser_col in text_columns:
            df_proc[f'{parser_col}_norm'] = df_proc[parser_col].map(self.normalize)

        # latex text columns
        for parser_col in text_columns:
            df_proc[f'{parser_col}_latex'] = df_proc[parser_col].map(self.extract_latex)

        # Convert to Dask DataFrame
        self.df = dd.from_pandas(df_proc, npartitions=self.num_cores)

        # Initialize Dask client for parallel processing
        self.client = Client(n_workers=self.num_cores)

    def extract_latex(self, text):
        """
        Extracts the LaTeX expressions contained in the input text.

        Parameters:
            text (str): The input string containing LaTeX expressions.

        Returns:
            list: The extracted LaTeX expressions, in order of appearance.
        """
        # `$$...$$` must be tried before `$...$`; otherwise the lazy single-dollar
        # alternative matches the empty string between the two leading dollars and
        # display math is reported as two bare '$$' fragments.
        latex_pattern = re.compile(
            r'(\$\$.*?\$\$|\$.*?\$|\\\[.*?\\\]|\\\(.*?\\\)|\\begin\{.*?\}.*?\\end\{.*?\})',
            re.DOTALL,
        )

        return latex_pattern.findall(text)

    def remove_latex(self, text:str) -> str:
        """
        Remove LaTeX formatting from a string, keeping the wrapped content
        """
        # Unwrap LaTeX commands (e.g., \textbf{...}, \emph{...}, etc.)
        text = re.sub(r'\\[a-zA-Z]+\{(.*?)\}', r'\1', text)

        # Unwrap inline math (e.g., $...$)
        text = re.sub(r'\$(.*?)\$', r'\1', text)

        # Unwrap display math (e.g., \[...\] or $$...$$)
        text = re.sub(r'\$\$(.*?)\$\$', r'\1', text)
        text = re.sub(r'\\\[(.*?)\\\]', r'\1', text)

        # Remove remaining bare commands (e.g., \alpha)
        text = re.sub(r'\\[a-zA-Z]+', '', text)

        # Remove remaining brace characters (their content is kept)
        text = re.sub(r'\{|\}', '', text)

        return text

    def normalize(self, x:str, remove_latex_flag:bool=True) -> str:
        """
        Normalize the text: optionally strip LaTeX, delete punctuation,
        collapse whitespace, lower-case and trim.
        """
        # const
        REMOVE_PUNCT = str.maketrans("", "", string.punctuation)

        # remove latex
        if remove_latex_flag:
            x = self.remove_latex(x)

        # remove punctuation, then normalize whitespace and case
        x = x.translate(REMOVE_PUNCT)
        x = re.sub(r"\s+", " ", x)
        x = x.lower()
        x = x.strip()

        return x

    def tokenize_columns(self):
        """
        Tokenize html/parser text columns in dataframe

        Adds `<col>_token` / `<col>_norm_token` and, for each of them, the
        `Beg`/`Mid`/`End` excerpts named `<col><part>_token` and
        `<col><part>_norm_token` (the names `compute_metrics` reads).
        """
        # get current df
        df_proc = self.df

        # Tokenize columns in parallel
        for parser_col in ['html'] + self.parser_columns:
            print(f'Tokenizing {parser_col} ... ')
            df_proc[f'{parser_col}_token'] = parallel_tokenize(df_proc[parser_col])
            df_proc[f'{parser_col}_norm_token'] = parallel_tokenize(df_proc[f'{parser_col}_norm'])
            print('... completed!\n')

        # Append beginning/middle/end parts. These stay lazy Dask columns: one `map`
        # per part picks the matching element of the module-level `extract_parts`
        # result. The index is bound through a default argument so each lambda keeps
        # its own value instead of the loop's last one.
        for parser_col in ['html'] + self.parser_columns:
            for suffix in ('_token', '_norm_token'):
                tokens = df_proc[f'{parser_col}{suffix}']
                for part_idx, part in enumerate(('Beg', 'Mid', 'End')):
                    col_name = f'{parser_col}{part}{suffix}'
                    df_proc[col_name] = tokens.map(
                        lambda toks, i=part_idx: extract_parts(toks)[i],
                        meta=(col_name, 'object'),
                    )

        # re-assign
        self.df = df_proc

    def compute_metrics(self, normalized: bool = True) -> None:
        """
        Compute BLEU, ROUGE and CAR columns in parallel using Dask and store the
        materialized pandas result in `self.df_metrics`.

        `html` is the reference, the parser output the candidate. Requires
        `tokenize_columns()` to have been called first.

        Parameters:
            normalized: currently unused; kept for interface compatibility.
        """
        df_proc = self.df

        # BLEU: each part of the ground truth is compared with the SAME part of the parser text
        print('BLEU ...')
        for parser_col in self.parser_columns:
            print(f'   {parser_col}')
            for part in ['', 'Beg', 'Mid', 'End']:
                df_proc[f'{parser_col}{part}_bleu'] = parallel_bleu(df_proc, f'html{part}_token', f'{parser_col}{part}_token')
                df_proc[f'{parser_col}{part}_bleu_norm'] = parallel_bleu(df_proc, f'html{part}_norm_token', f'{parser_col}{part}_norm_token')
        print('...done')

        # ROUGE
        print('ROUGE ...')
        rouge_types = ('rouge1', 'rouge2', 'rougeL')
        scorer = rouge_scorer.RougeScorer(list(rouge_types), use_stemmer=True)
        for parser_col in self.parser_columns:
            print(f'   {parser_col}')
            for part in ['']:
                # raw text first, then normalized text
                for suffix in ('', '_norm'):
                    # lazy series whose elements are [rouge1, rouge2, rougeL]
                    rouge_scores = parallel_rouge(df_proc, f'html{part}{suffix}', f'{parser_col}{part}{suffix}', scorer)
                    for type_idx, rouge_type in enumerate(rouge_types):
                        col_name = f'{parser_col}{part}_{rouge_type}{suffix}'
                        df_proc[col_name] = rouge_scores.map(
                            lambda triple, i=type_idx: triple[i],
                            meta=(col_name, 'f8'),
                        )
        print('...done')

        # CAR (Character Accuracy Rate)
        # NOTE(review): this compares the token-list columns, not the raw strings — confirm intended.
        print('CAR ...')
        for parser_col in self.parser_columns:
            print(f'   {parser_col}')
            for part in ['', 'Beg', 'Mid', 'End']:
                df_proc[f'{parser_col}{part}_car'] = parallel_car(df_proc, f'html{part}_token', f'{parser_col}{part}_token')
                df_proc[f'{parser_col}{part}_car_norm'] = parallel_car(df_proc, f'html{part}_norm_token', f'{parser_col}{part}_norm_token')
        print('...done')

        self.df = df_proc

        # Materialize the final DataFrame into self.df_metrics
        self.df_metrics = df_proc.compute()

    def save_table(self) -> None:
        """Store processed table (requires `compute_metrics()` or `load_table()` to have set `self.df_metrics`)"""
        # NOTE(review): the index is written as well, so `load_table` reads it back as an extra column — confirm intended.
        self.df_metrics.to_csv(self.db_dst_path, sep='|')

    def load_table(self) -> None:
        """Load processed table"""
        assert self.db_dst_path.is_file(), f"File does not exist at path: {self.db_dst_path}"
        self.df_metrics = pd.read_csv(self.db_dst_path, sep='|')

In [None]:
# Build the pandas-preprocessed table for chunk 0 (chunk size 100) of `./tmp/df_500.csv`;
# the constructor also converts it to a Dask frame and starts a client with 100 workers.
t = FastResponseTable(root_dir='./tmp',
                      chunk_index=0,
                      chunk_size=100,
                      num_cores=100,
                      db_src_filename='df_500.csv', 
                      db_dst_filename='out_df_500.csv')

In [None]:
%%time
# Time the tokenization step (the cell magic above must stay on the first line).
t.tokenize_columns()

In [None]:
# Materialize the Dask frame, including the token columns, as a pandas DataFrame.
df_processed = t.df.compute()

In [None]:
# Show the first rows of the materialized frame.
df_processed.head()