In [1]:
import datasets
# Load the 'train' split of the raw annotated dataset and display its summary
# (column names and row count).
ds = datasets.load_dataset('Asap7772/open_web_math_raw_0_1000000', split='train')
ds

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'subgoal_setting_raw', 'backward_chaining_raw'],
    num_rows: 777000
})

In [2]:
import re

def extract_variables(markdown_text):
    """
    Extract ``header -> value`` pairs from a markdown string.

    Each section is expected to be a header introduced by ``##`` whose text
    sits on a single line, followed (after optional blank or whitespace-only
    lines) by one line holding the value.

    Args:
        markdown_text (str): The markdown content. ``None``, any other
            non-string value and the empty string are accepted and yield ``{}``.

    Returns:
        dict: Maps each header text to the stripped first non-blank line that
            follows it. A header whose next non-blank line is another ``##``
            header is skipped. If a header text occurs more than once, the
            last occurrence wins.
    """
    # Nothing to parse: report "no variables" instead of letting re raise.
    if not isinstance(markdown_text, str) or not markdown_text:
        return {}

    # Pattern, piece by piece:
    # - '##' then optional same-line whitespace ([^\S\n] = whitespace except
    #   '\n'), so the header text can never be taken from a later line.
    # - the header text, lazily, without its trailing same-line whitespace
    #   (this also drops the '\r' of CRLF line endings).
    # - the newline ending the header line, then any blank/whitespace-only lines.
    # - (?!##) refuses to treat a following header as this header's value.
    # - the value line, which must start with a non-whitespace character.
    # NOTE(review): '##' is not anchored to the start of a line, so '###'
    # headers and a mid-line '##' also match — confirm that is intended.
    pattern = r"##[^\S\n]*([^\n]*?)[^\S\n]*\n\s*(?!##)(\S[^\n]*)"

    return {header: value.strip() for header, value in re.findall(pattern, markdown_text)}

In [3]:
# Peek at the first record to see the available fields and their contents.
ds[0]

{'url': 'https://www.shaalaa.com/question-bank-solutions/the-equations-given-of-the-two-regression-lines-are-2x-3y-6-0-and-5x-7y-12-0-find-a-correlation-coefficient-b-x-y-lines-regression-x-y-y-x-or-equation-line-regression_599',
 'text': 'The equations given of the two regression lines are 2x + 3y - 6 = 0 and 5x + 7y - 12 = 0. Find: (a) Correlation coefficient (b) σxσy - Mathematics and Statistics\n\nSum\n\nThe equations given of the two regression lines are 2x + 3y - 6 = 0 and 5x + 7y - 12 = 0.\n\nFind:\n\n(a) Correlation coefficient\n\n(b)\xa0sigma_x/sigma_y\n\nSolution\n\nWe assume that 2x + 3y - 6 = 0 to be the line of regression of y on x.\n\n2x + 3y - 6 = 0\n\n⇒ x = - 3/2y + 3\n\n⇒ "bxy" = - 3/2\n\n5x + 7y - 12 = 0 to be the line of regression of x on y.\n\n5x + 7y - 12 = 0\n\n⇒ y = - 5/7x + 12/7\n\n⇒\xa0 "byx" = - 5/7\n\nNow,\n\nr = sqrt("bxy.byx") = sqrt(15/14)\n\nbyx = (rσ_y)/(σ_x) = - 5/7, "bxy" = (rσ_x)/(σ_y) = - 3/2\n\n⇒ (σ_x^2)/(σ_y^2) = \xa0(3/2)/(5/7)\n\n⇒ (σ_x^2)/(σ_y^

In [4]:
# Sanity-check the parser on the first row's raw backtracking annotation.
# The row is indexed first so only one record is read; selecting the column
# first (ds['backtracking_raw'][0]) can materialise every value of that column
# in `datasets` versions where column access returns a plain list.
extracted_vars = extract_variables(ds[0]['backtracking_raw'])
print("Extracted variables:")
for header, var in extracted_vars.items():
    print(f"'{header}': '{var}'")

Extracted variables:
'Does backtrack?': 'no'
'Number of backtrack steps': '<count>0</count>.'
'Rationale': 'The provided text does not exhibit any backtracking behavior. The writer presents a problem, outlines the equations given, and proceeds step-by-step to solve for the correlation coefficient and the ratio of standard deviations without indicating any realization that a path won't work or explicitly going back to try a different approach. The solution unfolds in a linear and logical manner, suggesting that the writer did not abandon any thoughts or backtrack to previous computations.'


In [5]:
import os

def _parse_raw_column(examples, raw_column, required_keys, mapped_keys):
    """
    Parse one raw annotation column of a batch into flat output columns.

    Args:
        examples: Batch mapping of column name -> list of values, as passed by
            ``Dataset.map(..., batched=True)``.
        raw_column (str): Name of the column holding the markdown annotation.
        required_keys (list[str]): Header texts that must all be present in the
            parsed annotation for the row to count as successfully extracted.
        mapped_keys (list[str]): Output column names, aligned one-to-one with
            ``required_keys``.

    Returns:
        dict: Every input column (copied as a list) plus one list per mapped
            key. A row whose annotation is not a string, or lacks any required
            header, gets ``None`` in all mapped columns. Values are kept as the
            extracted strings, so a count column holds text such as
            ``'<count>0</count>.'`` rather than an integer.
    """
    # Pre-create the output columns so an empty batch still returns them.
    parsed = {mapped_key: [] for mapped_key in mapped_keys}

    for raw in examples[raw_column]:
        # Only strings can be parsed; anything else (e.g. None) counts as a
        # failed extraction. There is deliberately no blanket `except` here: a
        # genuine programming error (such as `extract_variables` not being
        # defined yet) should surface, not quietly fill the columns with None.
        extracted_vars = extract_variables(raw) if isinstance(raw, str) else {}

        # All-or-nothing: a partially parsed annotation yields None everywhere.
        complete = all(key in extracted_vars for key in required_keys)
        for key, mapped_key in zip(required_keys, mapped_keys):
            parsed[mapped_key].append(extracted_vars[key] if complete else None)

    # Carry the original columns through, but let freshly parsed values replace
    # any same-named column from an earlier run. Appending to such a column
    # would give it two entries per row and break the batch on a re-run.
    ret_dict = {k: list(examples[k]) for k in examples.keys() if k not in parsed}
    ret_dict.update(parsed)
    return ret_dict


def map_fn_backtrack(examples):
    """Add 'is_backtrack', 'backtrack_count' and 'backtrack_rationale', parsed from 'backtracking_raw'."""
    return _parse_raw_column(
        examples,
        'backtracking_raw',
        ['Does backtrack?', 'Number of backtrack steps', 'Rationale'],
        ['is_backtrack', 'backtrack_count', 'backtrack_rationale'],
    )


def map_fn_backchain(examples):
    """Add 'is_backchain', 'backchain_count' and 'backchain_rationale', parsed from 'backward_chaining_raw'."""
    return _parse_raw_column(
        examples,
        'backward_chaining_raw',
        ['Does the text exhibit backward chaining?', 'Number of backward chaining instances', 'Rationale'],
        ['is_backchain', 'backchain_count', 'backchain_rationale'],
    )


def map_fn_verification(examples):
    """Add 'is_verification', 'verification_count' and 'verification_rationale', parsed from 'verification_raw'."""
    return _parse_raw_column(
        examples,
        'verification_raw',
        ['Does verification?', 'Number of answer verification steps', 'Rationale'],
        ['is_verification', 'verification_count', 'verification_rationale'],
    )


def map_fn_solution(examples):
    """Add 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific' and 'solution_rationale', parsed from 'is_solution_raw'."""
    return _parse_raw_column(
        examples,
        'is_solution_raw',
        ['Contains Problem?', 'Contains Solution?', 'Text domain (broad)', 'Text domain (specific)', 'Rationale'],
        ['contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    )

# Run each parser over the dataset in turn; every pass adds its own columns.
# remove_columns drops the input columns from the result because each mapping
# function already returns them alongside the parsed ones.
for map_fn in (map_fn_backtrack, map_fn_backchain, map_fn_verification, map_fn_solution):
    ds = ds.map(map_fn, batched=True, remove_columns=ds.column_names, num_proc=os.cpu_count())

Map (num_proc=24): 100%|██████████| 777000/777000 [00:07<00:00, 108713.56 examples/s]
Map (num_proc=24): 100%|██████████| 777000/777000 [00:11<00:00, 65230.26 examples/s] 


In [6]:
# Display the dataset summary after the mapping passes to check the resulting columns.
ds

Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'subgoal_setting_raw', 'backward_chaining_raw', 'is_backtrack', 'backtrack_count', 'backtrack_rationale', 'is_backchain', 'backchain_count', 'backchain_rationale', 'is_verification', 'verification_count', 'verification_rationale', 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    num_rows: 777000
})

In [7]:
# Keep only rows whose parsed backtracking label is "yes" (case-insensitive).
# Rows where parsing failed hold None and are dropped. input_columns hands the
# predicate just this one column rather than the full record, and the predicate
# returns a strict bool.
filtered_ds_backtrack = ds.filter(
    lambda is_backtrack: isinstance(is_backtrack, str) and is_backtrack.lower() == 'yes',
    input_columns=['is_backtrack'],
    num_proc=os.cpu_count(),
)
filtered_ds_backtrack

Filter (num_proc=24): 100%|██████████| 777000/777000 [00:07<00:00, 106694.93 examples/s]


Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'subgoal_setting_raw', 'backward_chaining_raw', 'is_backtrack', 'backtrack_count', 'backtrack_rationale', 'is_backchain', 'backchain_count', 'backchain_rationale', 'is_verification', 'verification_count', 'verification_rationale', 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    num_rows: 46467
})

In [8]:
# Keep only rows whose parsed backward-chaining label is "yes" (case-insensitive).
# Rows where parsing failed hold None and are dropped. input_columns hands the
# predicate just this one column rather than the full record, and the predicate
# returns a strict bool.
filtered_ds_backchain = ds.filter(
    lambda is_backchain: isinstance(is_backchain, str) and is_backchain.lower() == 'yes',
    input_columns=['is_backchain'],
    num_proc=os.cpu_count(),
)
filtered_ds_backchain

Filter (num_proc=24): 100%|██████████| 777000/777000 [00:02<00:00, 304934.20 examples/s]


Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'subgoal_setting_raw', 'backward_chaining_raw', 'is_backtrack', 'backtrack_count', 'backtrack_rationale', 'is_backchain', 'backchain_count', 'backchain_rationale', 'is_verification', 'verification_count', 'verification_rationale', 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    num_rows: 274768
})

In [9]:
# Keep only rows whose parsed verification label is "yes" (case-insensitive).
# Rows where parsing failed hold None and are dropped. input_columns hands the
# predicate just this one column rather than the full record, and the predicate
# returns a strict bool.
filtered_ds_verified = ds.filter(
    lambda is_verification: isinstance(is_verification, str) and is_verification.lower() == 'yes',
    input_columns=['is_verification'],
    num_proc=os.cpu_count(),
)
filtered_ds_verified

Filter (num_proc=24): 100%|██████████| 777000/777000 [00:02<00:00, 323855.58 examples/s]


Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'subgoal_setting_raw', 'backward_chaining_raw', 'is_backtrack', 'backtrack_count', 'backtrack_rationale', 'is_backchain', 'backchain_count', 'backchain_rationale', 'is_verification', 'verification_count', 'verification_rationale', 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    num_rows: 164377
})

In [10]:
# Inspect the first row that was labelled as containing backtracking.
filtered_ds_backtrack[0]

{'url': 'https://dsp.stackexchange.com/questions/47210/duality-of-the-continuous-time-fourier-transform-derivation-and-notation?noredirect=1',
 'text': '# Duality of the continuous-time Fourier transform - derivation and notation\n\nSuppose we have the Fourier transform pair $x(t)$ and $X(\\omega)$ such that $$X(\\omega) = \\int_{-\\infty}^{\\infty} x(t) e^{-j\\omega t} \\mathrm{d}t$$\n\nThe duality property states that $X(t)$ and $2\\pi x(-\\omega)$ constitute a Fourier transform pair. I was trying to prove this statement when I ran into the following problem: the Fourier transform of a signal is sometimes denoted by $X(j\\omega)$, such that $$X(j\\omega) = \\int_{-\\infty}^{\\infty} x(t) e^{-j\\omega t} \\mathrm{d}t$$\n\nUsing this notation, how do we even state the duality property? Note that evaluating $X(t)$ in this case would effectively replace $j\\omega$ (as opposed to just $\\omega$ in the previous case) by $t$ which is clearly wrong. Would we have to write $X(jt)$ for consist

In [11]:
# Print the first backtracking row's text so newlines and LaTeX are shown as
# written rather than as an escaped string repr.
print(filtered_ds_backtrack[0]['text'])

# Duality of the continuous-time Fourier transform - derivation and notation

Suppose we have the Fourier transform pair $x(t)$ and $X(\omega)$ such that $$X(\omega) = \int_{-\infty}^{\infty} x(t) e^{-j\omega t} \mathrm{d}t$$

The duality property states that $X(t)$ and $2\pi x(-\omega)$ constitute a Fourier transform pair. I was trying to prove this statement when I ran into the following problem: the Fourier transform of a signal is sometimes denoted by $X(j\omega)$, such that $$X(j\omega) = \int_{-\infty}^{\infty} x(t) e^{-j\omega t} \mathrm{d}t$$

Using this notation, how do we even state the duality property? Note that evaluating $X(t)$ in this case would effectively replace $j\omega$ (as opposed to just $\omega$ in the previous case) by $t$ which is clearly wrong. Would we have to write $X(jt)$ for consistency?

Returning to the question of deriving the duality property and using the notation I used in the beginning, what is the flaw in the following approach?

Since $$X(\omega) 

In [12]:
# Inspect the first row that was labelled as containing verification.
filtered_ds_verified[0]

{'url': 'https://pypi.org/project/refactor/0.5.0/',
 'text': 'AST-based fragmental source code refactoring toolkit\n\n# Refactor\n\nSimple, hassle-free, dependency-free, AST based fragmental source code refactoring and transformation toolkit.\n\n## Why?\n\nOur framework is primarily built on the principle of "simple but effective transformations". We focus on refactorings that target a small span of source code, and work our way out from it. What this enables for us is being able to operate directly on a single format for both analyses and transformations. This is what we shine at compared to other similar tools.\n\n## How?\n\nLet\'s not get into too much details, but just to give a sneak peek we can try to write a rule that would replace the identifier placeholder with 42.\n\nimport ast\nfrom refactor import Rule, Replace, run\n\n# Each refactor transformer inherits from "refactor.Rule"\nclass FillPlaceholders(Rule):\n\n# And each rule implements a "match()" method, which would\n# rec

In [13]:
# Print the text of the second verification-labelled row (index 1).
print(filtered_ds_verified[1]['text'])

# multifi_cokriging.py¶

Integrates the Multi-Fidelity Co-Kriging method described in [LeGratiet2013].

(Author: Remi Vauclin vauclin.remi@gmail.com)

This code was implemented using the package scikit-learn as basis. (Author: Vincent Dubourg, vincent.dubourg@gmail.com)

OpenMDAO adaptation. Regression and correlation functions were directly copied from scikit-learn package here to avoid scikit-learn dependency. (Author: Remi Lafage, remi.lafage@onera.fr)

ISAE/DMSM - ONERA/DCPS

class openmdao.surrogate_models.multifi_cokriging.FloatMultiFiCoKrigingSurrogate(regr='constant', rho_regr='constant', theta=None, theta0=None, thetaL=None, thetaU=None, tolerance=1e-06, initial_range=0.3)[source]

Predictions are returned as the mean of the NormalDistribution predicted by base class model.

__init__(regr='constant', rho_regr='constant', theta=None, theta0=None, thetaL=None, thetaU=None, tolerance=1e-06, initial_range=0.3)

Initialize all attributes.

Parameters: regr : string or callable, opt

In [14]:
# Upload the full dataset (all rows, with the parsed columns added) to the Hub.
# The filtered subsets built above are not pushed.
ds.push_to_hub('Asap7772/open-web-math-processed-v2')

Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 18.91ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 21.54ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 20.19ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 21.05ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 20.54ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 21.29ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 21.16ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 19.15ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 21.11ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 19.78ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 20.28ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:01<00:00, 20.

CommitInfo(commit_url='https://huggingface.co/datasets/Asap7772/open-web-math-processed-v2/commit/a76d9632bbda593918110a46b634b18bedd2af92', commit_message='Upload dataset', commit_description='', oid='a76d9632bbda593918110a46b634b18bedd2af92', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Asap7772/open-web-math-processed-v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Asap7772/open-web-math-processed-v2'), pr_revision=None, pr_num=None)