In [1]:
import datasets
# Load the 'train' split of the raw annotated dataset
# 'Asap7772/open-web-math-processed'; every later cell reads from `ds`.
ds = datasets.load_dataset('Asap7772/open-web-math-processed', split='train')
# Bare expression so the notebook displays the dataset summary (features, num_rows).
ds

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw'],
    num_rows: 10000
})

In [2]:
import re

def extract_variables(markdown_text):
    """
    Extracts variable values from a markdown string.

    The markdown is expected to have sections where a header line (a line whose
    first non-blank characters are '##') is followed, after optional blank
    lines, by a single line containing the variable value.

    Only text at the start of a line counts as a header, so a '##' that appears
    in the middle of a value or rationale line is never mistaken for a new
    section. A header with no text, or with no value line before the next
    header (or the end of the input), produces no entry. If the same header
    occurs more than once, the last occurrence wins.

    Args:
        markdown_text (str): The markdown content as a string. ``None`` or an
            empty string yields an empty dict.

    Returns:
        dict: A dictionary where keys are header texts and values are the
            extracted variable lines, both stripped of surrounding whitespace.
    """
    variables = {}
    if not markdown_text:
        return variables

    # Header that is still waiting for its value line (None when there is none).
    pending_header = None
    # Split on '\n' only; a trailing '\r' from CRLF input is removed by strip().
    for line in markdown_text.split("\n"):
        stripped = line.strip()
        if stripped.startswith("##"):
            # A new header replaces any header that never received a value.
            # An empty header text is ignored instead of borrowing the next line.
            pending_header = stripped[2:].strip() or None
        elif stripped and pending_header is not None:
            # First non-blank line after the header is the variable value;
            # any further lines of the section are ignored.
            variables[pending_header] = stripped
            pending_header = None
    return variables

In [3]:
# Sanity-check the parser on the first row. Indexing the row first
# (ds[0][...]) reads just that record instead of pulling the whole
# 'backtracking_raw' column to take one value from it.
extracted_vars = extract_variables(ds[0]['backtracking_raw'])
print("Extracted variables:")
for header, var in extracted_vars.items():
    print(f"'{header}': '{var}'")

Extracted variables:
'Does backtrack?': 'no'
'Number of backtrack steps': '<count> 0 </count>.'
'Rationale': 'The provided text does not exhibit any backtracking behavior. It is a coherent and linear discussion about Bayes' theorem, its history, and related topics, without any instances of the author explicitly abandoning a line of reasoning to try a different approach. The text includes various tangents and asides, but these are presented in a straightforward and non-recursive manner, without any indication of backtracking.'


In [4]:
import os
def _parse_raw_column(examples, raw_column, required_keys, mapped_keys):
    """
    Parses one raw annotation column of a batch into separate output columns.

    Each entry of ``examples[raw_column]`` is passed to ``extract_variables``.
    Rows for which any of ``required_keys`` is missing from the parsed result
    are dropped from the output. For every kept row, the value parsed for
    ``required_keys[j]`` is stored under ``mapped_keys[j]`` and all input
    columns are copied through unchanged.

    Args:
        examples: Batch mapping of column name -> list of values.
        raw_column (str): Name of the column holding the markdown to parse.
        required_keys (list[str]): Headers that must be present in the parsed
            markdown for the row to be kept.
        mapped_keys (list[str]): Output column names, parallel to
            ``required_keys``.

    Returns:
        dict: Column name -> list of values for the kept rows. Every output
            column is always present, so a batch in which all rows are dropped
            yields empty lists rather than an empty dict.
    """
    # Input columns that share a name with an output column are not copied
    # through; otherwise both the parsed and the old value would be appended to
    # the same list (e.g. when the map is re-run on already-processed data).
    passthrough = [name for name in examples.keys() if name not in mapped_keys]
    ret_dict = {name: [] for name in [*mapped_keys, *passthrough]}

    for i, raw_text in enumerate(examples[raw_column]):
        extracted_vars = extract_variables(raw_text)

        # Skip rows whose annotation is missing any expected section.
        if any(key not in extracted_vars for key in required_keys):
            continue

        for key, mapped_key in zip(required_keys, mapped_keys):
            ret_dict[mapped_key].append(extracted_vars[key])

        for name in passthrough:
            ret_dict[name].append(examples[name][i])

    return ret_dict


def map_fn_backtrack(examples):
    """
    Batched map function that parses 'backtracking_raw' into the columns
    'is_backtrack', 'backtrack_count' and 'backtrack_rationale', dropping rows
    whose annotation lacks any of the three expected sections.
    """
    return _parse_raw_column(
        examples,
        'backtracking_raw',
        required_keys=['Does backtrack?', 'Number of backtrack steps', 'Rationale'],
        mapped_keys=['is_backtrack', 'backtrack_count', 'backtrack_rationale'],
    )


def map_fn_verification(examples):
    """
    Batched map function that parses 'verification_raw' into the columns
    'is_verification', 'verification_count' and 'verification_rationale',
    dropping rows whose annotation lacks any of the three expected sections.
    """
    return _parse_raw_column(
        examples,
        'verification_raw',
        required_keys=['Does answer verification?', 'Number of answer verification steps', 'Rationale'],
        mapped_keys=['is_verification', 'verification_count', 'verification_rationale'],
    )


def map_fn_solution(examples):
    """
    Batched map function that parses 'is_solution_raw' into the columns
    'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific'
    and 'solution_rationale', dropping rows whose annotation lacks any of the
    five expected sections.
    """
    return _parse_raw_column(
        examples,
        'is_solution_raw',
        required_keys=['Contains Problem?', 'Contains Solution?', 'Text domain (broad)', 'Text domain (specific)', 'Rationale'],
        mapped_keys=['contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    )

# Apply the three parsing passes in order: backtracking, verification, solution.
# Each pass replaces the current columns with the ones the map function returns
# (the original columns plus the newly parsed ones) and drops rows it could not parse.
for map_fn in (map_fn_backtrack, map_fn_verification, map_fn_solution):
    ds = ds.map(
        map_fn,
        batched=True,
        remove_columns=ds.column_names,
        num_proc=os.cpu_count(),
    )

Map (num_proc=24): 100%|██████████| 9614/9614 [00:00<00:00, 32105.82 examples/s]


In [5]:
# Display the dataset summary after the three map passes (new columns, remaining rows).
ds

Dataset({
    features: ['url', 'text', 'date', 'metadata', 'backtracking_raw', 'is_solution_raw', 'verification_raw', 'is_backtrack', 'backtrack_count', 'backtrack_rationale', 'is_verification', 'verification_count', 'verification_rationale', 'contain_problem', 'contain_solution', 'domain_broad', 'domain_specific', 'solution_rationale'],
    num_rows: 9593
})

In [6]:
# Keep only the rows whose parsed backtracking flag is "yes" (case-insensitive).
filtered_ds_backtrack = ds.filter(
    lambda row: 'yes' == row['is_backtrack'].lower(),
    num_proc=os.cpu_count(),
)

Filter (num_proc=24): 100%|██████████| 9593/9593 [00:00<00:00, 41052.95 examples/s]


In [7]:
# Keep only the rows whose parsed verification flag is "yes" (case-insensitive).
filtered_ds_verified = ds.filter(
    lambda row: 'yes' == row['is_verification'].lower(),
    num_proc=os.cpu_count(),
)

Filter (num_proc=24): 100%|██████████| 9593/9593 [00:00<00:00, 44220.93 examples/s]


In [8]:
# Show the first row kept by the backtracking filter (all columns).
filtered_ds_backtrack[0]

{'url': 'http://math.stackexchange.com/questions/222974/probability-of-getting-2-aces-2-kings-and-1-queen-in-a-five-card-poker-hand-pa',
 'text': "# Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand (Part II)\n\nSo I reworked my formula in method 1 after getting help with my original question - Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand. But I am still getting results that differ...although they are much much closer than before, but I must still be making a mistake somewhere in method 1. Anyone know what it is?\n\nMethod 1\n\n$P(2A \\cap 2K \\cap 1Q) = P(Q|2A \\cap 2K)P(2A|2K)P(2K)$\n\n$$= \\frac{1}{12}\\frac{{4 \\choose 2}{46 \\choose 1}}{50 \\choose 3}\\frac{{4 \\choose 2}{48 \\choose 3}}{52 \\choose 5}$$\n\n$$= \\frac{(6)(17296)(6)(46)}{(2598960)(19600)(12)}$$\n\n$$= 4.685642 * 10^{-5}$$\n\nMethod 2\n\n$$\\frac{{4 \\choose 2} {4 \\choose 2}{4 \\choose 1}}{52 \\choose 5} = \\frac{3}{54145}$$\n\n$$5.540678 * 10^{-5}$$\n\n-\n

In [9]:
# Print the source text of that row so newlines and LaTeX are readable.
print(filtered_ds_backtrack[0]['text'])

# Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand (Part II)

So I reworked my formula in method 1 after getting help with my original question - Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand. But I am still getting results that differ...although they are much much closer than before, but I must still be making a mistake somewhere in method 1. Anyone know what it is?

Method 1

$P(2A \cap 2K \cap 1Q) = P(Q|2A \cap 2K)P(2A|2K)P(2K)$

$$= \frac{1}{12}\frac{{4 \choose 2}{46 \choose 1}}{50 \choose 3}\frac{{4 \choose 2}{48 \choose 3}}{52 \choose 5}$$

$$= \frac{(6)(17296)(6)(46)}{(2598960)(19600)(12)}$$

$$= 4.685642 * 10^{-5}$$

Method 2

$$\frac{{4 \choose 2} {4 \choose 2}{4 \choose 1}}{52 \choose 5} = \frac{3}{54145}$$

$$5.540678 * 10^{-5}$$

-
Please make an effort to make the question self-contained and provide a link to your earlier question. –  Sasha Oct 28 '12 at 19:56
I think we would rather ahve you edit your initial ques

In [10]:
# Show the first row kept by the verification filter (all columns).
filtered_ds_verified[0]

{'url': 'http://math.stackexchange.com/questions/222974/probability-of-getting-2-aces-2-kings-and-1-queen-in-a-five-card-poker-hand-pa',
 'text': "# Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand (Part II)\n\nSo I reworked my formula in method 1 after getting help with my original question - Probability of getting 2 Aces, 2 Kings and 1 Queen in a five card poker hand. But I am still getting results that differ...although they are much much closer than before, but I must still be making a mistake somewhere in method 1. Anyone know what it is?\n\nMethod 1\n\n$P(2A \\cap 2K \\cap 1Q) = P(Q|2A \\cap 2K)P(2A|2K)P(2K)$\n\n$$= \\frac{1}{12}\\frac{{4 \\choose 2}{46 \\choose 1}}{50 \\choose 3}\\frac{{4 \\choose 2}{48 \\choose 3}}{52 \\choose 5}$$\n\n$$= \\frac{(6)(17296)(6)(46)}{(2598960)(19600)(12)}$$\n\n$$= 4.685642 * 10^{-5}$$\n\nMethod 2\n\n$$\\frac{{4 \\choose 2} {4 \\choose 2}{4 \\choose 1}}{52 \\choose 5} = \\frac{3}{54145}$$\n\n$$5.540678 * 10^{-5}$$\n\n-\n

In [13]:
# Print the source text of the second row kept by the verification filter.
print(filtered_ds_verified[1]['text'])

## MATH1111 Quizzes

Local Linearity and the Differential Quiz
Web resources available Questions

This quiz tests the work covered in lecture on local linearity and the differential and corresponds to Section 14.3 of the textbook Calculus: Single and Multivariable (Hughes-Hallett, Gleason, McCallum et al.).
There is a useful applet at http://www.slu.edu/classes/maymk/banchoff/TangentPlane.html - take some time to read the instructions and add your own functions.

There are more web quizzes at Wiley, select Section 3. This quiz has 10 questions.

Suppose $f\left(3,2\right)=4\phantom{\rule{0.3em}{0ex}},\phantom{\rule{1em}{0ex}}{f}_{x}\left(3,2\right)=-2$ and ${f}_{y}\left(3,2\right)=3$ for some surface $z=f\left(x,y\right)\phantom{\rule{0.3em}{0ex}}.$
Which of the following is the tangent plane to the surface at $\left(3,2,4\right)\phantom{\rule{0.3em}{0ex}}?$ Exactly one option must be correct)
 a) $4z=-2\left(x-3\right)+3\left(y-2\right)$ b) $z=4-2\left(x-3\right)+3\left(y-2\right)$ c)

In [12]:
# Publish the parsed dataset under the repo id 'Asap7772/open-web-math-processed-v2'.
# Note: this pushes `ds` (all successfully parsed rows), not the filtered subsets above.
ds.push_to_hub('Asap7772/open-web-math-processed-v2')

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 33.97ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:05<00:00,  5.43s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/Asap7772/open-web-math-processed-v2/commit/c515c09af7afb963ebcdeceda7ba12e5b97d26f5', commit_message='Upload dataset', commit_description='', oid='c515c09af7afb963ebcdeceda7ba12e5b97d26f5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Asap7772/open-web-math-processed-v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Asap7772/open-web-math-processed-v2'), pr_revision=None, pr_num=None)