In [1]:
import pandas as pd
import numpy as np

# Load the table of textbook sections (one row per section). Later cells read
# its `book`, `url`, `text`, `bold_term_offsets` and `no_bold_term_offsets`
# columns.
df = pd.read_parquet('../data/open-stax-texts-terms.parquet')

In [2]:
def return_duplicates(dataframe):
    '''Report how many sections share their text with another section.

    Prints the number of rows in ``dataframe`` and the number of rows whose
    ``text`` value occurs more than once, displays the per-book count of
    those rows (sorted by book name), and returns ``dataframe`` itself so
    the function can be dropped into a ``.pipe`` chain.
    '''
    n_sections = dataframe.shape[0]
    print(f'Remaining Sections: {n_sections}')

    # keep=False flags every copy of a repeated text, not only the later ones.
    is_repeated = dataframe.text.duplicated(keep=False)
    per_book = dataframe.loc[is_repeated].book.value_counts().sort_index()

    print(f'Remaining duplicates: {per_book.sum()}')
    display(per_book)
    return dataframe

# Report duplicates in the raw data. The returned frame is bound to `_` so the
# cell does not also render the whole DataFrame as its output.
_ = return_duplicates(df)

Remaining Sections: 4892
Remaining duplicates: 1052


algebra-and-trigonometry-2e                 39
chemistry-2e                                43
chemistry-atoms-first-2e                    43
college-algebra-2e                          40
college-algebra-corequisite-support-2e       3
college-physics-2e                         232
college-physics-ap-courses-2e              232
introduction-business                        3
introductory-business-statistics           122
organizational-behavior                     10
principles-economics-2e                     78
principles-macroeconomics-2e                30
principles-macroeconomics-ap-courses-2e     24
principles-management                       13
principles-microeconomics-2e                76
principles-microeconomics-ap-courses-2e     64
Name: book, dtype: int64

Some section texts are repeated across different textbooks.

After fiddling with the values, it seems that removing the following books results in a dataset with no duplicate sections.

Not every section in these books is repeated, so dropping whole books also discards some unique sections. An alternative solution would be to remove only the duplicated sections.

In [7]:
# NOTE: despite its name, `drop_columns` lists *book* identifiers (values of
# the `book` column), not DataFrame columns. The name is kept unchanged in
# case other cells refer to it.
drop_columns = [
    'college-physics-ap-courses-2e',
    'college-algebra-corequisite-support-2e',
    'principles-macroeconomics-ap-courses-2e',
    'principles-microeconomics-ap-courses-2e',
    'chemistry-atoms-first-2e',
    'introductory-business-statistics',
    'principles-economics-2e',
    'principles-macroeconomics-2e',
    'principles-microeconomics-2e',
    'college-algebra-2e',
    'principles-management',
]

# Drop every section belonging to the listed books and print the duplicate
# report for what is left.
deduplicated = df.loc[~df.book.isin(drop_columns)].pipe(return_duplicates)

# Fail fast if dropping these books no longer removes every repeated section
# text (e.g. after the source data changes), instead of relying on someone
# reading the printed report.
assert not deduplicated.text.duplicated().any(), \
    'duplicate section texts remain after dropping books'

Remaining Sections: 3713
Remaining duplicates: 0


Series([], Name: book, dtype: int64)

In [4]:
def combine_offsets(row):
    '''Merge a row's two term-offset columns into one list of labelled spans.

    Parameters
    ----------
    row
        Object exposing ``bold_term_offsets`` and ``no_bold_term_offsets``
        attributes (e.g. a DataFrame row passed by ``apply(axis=1)``). Each
        attribute is an iterable of ``(st, end)`` pairs, or ``None`` when the
        row has no offsets of that kind.

    Returns
    -------
    list
        ``(st, end, label)`` tuples: every ``bold_term_offsets`` pair labelled
        ``'bold_term'`` first, then every ``no_bold_term_offsets`` pair
        labelled ``'light_term'``, each group in its original order.
    '''
    labelled_offsets = (
        ('bold_term', row.bold_term_offsets),
        ('light_term', row.no_bold_term_offsets),
    )
    return [(st, end, label)
            for label, values in labelled_offsets
            # A missing cell contributes no spans instead of raising
            # TypeError. Compare with None by identity: a truthiness test
            # would be ambiguous if the cell holds a numpy array.
            if values is not None
            for (st, end) in values]

In [5]:
def convert_to_jsonl(dataframe, out_path):
    '''Write ``dataframe`` to ``out_path`` as JSON Lines, one record per row.

    Each record has four fields: ``name`` (the row's ``url``), ``text``,
    ``labels`` (the output of ``combine_offsets`` for the row) and ``group``
    (an integer code for the row's ``book``). Returns whatever
    ``DataFrame.to_json`` returns for ``out_path``.
    '''
    source_columns = ['book', 'url', 'text', 'bold_term_offsets', 'no_bold_term_offsets']
    records = dataframe[source_columns]

    # One list of (st, end, label) tuples per row.
    records = records.assign(labels=records.apply(combine_offsets, axis=1))
    records = records.rename(columns={'url': 'name'})

    # convert book titles to integer codes for splitting dataset by group
    records = records.assign(group=pd.Categorical(records.book).codes)

    records = records.drop(columns=['book', 'bold_term_offsets', 'no_bold_term_offsets'])
    return records.to_json(out_path, orient='records', lines=True)

# Export the deduplicated sections. The path is a plain string literal: the
# original f-prefix had no placeholders to interpolate.
convert_to_jsonl(deduplicated, '../assets/openstax.jsonl')

# Generate special file for Principles of Macroeconomics 2e
This textbook is not included in the prepared dataset, but the text is used for our demo website: https://textbook-demo.web.app/

We will create a special .jsonl file for this textbook, so model performance can be manually reviewed.

In [7]:
# Export every section of this one book. It is selected from the full frame
# `df` because the title is among the books removed from `deduplicated`.
# The path is a plain string literal: the original f-prefix had no placeholders.
convert_to_jsonl(df.loc[df.book == 'principles-macroeconomics-2e'], '../assets/macroeconomics_2e.jsonl')