In [1]:
import pandas as pd
import string

# Load the per-subsection table; later cells read its 'clean_text' column.
df = pd.read_csv(
    filepath_or_buffer='/home/jovyan/active-projects/macro-economics-textbook/data/subsections.csv',
)

## Generate Keyphrases

In [2]:
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Text2TextGenerationPipeline,
)

class KeyphraseGenerationPipeline(Text2TextGenerationPipeline):
    """Text2text pipeline that returns generated keyphrases as a list of strings.

    The generated text is treated as keyphrases delimited by
    ``keyphrase_sep_token``; ``postprocess`` splits it on that token and removes
    every ``string.punctuation`` character from each fragment.
    """

    # Translation table deleting all ASCII punctuation. Built once at class
    # creation instead of twice per keyphrase.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, model, keyphrase_sep_token=';', *args, **kwargs):
        """Load the model and tokenizer named by ``model`` and build the pipeline.

        Args:
            model: Name or path handed to both ``from_pretrained`` calls.
            keyphrase_sep_token: Delimiter separating keyphrases in the
                generated text.
            *args: Forwarded unchanged to ``Text2TextGenerationPipeline``.
            **kwargs: Forwarded unchanged to ``Text2TextGenerationPipeline``.
        """
        super().__init__(
            model=AutoModelForSeq2SeqLM.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model,
                                                    truncation=True,
                                                    max_length=512,
                                                    model_max_length=512,
                                                    ),
            *args,
            **kwargs
        )
        self.keyphrase_sep_token = keyphrase_sep_token

    def postprocess(self, model_outputs, **postprocess_params):
        """Turn the model output into a list of cleaned keyphrases.

        Args:
            model_outputs: Raw outputs passed on to the parent ``postprocess``.
            **postprocess_params: Extra postprocess parameters, forwarded
                unchanged to the parent ``postprocess``.

        Returns:
            The keyphrases of the first generated result, with punctuation
            removed and surrounding whitespace stripped. Fragments that are
            empty after cleaning are dropped. Returns ``[]`` when the parent
            produces no results.
        """
        results = super().postprocess(model_outputs=model_outputs, **postprocess_params)
        if not results:
            return []

        # Only the first result was ever returned, so only that one is cleaned.
        generated_text = results[0].get('generated_text')
        keyphrases = []
        for fragment in generated_text.split(self.keyphrase_sep_token):
            # Remove punctuation first, then strip: stripping first would leave
            # the whitespace that sat next to a removed punctuation mark.
            cleaned = fragment.translate(self._PUNCTUATION_TABLE).strip()
            # Test the cleaned value so whitespace-only fragments (e.g. after a
            # trailing separator) do not become '' keyphrases.
            if cleaned:
                keyphrases.append(cleaned)
        return keyphrases

# Instantiate the keyphrase pipeline; `truncation=True` is not a named
# parameter of the class, so it travels through **kwargs to the base pipeline.
pipe = KeyphraseGenerationPipeline(
    truncation=True,
    model='ml6team/keyphrase-generation-keybart-inspec',
)

In [None]:
# Generate keyphrases for each row's 'clean_text'; the pipeline is called once per row.
df['keyphrases'] = df['clean_text'].apply(
    lambda subsection_text: pipe(subsection_text)
)

In [None]:
# Write the frame (now including 'keyphrases') back over the CSV it was loaded
# from, without the index column.
df.to_csv(
    path_or_buf='/home/jovyan/active-projects/macro-economics-textbook/data/subsections.csv',
    index=False,
)

## Create JSON

In [2]:
# Reload the saved table; after the CSV round-trip the 'keyphrases' column
# holds the string repr of each list rather than a list object.
df = pd.read_csv(
    filepath_or_buffer='/home/jovyan/active-projects/macro-economics-textbook/data/subsections.csv',
)
df

Unnamed: 0,module,chapter,section,subsection,heading,raw_text,clean_text,slug,keyphrases
0,1,1,1,2,Overview,"Overview\n\nimport Alert from ""react-bootstrap...","Overview\n \n By the end of this section, ...",overview,"['scarcity', 'resources', 'individual decision..."
1,1,1,1,3,Introduction to FRED,Introduction to FRED\n\nData is very important...,Introduction to FRED\nData is very important i...,introduction-to-fred,"['fried', 'data', 'government agencies', 'soci..."
2,1,1,1,4,The Problem of Scarcity,The Problem of Scarcity\n\nThink about all the...,The Problem of Scarcity\nThink about all the t...,the-problem-of-scarcity,"['scarcity', 'food', 'shelter', 'transportatio..."
3,1,1,1,5,Learn with Videos,"Learn with Videos\n\n<iframe\n width=""560""\n ...",Learn with Videos\n \n \n \n How 10 ...,learn-with-videos,"['water scarcity', 'drought', 'urban areas']"
4,1,1,1,6,The Division of and Specialization of Labor,The Division of and Specialization of Labor\n\...,The Division of and Specialization of Labor\nF...,the-division-of-and-specialization-of-labor,"['division of and specialization of labour', '..."
...,...,...,...,...,...,...,...,...,...
618,5,20,4,8,Long-Term Trends in Barriers to Trade,Long-Term Trends in Barriers to Trade\n\n\nIn ...,Long-Term Trends in Barriers to Trade\nIn news...,long-term-trends-in-barriers-to-trade,"['longterm trends', 'barriers to trade', 'news..."
619,5,20,4,9,Learn with Videos,"Learn with Videos\n\n<iframe width=""560"" heigh...",Learn with Videos\n,learn-with-videos-2,"['learning', 'feedback', 'market overview', 's..."
620,5,20,4,10,Please your write summary below,Please your write summary below,Please your write summary below,please-your-write-summary-below,"['call centre', 'collaborative browsing', 'cus..."
621,5,20,5,2,Overview,"Overview\n\n<Alert variant=""primary"">\n <Aler...","Overview\n \n By the end of this section, ...",overview,"['international trade', 'marketoriented econom..."


In [28]:
# Preview: the clean text of every subsection in chapter 5, section 2, joined by single spaces.
df.loc[
    (df['chapter'] == 5) & (df['section'] == 2),
    'clean_text',
].str.cat(sep=' ')

"Overview\n  \n    By the end of this section, you will be able to:\n  \n  - Define and contrast nominal GDP and real GDP - Explain GDP deflator - Calculate\n  real GDP based on nominal GDP values\nWhen examining economic statistics, there is a crucial distinction worth emphasizing. The distinction is between nominal and real measurements, which refer to whether or not inflation has distorted a given statistic. Looking at economic statistics without considering inflation is like looking through a pair of binoculars and trying to guess how close something is: unless you know how strong the lenses are, you cannot guess the distance very accurately. Similarly, if you do not know the inflation rate, it is difficult to figure out if a rise in GDP is due mainly to a rise in the overall level of prices or to a rise in quantities of goods produced. The nominal value of any economic statistic means that we measure the statistic in terms of actual prices that exist at the time. The real value re

In [4]:
from ast import literal_eval
import json

def _build_section_dict(frame):
    """Aggregate subsection rows into one record per (chapter, section).

    Args:
        frame: DataFrame with 'chapter', 'section', 'subsection',
            'keyphrases' and 'clean_text' columns.

    Returns:
        dict mapping 'CC-SS' keys (zero-padded chapter and section numbers)
        to ``{'text': str, 'keyphrases': list}``. Rows are visited in
        ascending subsection order. Texts are concatenated and keyphrase
        lists are extended in that order.
    """
    sections = {}
    for (chapter, section), section_frame in frame.groupby(['chapter', 'section']):
        # int() keeps the ':02d' format working when the key columns are float-typed.
        section_key = f'{int(chapter):02d}-{int(section):02d}'
        keyphrases = []
        text_parts = []
        for _, subframe in section_frame.groupby('subsection'):
            # Walk the rows instead of calling .item(), which raises as soon as
            # a subsection group holds more than one row.
            for raw_keyphrases, text in zip(subframe['keyphrases'], subframe['clean_text']):
                if isinstance(raw_keyphrases, str):
                    # After a CSV round-trip the list is stored as its repr.
                    keyphrases.extend(literal_eval(raw_keyphrases))
                elif isinstance(raw_keyphrases, (list, tuple)):
                    # Frame used straight from the pipeline cell, without reloading.
                    keyphrases.extend(raw_keyphrases)
                # Any other value (e.g. NaN for an empty cell) carries no keyphrases.

                if isinstance(text, str):
                    text_parts.append(text)
                # Non-string text (NaN for an empty cell) is skipped rather
                # than raising on str + float.

        sections[section_key] = {
            # NOTE(review): texts are joined with no separator, as before; the
            # preview cell joins with sep=' ' -- confirm which is intended.
            'text': ''.join(text_parts),
            'keyphrases': keyphrases,
        }
    return sections


section_dict = _build_section_dict(df)

In [7]:
# Serialise the per-section dictionary to disk as a single JSON document.
with open(
    '/home/jovyan/active-projects/macro-economics-textbook/data/macroeconomics-2e-sections.json',
    mode='w',
) as sections_file:
    json.dump(section_dict, sections_file)