# Gathering Data for FLAIR (Financial Literacy AI Resource)

In [1]:
import re
import os
from tika import parser
from nltk.tokenize import sent_tokenize
from collections import defaultdict
import numpy as np
import urllib.request

In [2]:
### Helper functions
def download_file(download_url, filename):
    """Download a file and store it in the local ``data_sources`` folder.

    Args:
        download_url: Address to fetch (anything ``urllib.request.urlopen``
            accepts).
        filename: Name of the file to write inside ``data_sources``.

    Raises:
        urllib.error.URLError: If the address cannot be opened.
        OSError: If the target file cannot be written.
    """
    save_dir = "data_sources"
    # Create the target folder on demand so a fresh checkout does not fail
    # with FileNotFoundError on the first download.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, filename)

    # Open the response first so a failed request never leaves an empty file
    # behind; both handles are closed even if the copy raises.
    with urllib.request.urlopen(download_url) as response, \
            open(save_path, "wb") as f:
        f.write(response.read())
    print(f"Saved PDF to {save_path}")

### Downloading financial literacy information
We're only downloading financial literacy information from government websites to avoid advertising, opinions, bias, and false claims.  
Our sources are:  
- Consumer Financial Protection Bureau ([CFPB](https://www.consumerfinance.gov))


Data will be formatted as a list of dictionaries. Each data point will have the structure: 
```
{
    "_id": int,
    "source_title": str,
    "source_filename": str,
    "source_url" : str,
    "section_title": str,
    "passage_text": str
}
```

In [52]:
class parsePDF:
    """Turn one PDF into a list of passage records.

    Intended call order is ``extract_contents`` -> ``clean_text`` ->
    ``get_text_data``. Each step stores its result on the instance
    (``text``/``title``, then ``text``, then ``chunks``/``datapoints``).
    """

    def __init__(self, url, filename):
        """Remember where the PDF comes from.

        Args:
            url: Address the PDF can be fetched from.
            filename: Name of the local copy inside ``data_sources``.
        """
        self.filename = filename
        self.url = url

    def extract_contents(self, start_regex=None, download=True):
        """Extract a pdf's contents using tika.

        Args:
            start_regex: Optional pattern marking where the useful text
                begins. Everything up to and including its first match is
                cut, then the pattern (with ``\\s+`` turned into a newline)
                is put back in front so the text still starts with the marker.
            download: When the local copy is missing, download it first
                (True) or let tika read straight from ``self.url`` (False).

        Returns:
            The extracted text, also stored in ``self.text``. The document
            title reported by tika is stored in ``self.title``.
        """
        pdf_path = os.path.join("data_sources", self.filename)
        if not os.path.exists(pdf_path):
            if download:
                download_file(self.url, self.filename)
            else:
                # Read directly from the website without downloading
                pdf_path = self.url

        pdf = parser.from_file(pdf_path)
        # NOTE(review): content is presumably None when tika extracts no text
        # (e.g. image-only PDFs) - confirm. Normalised to "" so the regex
        # steps below and in clean_text do not raise TypeError.
        self.text = pdf["content"] or ""
        self.title = pdf["metadata"]["title"]

        if start_regex:
            found = re.search(start_regex, self.text)
            if found:
                marker = start_regex.replace(r"\s+", "\n")
                self.text = marker + self.text[found.end():]
            else:
                # Leave the text untouched rather than prefixing a marker
                # that was never in the document.
                print(f"Failed to find the phrase '{start_regex}'")
        return self.text

    def clean_text(self, specific=None):
        """Clean the raw text of the pdf to be readable English.

        Works on ``self.text`` in three passes: character-level substitutions,
        a line-by-line filter that joins wrapped lines into paragraphs, and a
        punctuation/whitespace tidy-up.

        Args:
            specific: Optional document family. ``"cfpb_ymyg"`` enables extra
                rules for the "Your Money, Your Goals" PDFs (page footers,
                example tables, contact boxes, module headings).

        Returns:
            The cleaned text, also stored in ``self.text``.
        """
        text = self.text

        # Straighten curly quotes before the non-ASCII purge removes them
        text = re.sub(r'“|”', '"', text)  # double quotes
        text = re.sub(r'’|‘', "'", text)  # single quotes

        # Remove lines starting with a symbol-font glyph.
        # NOTE(review): the glyph in the original pattern is not visible in
        # this copy; what remains, r"^.*$", blanks every line of the document.
        # Restricted here to the Unicode private-use area, assuming the lost
        # character was a symbol-font checkbox/bullet - confirm against the
        # raw tika output.
        text = re.sub(r"^[\uE000-\uF8FF].*$", "", text, flags=re.M)

        # Bullet points: text between two identical bullets becomes its own
        # line ending in a semicolon
        pattern = r"\s*([•·∙⋅◦‣§■◊])\s*(.*?)\s*\1"
        text = re.sub(pattern, r"\n\2;", text, flags=re.DOTALL)

        # Remove all other non ASCII characters
        text = re.sub(r"[^\x00-\x7f]", "", text)

        # Format numbered lists as semicolon-separated lists
        text = re.sub(r"^[0-9]+\s*\.(.*)\n", r"\1;", text, flags=re.M)

        # If a line ends with a comma, remove it for clarity
        text = re.sub(r",(\n)+", r"\1", text)

        # Remove numbers from footnotes
        text = re.sub(r"^[0-9]+\s+(.*)", r"\1", text, flags=re.M)

        # Replace tabs with spaces
        text = re.sub(r"\t+", r" ", text)

        # Remove space before punctuation
        text = re.sub(r" ([.;,?!])", r"\1", text)

        # Remove numbers at end of words (usually footnotes).
        # NOTE(review): [A-z] also covers [ \ ] ^ _ ` and [\b\s] means
        # "backspace or whitespace" (it swallows a following newline); both
        # kept as-is because the generated passages depend on them.
        text = re.sub(r"(?<=[A-z.,])[0-9]+[\b\s]", " ", text)

        # Remove any words containing both letters and numbers
        pattern = r"[A-z]+[0-9]\S*|[0-9]+[A-z]+\S*"
        text = re.sub(pattern, "", text)

        text = self._join_lines(text, specific)

        # If 3 or more spaces, make it a new line
        text = re.sub(r"   +", "\n", text, flags=re.M)

        # Replace multiple whitespace with one newline
        text = re.sub(r"\n\s+", "\n", text, flags=re.M)

        # Replace multiple periods and single period lines
        text = re.sub(r"\.\.+", "", text, flags=re.M)
        text = re.sub(r"\n\.", "", text, flags=re.M)

        # Replace all other duplicate punctuation with the first one
        # (",-." is a range that covers exactly the characters , - .)
        text = re.sub(r"([!,-.:;?])([!,-.:;?]+)", r"\1", text)

        # Make sure there's a whitespace after punctuation
        pattern = r"([A-Za-z][!,.:;?])([A-Za-z])"
        text = re.sub(pattern, r"\1 \2", text)

        # Remove whitespace after a slash (e.g., within a url)
        text = re.sub(r"/\s", "/", text)

        # Remove whitespace around hyphens
        text = re.sub(r"\s{0,1}-\s{0,1}", "-", text)

        if specific == "cfpb_ymyg":
            # Sometimes on pages they interject contacts for problems
            pattern = r"Having a problem.*?-[0-9]{4}\."
            text = re.sub(pattern, "", text, flags=re.DOTALL)
            text = self._format_module_titles(text)

        self.text = text
        return self.text

    @staticmethod
    def _join_lines(text, specific):
        """Drop boilerplate lines and join the remaining ones into paragraphs."""
        cleaned = " "  # seeded with a space so cleaned[-1] is always valid
        for line in re.split(r"\n", text):
            line = line.strip()
            # Signals the end of the document, start of metadata
            if re.match(r"^SPREAD WITH", line):
                break

            if specific == "cfpb_ymyg":  # Your Money, Your Goals
                # Ignore page footers
                if re.match(r"^MODULE [0-9]+:.*[0-9]+", line):
                    continue
                elif re.match(r"[0-9]+ [A-Z]+", line):
                    continue
                # Weird example tables start with '.'
                elif line.startswith("."):
                    cleaned += "\n"
                    continue
                # Those tables seem to be surrounded by 'CATEGORY'
                elif line == "CATEGORY":
                    cleaned += "\n"
                    continue

            # A blank line ends the previous sentence and paragraph
            if not line:
                period = "." if cleaned[-1].isalnum() else ""
                cleaned += f"{period} \n"
                continue

            # Remove page numbers
            elif re.match(r"^[0-9\s]+$", line):
                continue

            # Remove likely footers.
            # NOTE(review): this search is unanchored, so it drops every line
            # whose last character is an uppercase letter, not only all-caps
            # lines. Left unchanged; confirm whether a leading ^ was intended.
            elif re.search(r"[0-9]*[A-Z\s]+$", line):
                continue

            # Remove lines that are URLs
            elif re.match(r"^https?:\/\/.*[\r\n]*", line, flags=re.M):
                continue

            # Remove lines with no alphanumeric characters
            elif not re.search(r"[A-Za-z0-9]", line):
                continue

            # Remove any input fields
            elif "_" in line or ("check box" in line.lower() and ":" in line):
                continue

            # Keep Title Case headers as their own line
            elif line.istitle():
                cleaned += f"\n{line}\n"
                continue

            # Ordinary text: continue the current paragraph
            usespace = "" if cleaned[-1] == "\n" else " "
            cleaned += (usespace + line)
        return cleaned.strip()

    @staticmethod
    def _format_module_titles(text):
        """Rewrite every 'MODULE n' section as 'MODULE i: <title>' plus body."""
        # The title is whatever precedes the first "sentence-like" run:
        # a capitalised word followed by three lowercase words.
        title_pattern = r".*?\s?(?=[A-Z][a-z]+\s([a-z,']+\s){3})"
        new_text = ""
        # Text before the first module heading is discarded
        sections = re.split(r"MODULE [0-9]+", text)[1:]
        for i, section in enumerate(sections):
            section = section.strip()
            found = re.search(title_pattern, section, flags=re.M | re.DOTALL)
            if found:
                title = found.group(0).strip().replace("\n", " ")
                # Cut by position so a title spread over two lines is
                # removed as well
                body = section[found.end():]
            else:
                # No sentence-like run found: fall back to the first line
                first_line, _, body = section.partition("\n")
                title = first_line.strip()
            new_text += f"MODULE {i+1}: {title}\n"
            new_text += body.strip()
        return new_text

    def get_text_data(self, chapter_regex=None, starting_id=0,
                      target_words=100, max_words=200):
        """
        Break the text into medium-sized paragraph chunks, about 100
        words each. Then format the data into the correct format

        Args:
            chapter_regex: Optional pattern matching chapter headings. The
                matched heading becomes ``section_title``; text before the
                first heading is discarded. Without it everything is filed
                under the section ``"main"``.
            starting_id: ``_id`` given to the first record; later records
                count up from it.
            target_words: A chunk longer than this is closed, so the next
                paragraph starts a new chunk.
            max_words: A paragraph is merged into the previous chunk only if
                the combined length stays below this.

        Returns:
            List of record dicts (``_id``, ``source_title``,
            ``source_filename``, ``source_url``, ``section_title``,
            ``passage_text``), also stored in ``self.datapoints``.
        """
        assert hasattr(self, "text"), "You haven't parsed the text yet"
        # Split text up by 'chapter'
        if chapter_regex:
            split_text = re.split(chapter_regex, self.text)[1:]
            chapter_names = [found.group(0) for found in
                             re.finditer(chapter_regex, self.text)]
            text_dict = dict(zip(chapter_names, split_text))
        else:
            text_dict = {"main": self.text}

        # For each chapter, break text up into paragraphs
        self.chunks = defaultdict(list)
        for chapter_name, chapter_text in text_dict.items():
            chunk_list = self.chunks[chapter_name]
            for para in chapter_text.split("\n"):
                # Considered a paragraph if more than one sentence
                if len(sent_tokenize(para)) <= 1:
                    continue
                para = para.strip()
                paragraph_words = len(para.split())
                previous_words = len(chunk_list[-1].split()) if chunk_list else 0

                # Merge only into an existing chunk that is still short and
                # would not grow past the limit
                fits_previous = (
                    0 < previous_words <= target_words
                    and previous_words + paragraph_words < max_words
                )
                if fits_previous:
                    chunk_list[-1] += " " + para
                else:
                    chunk_list.append(para)

        # Format the data
        self.datapoints = []
        current_id = starting_id
        for section, text_list in self.chunks.items():
            for text in text_list:
                self.datapoints.append({
                    "_id": current_id,
                    "source_title": self.title,
                    "source_filename": self.filename,
                    "source_url": self.url,
                    "section_title": section,
                    "passage_text": text
                })
                current_id += 1

        return self.datapoints

In [53]:
# The dataset: one dict per passage, collected across every source below
financial_literacy = []

---
#### Source: CFPB "Your Money, Your Goals Toolkit"
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/your-money-your-goals/toolkit/)  
Link to [pdf download](https://www.consumerfinance.gov/documents/8956/cfpb_your-money-your-goals_financial-empowerment_toolkit.pdf)

In [55]:
pdf_name = "cfpb_your-money-your-goals_financial-empowerment_toolkit.pdf"
download_url = f"https://www.consumerfinance.gov/documents/8956/{pdf_name}"

# Drop passages left by an earlier run of this cell, so re-running replaces
# this source's records instead of appending duplicates with repeated ids.
financial_literacy[:] = [
    d for d in financial_literacy if d["source_filename"] != pdf_name
]

PP1 = parsePDF(download_url, pdf_name)
raw_text1 = PP1.extract_contents(start_regex=r"MODULE 1\s+")
clean_text1 = PP1.clean_text(specific="cfpb_ymyg")
text_data1 = PP1.get_text_data(chapter_regex=r"MODULE [0-9].+", starting_id=0)
financial_literacy.extend(text_data1)

text_data1[10]

{'_id': 10,
 'source_title': 'YOUR MONEY, YOUR GOALS: A financial empowerment toolkit',
 'source_filename': 'cfpb_your-money-your-goals_financial-empowerment_toolkit.pdf',
 'source_url': 'https://www.consumerfinance.gov/documents/8956/cfpb_your-money-your-goals_financial-empowerment_toolkit.pdf',
 'section_title': 'MODULE 1: Setting Goals',
 'passage_text': "Specific What will I achieve? Who will benefit from the goal? What specific thing will I accomplish? Measurable How much? How many? How will I know when it's done? can actually reach? Do I have the tools and support I need to accomplish this? Relevant Is this something that I really want? Is now the right time to do this? Why is this goal important? Time bound When will I reach this goal? Is the time frame reasonable? Putting goals into action Before you can accomplish a goal, you need a plan for how to achieve it. This tool helps you turn your SMART goals into an easy-to-follow action plan."}

---
#### Source: CFPB "Your Money, Your Goals" Companion Guide: Military Communities
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/your-money-your-goals/companion-guides/)  
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_ymyg-servicemembers-companion-guide.pdf)

In [159]:
pdf_name = "cfpb_ymyg-servicemembers-companion-guide.pdf"
download_url = f"https://files.consumerfinance.gov/f/documents/{pdf_name}"

# Drop passages left by an earlier run of this cell, so re-running replaces
# this source's records instead of appending duplicates.
financial_literacy[:] = [
    d for d in financial_literacy if d["source_filename"] != pdf_name
]
# Continue numbering after the last stored record; default=-1 starts at 0
# when the dataset is still empty instead of raising ValueError.
starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

PP2 = parsePDF(download_url, pdf_name)
raw_text2 = PP2.extract_contents(start_regex=r"MODULE 1\s+")
clean_text2 = PP2.clean_text(specific="cfpb_ymyg")
text_data2 = PP2.get_text_data(chapter_regex=r"MODULE [0-9].+", starting_id=starting_id)
financial_literacy.extend(text_data2)

text_data2[10]

{'_id': 288,
 'source_title': ' Your money, your goals: Focus on Military Communities',
 'source_filename': 'cfpb_ymyg-servicemembers-companion-guide.pdf',
 'source_url': 'https://files.consumerfinance.gov/f/documents/cfpb_ymyg-servicemembers-companion-guide.pdf',
 'section_title': 'MODULE 1: Setting Goals',
 'passage_text': "Complete next steps you agreed to during the conversation. Create a check list to keep track of next steps. Patrick I was unemployed last time you deployed, and there were things the kids and I needed. It's not like we were spending the money on frivolous things. Marissa All I know is we can't do that again. So, here is a budget I prepared for you and the children while I am gone. No more eating outthat's for sure. You wasted so much money on that last time. Patrick Wait, you prepared a budget for us! That doesn't seem very fair. You don't know what it is like managing all these responsibilities on your own. Sometimes I was too exhausted to cook, so we grabbed mea

---
#### Source: CFPB "Your Money, Your Goals" Companion Guide: Disabilities
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/your-money-your-goals/companion-guides/)  
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_ymyg_focus-on-people-with-disabilities.pdf)

In [160]:
pdf_name = "cfpb_ymyg_focus-on-people-with-disabilities.pdf"
download_url = f"https://files.consumerfinance.gov/f/documents/{pdf_name}"

# Drop passages left by an earlier run of this cell, so re-running replaces
# this source's records instead of appending duplicates.
financial_literacy[:] = [
    d for d in financial_literacy if d["source_filename"] != pdf_name
]
# Continue numbering after the last stored record; default=-1 starts at 0
# when the dataset is still empty instead of raising ValueError.
starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

PP3 = parsePDF(download_url, pdf_name)
raw_text3 = PP3.extract_contents(start_regex=r"MODULE 1\s+")
clean_text3 = PP3.clean_text(specific="cfpb_ymyg")
text_data3 = PP3.get_text_data(chapter_regex=r"MODULE [0-9].+", starting_id=starting_id)
financial_literacy.extend(text_data3)

text_data3[10]

{'_id': 393,
 'source_title': 'Your Money, Your Goals Focus on People with Disabilities',
 'source_filename': 'cfpb_ymyg_focus-on-people-with-disabilities.pdf',
 'source_url': 'https://files.consumerfinance.gov/f/documents/cfpb_ymyg_focus-on-people-with-disabilities.pdf',
 'section_title': 'MODULE 2: Saving',
 'passage_text': 'To begin, the person will need to fill out a PASS application form and submit it to the Social Security Administration (SSA). Help in completing the form can come from a vocational counselor, case manager, or the PASS Cadre, which is a group of PASS specialists. PASS experts can be found at ssa.gov/disabilityresearch/wi/passcadre.htm. Other disability agencies, such as centers for independent living can offer help to fill out the application forms and develop PASS plans. A full list of requirements and a description of the application process is at Plan to Achieve Self-Support (PASS) ssa.gov/disabilityresearch/wi/pass.htm.'}

---
#### Source: CFPB "Your Money, Your Goals" Companion Guide: Criminal Justice-Involved Individuals
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/your-money-your-goals/companion-guides/)  
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_ymyg_reentry_supplement.pdf)

In [161]:
pdf_name = "cfpb_ymyg_reentry_supplement.pdf"
download_url = f"https://files.consumerfinance.gov/f/documents/{pdf_name}"

# Drop passages left by an earlier run of this cell, so re-running replaces
# this source's records instead of appending duplicates.
financial_literacy[:] = [
    d for d in financial_literacy if d["source_filename"] != pdf_name
]
# Continue numbering after the last stored record; default=-1 starts at 0
# when the dataset is still empty instead of raising ValueError.
starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

PP4 = parsePDF(download_url, pdf_name)
raw_text4 = PP4.extract_contents(start_regex=r"MODULE 1\s+")
clean_text4 = PP4.clean_text(specific="cfpb_ymyg")
text_data4 = PP4.get_text_data(chapter_regex=r"MODULE [0-9].+", starting_id=starting_id)
financial_literacy.extend(text_data4)

text_data4[10]

{'_id': 446,
 'source_title': 'Focus on Reentry: Criminal Justice',
 'source_filename': 'cfpb_ymyg_reentry_supplement.pdf',
 'source_url': 'https://files.consumerfinance.gov/f/documents/cfpb_ymyg_reentry_supplement.pdf',
 'section_title': 'MODULE 2: Saving',
 'passage_text': 'If you are working and make under a certain level of income, you may be eligible for the Earned Income Tax Credit or other tax credits. It is important to find out whether you are eligible. See the Handout: Saving at tax time in this module. you get your tax refund. For information about incarceration and filing taxes, see Additional resources and visit https://www.irs.gov/individuals/prisoner-re-entry-program; Use Saving and asset limits to get a clear picture of how much you can save while still maintaining your public benefits. Public benefits may be affected by incarceration, which is discussed in Module 3. If you need assistance in preparing and filing your returns one of your choices is to visit a Volunteer 

---
#### Source: CFPB "Your Money, Your Goals" Companion Guide: Native Communities
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/your-money-your-goals/companion-guides/)  
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_ymyg_focus-on-native-communities.pdf)

In [162]:
pdf_name = "cfpb_ymyg_focus-on-native-communities.pdf"
download_url = f"https://files.consumerfinance.gov/f/documents/{pdf_name}"

# Drop passages left by an earlier run of this cell, so re-running replaces
# this source's records instead of appending duplicates.
financial_literacy[:] = [
    d for d in financial_literacy if d["source_filename"] != pdf_name
]
# Continue numbering after the last stored record; default=-1 starts at 0
# when the dataset is still empty instead of raising ValueError.
starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

PP5 = parsePDF(download_url, pdf_name)
raw_text5 = PP5.extract_contents(start_regex=r"MODULE 1\s+")
clean_text5 = PP5.clean_text(specific="cfpb_ymyg")
text_data5 = PP5.get_text_data(chapter_regex=r"MODULE [0-9].+", starting_id=starting_id)
financial_literacy.extend(text_data5)

text_data5[10]

Saved PDF to data_sources/cfpb_ymyg_focus-on-native-communities.pdf


{'_id': 556,
 'source_title': 'Your Money, Your Goals: Focus on Native Communities',
 'source_filename': 'cfpb_ymyg_focus-on-native-communities.pdf',
 'source_url': 'https://files.consumerfinance.gov/f/documents/cfpb_ymyg_focus-on-native-communities.pdf',
 'section_title': 'MODULE 2: Saving',
 'passage_text': 'each program has different rules about what counts as an asset and the total value of your assets allowed to qualify for assistance. For some programs, these rules are the same regardless of where you live. For others, each state or tribe determines its own rules. In general, your liquid assets (like cash or money in savings or checking accounts) are counted as assets. In some states, if the value of your car exceeds a certain amount, anything over that amount may be counted as an asset as well. Identify which public or tribal benefits the person you\'re working with currently receives or may be applying for in the near future. Use the websites and the "Additional Resources" prov

---
#### Source: CFPB A Newcomer's Guide to Managing Money
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/adult-financial-education/tools-and-resources/#newcomers)   

##### Opening a bank or credit union account
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_adult-fin-ed_checklist-for-opening-an-account.pdf)  

##### Selecting financial products and services
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_adult-fin-ed_selecting-financial-products-and-services.pdf)

##### Ways to pay your bills
Link to [pdf download](https://files.consumerfinance.gov/f/201507_cfpb_ways-to-pay-your-bills.pdf)

##### Ways to receive your money
Link to [pdf download](https://files.consumerfinance.gov/f/201507_cfpb_ways-to-receive-your-money.pdf)

In [15]:
newcomers_documents = [
    "cfpb_adult-fin-ed_checklist-for-opening-an-account.pdf",
    "cfpb_adult-fin-ed_selecting-financial-products-and-services.pdf",
    "201507_cfpb_ways-to-pay-your-bills.pdf",
    "201507_cfpb_ways-to-receive-your-money.pdf"]

# File names that start with a digit sit directly under /f/;
# the others live under /f/documents/.
base_url = "https://files.consumerfinance.gov/f/"

for pdf_name in newcomers_documents:
    if pdf_name[0].isdigit():
        download_url = base_url + pdf_name
    else:
        download_url = base_url + "documents/" + pdf_name

    # Drop passages left by an earlier run of this cell, so re-running
    # replaces this source's records instead of appending duplicates.
    financial_literacy[:] = [
        d for d in financial_literacy if d["source_filename"] != pdf_name
    ]
    # Continue numbering after the last stored record; default=-1 starts at 0
    # when the dataset is still empty instead of raising ValueError.
    starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

    PP6 = parsePDF(download_url, pdf_name)
    raw_text6 = PP6.extract_contents()
    clean_text6 = PP6.clean_text()
    text_data6 = PP6.get_text_data(starting_id=starting_id)
    financial_literacy.extend(text_data6)

financial_literacy[-1]

Saved PDF to data_sources/201507_cfpb_ways-to-pay-your-bills.pdf
Saved PDF to data_sources/201507_cfpb_ways-to-receive-your-money.pdf


{'_id': 308,
 'source_title': 'Ways to receive your money',
 'source_filename': '201507_cfpb_ways-to-receive-your-money.pdf',
 'source_url': 'https://files.consumerfinance.gov/f/201507_cfpb_ways-to-receive-your-money.pdf',
 'section_title': 'main',
 'passage_text': 'Your salary or wages are electronically sent to your prepaid card without the use of a paper check. May be safer and more secure than carrying cash or checks. The card does not have the same consumer protections as a checking account or payroll card for funds taken by electronic error or theft. You might be limited in the types of transactions you can use the card for. For example, you might not be able to use your prepaid card to pay bills. Many cards charge fees for inactivity, purchases, ATM use, monthly fees, etc. P.O. Box 4503'}

---
#### Source: CFPB Money Management
Link to [page](https://www.consumerfinance.gov/consumer-tools/educator-tools/adult-financial-education/tools-and-resources/#money-management)   

##### Know your overdraft options
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_adult-fin-ed_know-your-overdraft-options.pdf)  

##### Unwrapping gift cards
Link to [pdf download](https://files.consumerfinance.gov/f/documents/cfpb_adult-fin-ed_unwrapping-gift-cards-avoid-surprises.pdf)




In [None]:
# Named for what it holds; these are the Money Management PDFs, not the
# newcomer guides processed in the earlier cell.
money_management_documents = [
    "cfpb_adult-fin-ed_know-your-overdraft-options.pdf",
    "cfpb_adult-fin-ed_unwrapping-gift-cards-avoid-surprises.pdf"
]

# File names that start with a digit sit directly under /f/;
# the others live under /f/documents/.
base_url = "https://files.consumerfinance.gov/f/"

for pdf_name in money_management_documents:
    if pdf_name[0].isdigit():
        download_url = base_url + pdf_name
    else:
        download_url = base_url + "documents/" + pdf_name

    # Drop passages left by an earlier run of this cell, so re-running
    # replaces this source's records instead of appending duplicates.
    financial_literacy[:] = [
        d for d in financial_literacy if d["source_filename"] != pdf_name
    ]
    # Continue numbering after the last stored record; default=-1 starts at 0
    # when the dataset is still empty instead of raising ValueError.
    starting_id = max((d["_id"] for d in financial_literacy), default=-1) + 1

    PP6 = parsePDF(download_url, pdf_name)
    raw_text6 = PP6.extract_contents()
    clean_text6 = PP6.clean_text()
    text_data6 = PP6.get_text_data(starting_id=starting_id)
    financial_literacy.extend(text_data6)

financial_literacy[-1]

---
---
## Save the data as a file

In [17]:
import pandas as pd

# Let long passages show in full (up to 500 characters) in the preview below
pd.set_option("display.max_colwidth", 500)

# One row per passage record
flair_df = pd.DataFrame(data=financial_literacy)
save_name = "financial_literacy_data.csv"

# Spot-check four randomly chosen passages
flair_df.loc[:, ["source_title", "section_title", "passage_text"]].sample(n=4)

Unnamed: 0,source_title,section_title,passage_text
188,"YOUR MONEY, YOUR GOALS: A financial empowerment toolkit",MODULE 7: Understanding Credit Reports and Scores,"Length of credit history tracks how long you've had credit accountsthe longer the history, the more positive affect on your scores. A long credit history provides strong evidence of how you use credit and patterns of your payment behavior. Pie chart values are from FICO. See myfico.com. New credit is tracked by measuring credit inquiries about you made by creditors and others. If there are too many inquiries, it may signal that you have a high demand for credit. Because this may be an indica..."
268,"YOUR MONEY, YOUR GOALS: A financial empowerment toolkit",MODULE 9: Protecting your Money,"Use a shredder, scissors, or your hands to tear all papers with identifying information or account numbers into tiny pieces before throwing them out. Also cut up any old or cancelled credit cards or debit cards. Only give out your Social Security number when it's absolutely necessary. Often when someone asks for it, you are not required to give it to them. Protect information like your mother's maiden name, which is often used as a way to verify identity with financial institutions. Be cauti..."
69,"YOUR MONEY, YOUR GOALS: A financial empowerment toolkit",MODULE 3: Tracking Income and Benefits,"I can earn extra money with skills I have (providing childcare, doing yard work, etc.)."
133,"YOUR MONEY, YOUR GOALS: A financial empowerment toolkit",MODULE 6: Dealing with Debt,"If you're short on cash, consider some alternatives to borrowing small amounts of money for a short period of time. Negotiate for more time to pay if the loan is to pay a bill that's due. Think about what you're borrowing the money for. Is it a need, an obligation, or a want? If it's a want, consider whether it's possible to spend less money for it, not purchase it, or wait until you have the money for it. Use a credit card if you have one that is not maxed out while it will increase your mo..."


In [None]:
# Write the dataset pipe-delimited and without the row index
flair_df.to_csv(path_or_buf=save_name, sep="|", index=False)

In [None]:
###############################################################################
###############################################################################

In [None]:
# online_sources = {}

# url = "https://www.moneygeek.com/financial-planning/resources/assistance-for-formerly-incarcerated/"
# formerly_incarcerated = parseWebpage(url)
# fi_text = formerly_incarcerated.extract_contents()
# fi_chunks = formerly_incarcerated.chunk_into_paragraphs()
# online_sources.update(fi_chunks)

# title = formerly_incarcerated.title
# print(f"{title}\n{'='*len(title)}")
# print(fi_chunks[title][0])

In [None]:
# url = "https://www.moneygeek.com/financial-planning/resources/financial-help-women-abusive-relationships/"
# domestic_violence = parseWebpage(url)
# dv_text = domestic_violence.extract_contents()
# dv_chunks = domestic_violence.chunk_into_paragraphs()
# online_sources.update(dv_chunks)

# title = domestic_violence.title
# print(f"{title}\n{'='*len(title)}")
# print(dv_chunks[title][0])

In [None]:
# url = "https://www.moneygeek.com/financial-planning/resources/financial-literacy-handbook/"
# literacy_handbook = parseWebpage(url)
# flh_text = literacy_handbook.extract_contents()
# flh_chunks = literacy_handbook.chunk_into_paragraphs()
# online_sources.update(flh_chunks)

# title = literacy_handbook.title
# print(f"{title}\n{'='*len(title)}")
# print(flh_chunks[title][0])

In [None]:
# for p in online_sources["Financial Education and Reentry Resources After Incarceration"]:
#     print(p, "\n")

In [None]:
# url = "https://www.moneygeek.com/financial-planning/resources/assistance-for-formerly-incarcerated/"
# r = requests.get(url)
# soup = BeautifulSoup(r.content, "lxml")
# print(soup.prettify())

In [None]:
# # meat = "\n".join([p.getText().strip() for p in soup.findAll("p") if not p.attrs])
# # selftext = unicodedata.normalize("NFKD", meat).strip()
# # selftext = re.sub(r"\n+", "\n", selftext)  # Replace duplicate newlines

# for p in soup.findAll("p"):
#     t = p.getText().strip()
#     if "For most inmates" in t:
#         print(p)

In [None]:
# print("\n".join([p.getText().strip() for p in soup.findAll("p", recursive=True)]))

In [None]:
# divs = soup.findAll("div", P)

# # print("\n".join([p.getText().strip() for p in soup.findAll("div") if not p.has_attr("class")]))
# d0 = divs[0].prettify()
# print(d0)

In [None]:
# ex_f = lambda x: x.tag 
# print(soup.find_all("title"))
# print(soup.find_all(ex_f))

In [None]:
# help(soup.find_all)

In [None]:
# import requests
# from bs4 import BeautifulSoup
# import unicodedata
# import urllib.request

In [None]:
# class parseWebpage:
#     def __init__(self, url):
#         self.url = url
    
#     def extract_contents(self):
#         """ Extract the paragraph contents from a web page. """
#         # Scrape web page
#         r = requests.get(self.url)
#         soup = BeautifulSoup(r.content)
#         self.title = soup.find("title").getText().split("|")[0].strip()
        
#         # Get the "meat" of the page
#         meat = "\n".join([p.getText().strip() for p in
#             soup.findAll("p") if not p.attrs])
#         self.text = unicodedata.normalize("NFKD", meat).strip()
#         self.text = re.sub(r"\n+", "\n", self.text)  # Replace duplicate newlines
#         return self.text
    
    
#     def chunk_into_paragraphs(self):
#         """ Break the text into medium-sized paragraph chunks. """
#         assert hasattr(self, "text"), "You haven't parsed the text yet"
            
#         # Break text up into paragraphs
#         chunk_list = []
#         for para in self.text.split("\n"):
#             # Get paragraph length and previous chunk length
#             paragraph_words = len(para.split())
#             if len(chunk_list) > 0:
#                 previous_words = len(chunk_list[-1].split())
#             else:
#                 previous_words = 0
                
#             # Break up a paragraph if it's too long
#             max_words = 125
#             s_per_para = 4  # sentences to define a paragraph
#             if paragraph_words > max_words:
#                 sentences = sent_tokenize(para)
#                 subpara_len = len(sentences) // s_per_para
#                 new_paras = [" ".join(sp).strip() for sp in 
#                     np.array_split(sentences, max(1, subpara_len))]
#                 chunk_list.extend(new_paras)
#                 continue

#             # Decide if it should be appended to previous chunk
#             if not previous_words or previous_words > max_words:
#                 chunk_list.append(para.strip())
#             elif previous_words + paragraph_words < max_words:
#                 chunk_list[-1] += " " + para.strip()
#             else:
#                 chunk_list.append(para.strip())
        
#         self.chunks = {self.title: chunk_list}
#         return self.chunks

In [58]:
# Scratch check of the "space after punctuation" rule: a letter plus a
# punctuation mark directly followed by a letter gets a space inserted,
# while a decimal such as 3.14159 is left alone.
# pattern = r"([!,-.:;?])([!,-.:;?]+)"
pattern = r"([A-Za-z][!,.:;?])([A-Za-z])"
s = "hannah.is bad with.\npunctuation! Please fix.it. pi is 3.14159"

print(re.compile(pattern).sub(r"\1 \2", s))

hannah. is bad with.
punctuation! Please fix. it. pi is 3.14159
