In [1]:
import csv
import re


# directory that receives the formatted learning notes
APPLE_NOTES_FORMATTED_NOTES_DIRECTORY = "data/apple_notes"

# directory that holds the raw (unformatted) learning notes
APPLE_NOTES_RAW_LEARNING_NOTES_DIRECTORY = "data/apple_notes/raw"


def remove_html_tags_extraneous_whitespaces(line):
    """
    Return line with every html tag deleted and the leading/trailing
    whitespace trimmed.
    """
    # a tag is "<", one or more non-"<" chars (lazy), then ">"
    without_tags = re.sub('<[^<]+?>', '', line)
    return without_tags.strip()


# numeric date such as 12/02/2019; raw string so "\d" is not an invalid str escape
_DATE_PATTERN = re.compile(r"(\d+/\d+/\d+)")


def _split_annotation_and_date(line):
    """
    Split a note line into (annotation, date).

    date is the first numeric date found in line, or None when there is none.
    The date marker, "(date)" or the bare date, is removed from the annotation
    only when it sits at the very start or very end of the line.
    """
    match = _DATE_PATTERN.search(line)
    if match is None:
        return line, None

    date = match.group(1)
    # Remove the marker as an exact prefix/suffix. str.strip(chars) treats its
    # argument as a SET of characters, so it would also eat digits, slashes
    # and parentheses that belong to the annotation itself.
    for marker in ("(" + date + ")", date):
        if line.endswith(marker):
            return line[:-len(marker)].strip(), date
        if line.startswith(marker):
            return line[len(marker):].strip(), date
    return line, date


def format_learning_notes_to_csv(html_file, csv_file):
    """
    Format learning notes and export to csv file with each row containing data for:

        - Author
        - Title
        - Annotation
        - Date

    html_file is iterated line by line. A line containing "==========" is a
    divider; the line directly after it is the author, the next one the title,
    and every other non-empty line is written as one annotation row. Rows
    without a date are written with three fields only. When author or title is
    unknown (e.g. text before the first divider) the field is written empty.

    Every field is quoted and embedded double quotes are escaped, so commas
    and quotes in the notes cannot break the csv format.

    Both html_file and csv_file are always closed before returning, even if an
    error occurs while formatting.
    """
    try:
        # add header to csv file
        csv_file.write("Author,Title,Annotation,Date\n")
        writer = csv.writer(csv_file, quoting=csv.QUOTE_ALL, lineterminator="\n")

        # no divider seen yet
        div_idx = None
        author = None
        title = None

        for idx, raw_line in enumerate(html_file):
            line = remove_html_tags_extraneous_whitespaces(raw_line)

            # skip empty lines
            if not line:
                continue

            # reset author and title after divider
            if "==========" in line:
                div_idx = idx
                author = None
                title = None
                continue

            # position relative to the last divider (None before the first one)
            offset = None if div_idx is None else idx - div_idx
            if offset == 1:
                author = line
            elif offset == 2:
                title = line
            else:
                annotation, date = _split_annotation_and_date(line)
                row = [author or "", title or "", annotation]
                if date is not None:
                    row.append(date)
                writer.writerow(row)
    finally:
        csv_file.close()
        html_file.close()


# NOTE(review): the raw export is decoded as Mac OS Roman instead of
# ISO-8859-1. The previously generated csv shows "Ñ>" where an em-dash arrow
# is expected, which is what the Mac OS Roman em dash (byte 0xD1) looks like
# when decoded as ISO-8859-1 -- confirm against the tool that produced the html.
# The csv is written as UTF-8 so the result does not depend on the platform
# default encoding. The with-blocks guarantee both files are closed even when
# opening the second file or the conversion itself fails.
with open(APPLE_NOTES_RAW_LEARNING_NOTES_DIRECTORY+'/learning_notes.html', 'r', encoding="mac_roman") as html_file:
    with open(APPLE_NOTES_FORMATTED_NOTES_DIRECTORY+'/learning_notes.csv', 'w', encoding="utf-8") as csv_file:
        format_learning_notes_to_csv(html_file, csv_file)

In [3]:
# !head "data/apple_notes/learning_notes.csv"

# import pandas as pd

# # import csv file to view data
# data = pd.read_csv("data/apple_notes/learning_notes.csv")
# data

Author,Title,Annotation,Date
"The Tim Ferriss Show","#391: The Random Show - On Fasting, Forest Bathing, How to Say NO, Rebooting the Self, and Much More","What if you charged more than everyone else? What would you have to create that would be worth a price at the highest end?","12/02/2019"
"The Tim Ferriss Show","#391: The Random Show - On Fasting, Forest Bathing, How to Say NO, Rebooting the Self, and Much More","Book Recommendation: Awareness by Anthony de Mello Ñ> a lot of oh fuck moments re: self awareness","12/02/2019"
"The Tim Ferriss Show","#391: The Random Show - On Fasting, Forest Bathing, How to Say NO, Rebooting the Self, and Much More","Mushroom plugs, drill holes about an inch deep in branches that you cut from trees and pound in plugs then put thin layer of wax and cover with shade cloth water daily. Lions mane chopped up and sauted with butter. Good for the brain","12/05/2019"
"The Tim Ferriss Show","# 392: test test test","Test note 2","12/03/2019"
"The Tim Ferr

Unnamed: 0,Author,Title,Annotation,Date
0,The Tim Ferriss Show,"#391: The Random Show - On Fasting, Forest Bat...",What if you charged more than everyone else? W...,12/02/2019
1,The Tim Ferriss Show,"#391: The Random Show - On Fasting, Forest Bat...",Book Recommendation: Awareness by Anthony de M...,12/02/2019
2,The Tim Ferriss Show,"#391: The Random Show - On Fasting, Forest Bat...","Mushroom plugs, drill holes about an inch deep...",12/05/2019
3,The Tim Ferriss Show,# 392: test test test,Test note 2,12/03/2019
4,The Tim Ferriss Show,# 392: test test test,Test note 3,12/03/2019
5,The Tim Ferriss Show,# 393: test test test,Test note 4,12/03/2019
6,The Tim Ferriss Show,# 393: test test test,Test note 5,12/03/2019
7,The Tim Ferriss Show,# 393: test test test,Test test test,
