In [74]:
import subprocess
import re
import json
from os import walk

In [75]:
class Snippet():
    def __init__(self,full_path,tag_path,file):
        self.full_path = full_path
        self.tag_path = tag_path
        self.file = file
        self.tags = []
        self.tagId = []
        
    def setTags(self):
        tags = self.tag_path.split('/')
        if tags[-1] in ['basics','other']:
            tags = [': '.join(tags)]
        self.tags = tags
        
    def createHTML(self):
        command = f"jupyter nbconvert --to html --template basic {self.full_path}/{self.file}"
        parsed_command = command.split(' ')
        subprocess.run(parsed_command)
        
    def setHTML(self):
        html_file = self.file.replace('ipynb','html')
        with open(self.full_path +'/'+ html_file, 'r') as myfile:
            self.html = myfile.read()
        self.html = self.cleanAttr()
        
    def setNotebook(self):
        with open(self.full_path +'/'+ self.file, 'r') as myfile:
            self.notebook = json.loads(myfile.read()) 
            
    def cleanAttr(self):
        regex = r"---((.|\n)*?)---"
        subst = ""
        return re.sub(regex, subst, self.html, 0)
    
#     def clean_notebook_attr(self):
#         ''' Removes the first cell in a notebook, where the attribution is'''
#         try:
#             nb = self.notebook
#             start = find_nth(nb,"{",2)
#             end_indicator = find_nth(nb,"---",2)
#             next_bracket = nb[end_indicator:].index(',') + 1
#             end = end_indicator+next_bracket
#             pre_attr = nb[0:start]
#             post_attr = nb[end+1:]
#             self.notebook = pre_attr + post_attr
#         except:
#             print('index not found')
        

    def asJSON(self,pk,ipynb=True):
        title = self.file.replace('.ipynb','').replace('_',' ')
#         html = self.html.replace('"','\\"')
        if ipynb:
            code = json.dumps(self.notebook)
        else:
            code = json.dumps(self.html)
        one = f'{{"model": "snippet.snippet","pk": {pk},"fields": {{'
        two = f'"title": "{title}","code": {code},"tags": {self.tagId}'
        three = "}}"
        return(one+two+three)    
    
    def convertTags(self,tag_map):
        tagId = []
        for tag in self.tags:
            self.tagId.append(tag_map[tag])

In [76]:
mypath='/Users/jimmydelano/notes/content/'

snippet_list = []
for (dirpath, dirnames, filenames) in walk(mypath):
    for file in filenames:
        if ('.ipynb' in file) and not ('.ipynb_checkpoints' in dirpath):
            clean_dir = dirpath.replace(mypath,'')
            s = Snippet(dirpath,clean_dir,file)
            snippet_list.append(s)
            
tag_set = set()
for snippet in snippet_list:
    snippet.setTags()
    #snippet.createHTML() # Only need to run once
    for tag in snippet.tags:
        tag_set.add(tag)
    snippet.setNotebook()    
    
        
        
tag_list = list(tag_set)   
tag_map = { tag_list[i]:i+1 for i in range(0, len(tag_list)) }  

In [77]:
for snippet in snippet_list:
    snippet.convertTags(tag_map)

---

In [96]:
def convertHeader(raw_cell, tags):
    markdown_cell = {'cell_type': 'markdown', 
         'metadata': {}, 
         'source': ['---\n', 
            'title: "title"\n', 
            'description: ""\n', 
            'tags: Python\n', 
            'URL: https://github.com/chrisalbon/notes\n', 
            'Licence: \n', 
            'Creator: \n', 
            'Meta: \n', 
            '\n', 
            '---']}
    
    markdown_cell['source'][1] = raw_cell['source'][1]
    #markdown_cell['source'][2] = raw_cell['source'][4]
    markdown_cell['source'][3] = 'tags: '+ ', '.join(tags)+'\n'
    return markdown_cell

In [97]:
for snippet in snippet_list:
    snippet.notebook['cells'][0] = convertHeader(
                                            snippet.notebook['cells'][0], 
                                            snippet.tags)
    with open('/Users/jimmydelano/cocode/inprogress/'+snippet.file, 'w') as outfile:
        json.dump(snippet.notebook, outfile)

# Tests

In [39]:
tag_map

{'patterns': 1,
 'nearest_neighbors': 2,
 'python': 3,
 'support_vector_machines': 4,
 'regex': 5,
 'deep_learning': 6,
 'data_wrangling': 7,
 'model_selection': 8,
 'keras': 9,
 'frequentist': 10,
 'feature_engineering': 11,
 'logging': 12,
 'algorithms': 13,
 'machine_learning': 14,
 'preprocessing_images': 15,
 'python: basics': 16,
 'linear_regression': 17,
 'testing': 18,
 'preprocessing_dates_and_times': 19,
 'logistic_regression': 20,
 'naive_bayes': 21,
 'scala: basics': 22,
 'mathematics: basics': 23,
 'statistics': 24,
 'preprocessing_structured_data': 25,
 'machine_learning: basics': 26,
 'data_visualization': 27,
 'preprocessing_text': 28,
 'feature_selection': 29,
 'model_evaluation': 30,
 'python: other': 31,
 'vectors_matrices_and_arrays': 32,
 'clustering': 33,
 'trees_and_forests': 34,
 'postgresql: basics': 35,
 'computer_science': 36,
 'web_scraping': 37}

In [23]:
import re
regex = r"---(.*?)---"
test_str = "lots of other ---title: \"Extract Substrings Using Regex\"author: \"Chris Albon\"date: 2017-12-20T11:53:49-07:00description: \"Extract substrings using regex using Scala.\"type: technical_notedraft: false---stuff"
subst = ""
# You can manually specify the number of replacements by changing the 4th argument
result = re.sub(regex, subst, test_str, 0)
if result:
    print (result)  

lots of other stuff


In [64]:
html = f[0].html
json.dumps({"html":html})

'{"html": "---title: \\"Spearmans Rank Correlation\\"author: \\"Chris Albon\\"date: 2017-12-20T11:53:49-07:00description: \\"Spearman\'s Rank Correlation in Python.\\"type: technical_notedraft: false---<div class=\\"cell border-box-sizing text_cell rendered\\"><div class=\\"prompt input_prompt\\"></div><div class=\\"inner_cell\\"><div class=\\"text_cell_render border-box-sizing rendered_html\\"><h2 id=\\"Preliminaries\\">Preliminaries<a class=\\"anchor-link\\" href=\\"#Preliminaries\\">&#182;</a></h2></div></div></div><div class=\\"cell border-box-sizing code_cell rendered\\"><div class=\\"input\\"><div class=\\"prompt input_prompt\\">In&nbsp;[1]:</div><div class=\\"inner_cell\\">    <div class=\\"input_area\\"><div class=\\" highlight hl-ipython3\\"><pre><span></span><span class=\\"kn\\">import</span> <span class=\\"nn\\">numpy</span> <span class=\\"k\\">as</span> <span class=\\"nn\\">np</span><span class=\\"kn\\">import</span> <span class=\\"nn\\">pandas</span> <span class=\\"k\\">as

In [59]:
snippet_list[0].notebook

'{\n "cells": [\n  {\n   "cell_type": "raw",\n   "metadata": {},\n   "source": [\n    "---\\n",\n    "title: \\"Spearmans Rank Correlation\\"\\n",\n    "author: \\"Chris Albon\\"\\n",\n    "date: 2017-12-20T11:53:49-07:00\\n",\n    "description: \\"Spearman\'s Rank Correlation in Python.\\"\\n",\n    "type: technical_note\\n",\n    "draft: false\\n",\n    "---"\n   ]\n  },\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Preliminaries"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 1,\n   "metadata": {\n    "collapsed": true\n   },\n   "outputs": [],\n   "source": [\n    "import numpy as np\\n",\n    "import pandas as pd\\n",\n    "import scipy.stats"\n   ]\n  },\n  {\n   "cell_type": "markdown",\n   "metadata": {},\n   "source": [\n    "## Create Data"\n   ]\n  },\n  {\n   "cell_type": "code",\n   "execution_count": 2,\n   "metadata": {\n    "collapsed": true\n   },\n   "outputs": [],\n   "source": [\n    "# Create two lists of random va

In [61]:
json.loads(snippet_list[0].notebook)

{'cells': [{'cell_type': 'raw',
   'metadata': {},
   'source': ['---\n',
    'title: "Spearmans Rank Correlation"\n',
    'author: "Chris Albon"\n',
    'date: 2017-12-20T11:53:49-07:00\n',
    'description: "Spearman\'s Rank Correlation in Python."\n',
    'type: technical_note\n',
    'draft: false\n',
    '---']},
  {'cell_type': 'markdown', 'metadata': {}, 'source': ['## Preliminaries']},
  {'cell_type': 'code',
   'execution_count': 1,
   'metadata': {'collapsed': True},
   'outputs': [],
   'source': ['import numpy as np\n',
    'import pandas as pd\n',
    'import scipy.stats']},
  {'cell_type': 'markdown', 'metadata': {}, 'source': ['## Create Data']},
  {'cell_type': 'code',
   'execution_count': 2,
   'metadata': {'collapsed': True},
   'outputs': [],
   'source': ['# Create two lists of random values\n',
    'x = [1,2,3,4,5,6,7,8,9]\n',
    'y = [2,1,2,4.5,7,6.5,6,9,9.5]']},
  {'cell_type': 'markdown',
   'metadata': {},
   'source': ["## Calculate Spearman's Rank Correlati

In [68]:
# This proves that we can't just load
json.dumps(str(json.loads(snippet_list[0].notebook))) == json.dumps(snippet_list[0].notebook)

False

In [155]:
nb = snippet_list[21].notebook

In [156]:
def find_nth(haystack, needle, n):
    start = haystack.find(needle)
    while start >= 0 and n > 1:
        start = haystack.find(needle, start+len(needle))
        n -= 1
    return start

'{"cells": [{"cell_type": "markdown", "metadata": {}, "source": ["### Create a variable with the status of the conflict.\\n", "\\n", "- 1 if the conflict is active\\n", "- 0 if the conflict is not active\\n", "- unknown if the status of the conflict is unknwon"]}, {"cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": ["conflict_active = 1"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### If the conflict is active print a statement"]}, {"cell_type": "code", "execution_count": 17, "metadata": {"collapsed": true}, "outputs": [{"name": "stdout", "output_type": "stream", "text": ["The conflict is active.\\n"]}], "source": ["if conflict_active == 1:\\n", "    print(\'The conflict is active.\')"]}, {"cell_type": "markdown", "metadata": {}, "source": ["### If the conflict is active print a statement, if not, print a different statement"]}, {"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [{"name": "stdout", "output_type": "stre

In [101]:
json.dumps(json.loads(snippet_list[0].notebook,))

'{"cells": [{"cell_type": "raw", "metadata": {}, "source": ["---\\n", "title: \\"Spearmans Rank Correlation\\"\\n", "author: \\"Chris Albon\\"\\n", "date: 2017-12-20T11:53:49-07:00\\n", "description: \\"Spearman\'s Rank Correlation in Python.\\"\\n", "type: technical_note\\n", "draft: false\\n", "---"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Preliminaries"]}, {"cell_type": "code", "execution_count": 1, "metadata": {"collapsed": true}, "outputs": [], "source": ["import numpy as np\\n", "import pandas as pd\\n", "import scipy.stats"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Create Data"]}, {"cell_type": "code", "execution_count": 2, "metadata": {"collapsed": true}, "outputs": [], "source": ["# Create two lists of random values\\n", "x = [1,2,3,4,5,6,7,8,9]\\n", "y = [2,1,2,4.5,7,6.5,6,9,9.5]"]}, {"cell_type": "markdown", "metadata": {}, "source": ["## Calculate Spearman\'s Rank Correlation\\n", "\\n", "Spearman\'s rank correlation is the Pearson\'s co