In [44]:
import pandas
import re

In [45]:

def load_section_overview_from_csv(input_filename_csv):
    df = pandas.read_csv(input_filename_csv, header=0, delimiter=',',
                     names=['section_id','file_id','url','heading_markdown','section_code'])
    
    readme_file_generator = lambda x: x.replace('https://github.com/','').replace('/','.') + '.md'
    df['local_readme_file'] = df['url'].apply(readme_file_generator) 
    # df['heading_text'] = df['heading_markdown'].apply(extract_text_in_heading_markdown)
    # In markdown, # = heading level 1, ## = heading level 2, etc.
    df['heading_level'] = df['heading_markdown'].apply(lambda x : len(re.search('^#+', x).group(0)))
    # df['abstracted_heading_markdown'] = df['heading_markdown'].apply(lambda x : abstract_text(x).replace('\n', ' ').strip())
    # df['abstracted_heading_text'] = df['abstracted_heading_markdown'].apply(extract_text_in_heading_markdown)
    # Don't convert to int, as data contains '-' for 'not in any class'
    # df['section_code'] = df['section_code'].apply(lambda x : merge_classes_1_and_2(x))
    
    return df

In [68]:
def extract_section_from_files(file_path, df):
    curr_filename = None
    curr_filename_lines = None
    curr_filename_line_number = 0
    sections = []
    # Section definition: lines between current heading and next, regardless of level
    # Can't simply use equivalent of left JOIN between list of headings and actual headings in file,
    # as a file may have a several heading with same text and level (e.g. multiple "Example" subheadings, one for each method in a reference section)
    for i, r in df.iterrows():
        heading_already_found = False
        local_readme_filename = r[5]
        heading_markdown = r[3]
        if (curr_filename is None) or (curr_filename != local_readme_filename):
            curr_filename = local_readme_filename
            with open (file_path + curr_filename, "r", encoding='utf-8') as myfile:
                # Read as is, use rstrip instead of strip to only remove trailing whitespace
                # We want to preserve leading whitespace to avoid treating line starting with space/tab followed by #
                # from being treated as heading
                curr_filename_lines = myfile.readlines()                    
                curr_filename_lines = [x.rstrip() for x in curr_filename_lines]
                curr_filename_line_number = 0 #Indent this line to be in the if block
        curr_section_content_lines = []
        # Iterate through file until heading markdown is found or end of file is found
        # Check also for underline-style formatting
        while (curr_filename_line_number < len(curr_filename_lines)):
            '''
            If a candidate heading is found, compare it with the heading we're looking for.
            If it's actually the one we want, set a flag, so if the next heading happens to have same string,
            we can tell that it's a different heading.
            '''
            if curr_filename_lines[curr_filename_line_number].startswith('#'):
                # Potential heading, starting with #. Is it the heading we want?
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if ((candidate_heading != heading_markdown.strip()) or heading_already_found):
                    break
                else:
                    heading_already_found = True
            elif ((curr_filename_line_number < len(curr_filename_lines) - 1) and
                 curr_filename_lines[curr_filename_line_number + 1].startswith('===')):
                # Potential H1, in underline markdown style
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if (('# ' + candidate_heading) != heading_markdown.strip() or heading_already_found):
                    # Skip next line (which is the underline)
                    curr_filename_line_number += 1                        
                    break
                else:
                    heading_already_found = True
            elif ((curr_filename_line_number < len(curr_filename_lines) - 1) and 
                 curr_filename_lines[curr_filename_line_number+1].startswith('---')):
                # Potential H2, in underline markdown style
                candidate_heading = curr_filename_lines[curr_filename_line_number].replace('\n',' ').strip() 
                if (('## ' + candidate_heading) != heading_markdown.strip() or heading_already_found):
                    # Skip next line (which is the underline)
                    curr_filename_line_number += 1                        
                    break
                else:
                    heading_already_found = True
            else:
                curr_section_content_lines.append(curr_filename_lines[curr_filename_line_number] + '%%NEWLINE')
            # Proceed to next line
            curr_filename_line_number += 1
        curr_section_content = ' '.join(curr_section_content_lines)
        sections.append(curr_section_content)
    return sections

In [69]:
    original_csv_filename = '../input/dataset_combined.csv'
    original_readme_path = '../input/dev_and_eval_readmes/'

    df = load_section_overview_from_csv(csv_filename)

In [70]:
l = extract_section_from_files(readme_path, df)

In [71]:
df['section_contents'] = l

In [72]:
df.head(15)

Unnamed: 0,section_id,file_id,url,heading_markdown,section_code,local_readme_file,heading_level,section_contents
0,1,1,https://github.com/xiaobai557/wechat,# [Easy WeChat](http://easywechat.org),-,xiaobai557.wechat.md,1,%%NEWLINE 可能是目前最优雅的微信公众平台 SDK 了。%%NEWLINE %%NE...
1,2,1,https://github.com/xiaobai557/wechat,## 特点,-,xiaobai557.wechat.md,2,%%NEWLINE - 命名不那么乱七八糟；%%NEWLINE - 隐藏开发者不需要关注...
2,3,1,https://github.com/xiaobai557/wechat,## 环境要求,-,xiaobai557.wechat.md,2,%%NEWLINE 1. PHP >= 5.5.9%%NEWLINE 2. **[compo...
3,4,1,https://github.com/xiaobai557/wechat,## 安装,-,xiaobai557.wechat.md,2,"%%NEWLINE ```shell%%NEWLINE composer require ""..."
4,5,1,https://github.com/xiaobai557/wechat,## 使用,-,xiaobai557.wechat.md,2,%%NEWLINE 基本使用（以服务端为例）:%%NEWLINE %%NEWLINE ```...
5,6,1,https://github.com/xiaobai557/wechat,## 文档,-,xiaobai557.wechat.md,2,%%NEWLINE [http://easywechat.org/](http://easy...
6,7,1,https://github.com/xiaobai557/wechat,## 框架集成,-,xiaobai557.wechat.md,2,%%NEWLINE [Laravel 5 拓展包: overtrue/laravel-wec...
7,8,1,https://github.com/xiaobai557/wechat,## 贡献代码,-,xiaobai557.wechat.md,2,%%NEWLINE [贡献指南](CONTRIBUTING.md)%%NEWLINE %%N...
8,9,1,https://github.com/xiaobai557/wechat,## License,-,xiaobai557.wechat.md,2,%%NEWLINE MIT%%NEWLINE
9,10,2,https://github.com/dungenessbin/think,# ThinkPHP 5.0,-,dungenessbin.think.md,1,===============%%NEWLINE %%NEWLINE [![Total Do...


In [73]:
df.to_csv(r'../output/original_sections_csv.csv', index = False, header=True)

In [74]:
new_csv_filename = '../input/new_dataset_combined.csv'   
new_readme_path = '../input/new_readmes_combined/'

df_new = load_section_overview_from_csv(new_csv_filename)

In [75]:
df_new

Unnamed: 0,section_id,file_id,url,heading_markdown,section_code,local_readme_file,heading_level
0,1,1,https://github.com/acunote/acunote-shortcuts,# Acunote's JavaScript Keyboard Shortcuts Fram...,0,acunote.acunote-shortcuts.md,1
1,2,1,https://github.com/acunote/acunote-shortcuts,## Overview,0,acunote.acunote-shortcuts.md,2
2,3,1,https://github.com/acunote/acunote-shortcuts,## Try It,1,acunote.acunote-shortcuts.md,2
3,4,1,https://github.com/acunote/acunote-shortcuts,## Features,0,acunote.acunote-shortcuts.md,2
4,5,1,https://github.com/acunote/acunote-shortcuts,## Four Simple Steps To Use Shortcuts from Pur...,1,acunote.acunote-shortcuts.md,2
...,...,...,...,...,...,...,...
501,502,48,https://github.com/zachhardesty7/tamper-monkey...,## Usage,1,zachhardesty7.tamper-monkey-scripts-collection.md,2
502,503,48,https://github.com/zachhardesty7/tamper-monkey...,## Known Issues,1,zachhardesty7.tamper-monkey-scripts-collection.md,2
503,504,48,https://github.com/zachhardesty7/tamper-monkey...,## Contributing,0,zachhardesty7.tamper-monkey-scripts-collection.md,2
504,505,48,https://github.com/zachhardesty7/tamper-monkey...,## License,0,zachhardesty7.tamper-monkey-scripts-collection.md,2


In [77]:
l_new = extract_section_from_files(new_readme_path, df_new)

In [78]:
df_new['section_contents'] = l_new

In [79]:
df_new.head()

Unnamed: 0,section_id,file_id,url,heading_markdown,section_code,local_readme_file,heading_level,section_contents
0,1,1,https://github.com/acunote/acunote-shortcuts,# Acunote's JavaScript Keyboard Shortcuts Fram...,0,acunote.acunote-shortcuts.md,1,%%NEWLINE %%NEWLINE
1,2,1,https://github.com/acunote/acunote-shortcuts,## Overview,0,acunote.acunote-shortcuts.md,2,%%NEWLINE Acunote Shortcuts is a JavaScript li...
2,3,1,https://github.com/acunote/acunote-shortcuts,## Try It,1,acunote.acunote-shortcuts.md,2,Firefox extension and Greasemonkey script to a...
3,4,1,https://github.com/acunote/acunote-shortcuts,## Features,0,acunote.acunote-shortcuts.md,2,%%NEWLINE * Simple to use: just declare a ke...
4,5,1,https://github.com/acunote/acunote-shortcuts,## Four Simple Steps To Use Shortcuts from Pur...,1,acunote.acunote-shortcuts.md,2,"%%NEWLINE 1. <a href=""https://github.com/acuno..."


In [81]:
df_new.to_csv(r'../output/new_sections_csv.csv', index = False, header=True)