In [1]:
import fastai.text

In [2]:
from fastai.text import transform

In [3]:
from nltk import tokenize

In [4]:
import pandas as pd
import datetime
from bs4 import BeautifulSoup
from markdown import markdown
from operator import itemgetter

import re

import html


def markdown_to_text(markdown_string):
    """Convert a markdown string to plain text with code snippets removed.

    The markdown is rendered to HTML, code elements are blanked out, and the
    remaining markup is stripped with BeautifulSoup.

    Args:
        markdown_string: Markdown source text.

    Returns:
        The text content of the rendered document, without the contents of
        ``<pre>``/``<code>`` elements or triple-backtick spans.
    """
    # md -> html -> text since BeautifulSoup can extract text cleanly.
    # Named `rendered` so the imported `html` module is not shadowed.
    rendered = markdown(markdown_string)

    # Remove code snippets. DOTALL lets `.` cross newlines so multi-line
    # blocks are matched too; the closing tags tolerate optional whitespace
    # (the previous `</code >` pattern required a space and so never matched
    # a normal `</code>` tag).
    rendered = re.sub(r'<pre(?:\s[^>]*)?>.*?</pre\s*>', ' ', rendered, flags=re.DOTALL)
    rendered = re.sub(r'<code(?:\s[^>]*)?>.*?</code\s*>', ' ', rendered, flags=re.DOTALL)
    # Any triple-backtick spans still present in the rendered HTML.
    rendered = re.sub(r'```.*?```', ' ', rendered, flags=re.DOTALL)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return BeautifulSoup(rendered, 'html.parser').get_text()

def tokenize_markdown(markdown_string, n_tokens=250):
    """Tokenize a markdown document into at most ``n_tokens`` tokens.

    Args:
        markdown_string: Markdown source text.
        n_tokens: Maximum number of tokens to keep from the start of the
            document.

    Returns:
        A list with the first ``n_tokens`` tokens produced by
        ``wordpunct_tokenize`` on the plain-text version of the document.
    """
    text = markdown_to_text(markdown_string)
    # Use the `tokenize` module imported near the top of the notebook rather
    # than `nltk.tokenize`, since `nltk` itself is only imported in a later
    # cell than this definition.
    return tokenize.wordpunct_tokenize(text)[:n_tokens]


def tok_fn(lang):
    """Return a fastai ``BaseTokenizer`` whose ``tokenizer`` is ``tokenize_markdown``.

    ``lang`` is ignored: the base tokenizer is always constructed with 'none'.
    """
    markdown_tok = fastai.text.transform.BaseTokenizer('none')
    # Swap the instance's tokenizer attribute for the markdown-aware function.
    markdown_tok.tokenizer = tokenize_markdown
    return markdown_tok


In [5]:
import fastai
import nltk

In [6]:
# Read the line-delimited JSON file (lines=True) into a frame and preview it.
raw_df = pd.read_json(path_or_buf='github_repos.json', lines=True)
raw_df.head(5)

Unnamed: 0,repo_name,path,content,language
0,mortardata/mortar-etl-redshift,README.md,# Mortar ETL Pipeline for Redshift\n\nA custom...,"[{'name': 'PigLatin', 'bytes': '4890'}, {'name..."
1,tsubery/sidekiq-unique-jobs,README.md,# SidekiqUniqueJobs [![Build Status](https://t...,"[{'name': 'Ruby', 'bytes': '41751'}]"
2,ovaskevich/svids,README.md,# svids\nIntrusion Detection System - An appli...,"[{'name': 'C', 'bytes': '3755'}, {'name': 'Mak..."
3,kfirg77/kDebuger,README.md.orig,kDebuger\n========\n\nShow information about t...,"[{'name': 'CSS', 'bytes': '27336'}, {'name': '..."
4,wdi-hk-sep-2014/PepperLunchClone,README.rdoc,== README\n\nThis README would normally docume...,"[{'name': 'CSS', 'bytes': '2444'}, {'name': 'C..."


In [7]:
# fastai Tokenizer built around tok_fn, with empty rule and special-case
# lists passed explicitly and 12 worker processes requested.
markdown_tokenizer = fastai.text.Tokenizer(
    tok_func=tok_fn, pre_rules=[], post_rules=[], special_cases=[], n_cpus=12,
)

In [8]:
# Each 'language' cell is a list of dicts; keep only their 'name' values.
raw_df['languages'] = raw_df['language'].apply(
    lambda entries: [entry['name'] for entry in entries]
)

In [9]:
selected_langs = ['python', 'r', 'matlab', 'julia', 'c++', 'java', 'scala']

# Keep repos that list at least one selected language (case-insensitive).
df = raw_df[raw_df['languages'].apply(lambda langs: any(lang.lower() in selected_langs for lang in langs))]
df = df.drop(['language'], axis=1)

# Drop rows without content, then rows with 25 or fewer whitespace-separated words.
df = df[(~df['content'].isna())]
df = df[df['content'].str.split().apply(len) > 25]

# Drop documents whose first character is '<' or whose last character is '>'
# (presumably raw HTML/XML rather than markdown -- TODO confirm intent).
# .copy() makes the result an independent frame so the column assignment
# below does not write into a slice of raw_df.
df = df[(df['content'].apply(itemgetter(0)) != '<') & (df['content'].apply(itemgetter(-1)) != '>')].copy()

# regex=True is passed explicitly: the default differs between pandas
# versions, and with regex=False the pattern would be matched as the literal
# text '#+' instead of "one or more '#' characters".
df['content'] = df['content'].str.replace(r'#+', 'xxhashtag', regex=True)

In [10]:
# Replace every run of digits with a single placeholder token. regex=True is
# explicit because the default differs between pandas versions; with
# regex=False the pattern would be matched as the literal text '[0-9]+'.
df['content'] = df['content'].str.replace(r'[0-9]+', 'xxnumber', regex=True)

In [None]:
# Cap on the number of rows used for the language-model dataset.
n_examples = 10_000
# NOTE(review): no visible cell creates a 'truncated_content' column on df --
# confirm which cell is expected to add it before this one runs.
lm_df = df[['repo_name', 'languages', 'truncated_content']].head(n_examples)

In [None]:
# Discard any row that has a missing value in any of the selected columns.
lm_df = lm_df.dropna(axis=0, how='any')

In [None]:
# Write only the 'truncated_content' column (plus the index) to CSV.
lm_df.to_csv('github_repos_lm_text_small.csv', columns=['truncated_content'])

In [None]:
import tqdm

# Renumber rows 0..n-1 so the positionally built Series below lines up with
# lm_df when it is assigned as a column.
lm_df.index = pd.RangeIndex(len(lm_df))

# Tokenize every document (with a progress bar) and collect the token lists.
token_lists = [tokenize_markdown(document) for document in tqdm.tqdm(lm_df['truncated_content'])]
extracted_content = pd.Series(token_lists)

# Store each document as its tokens joined by single spaces.
lm_df['text'] = extracted_content.apply(' '.join)

# Keep only rows whose text is present and non-empty, then export that column.
has_text = lm_df['text'].notna() & (lm_df['text'].apply(len) > 0)
lm_df = lm_df[has_text]
lm_df[['text']].to_csv('github_repos_lm_text.csv')

### Load into the fastai API

In [16]:
%%time
data_lm = TextLMDataBunch.from_csv(
    '', 'github_repos_lm_text.csv'
)
data_lm.save('data_lm_export.pkl')

CPU times: user 8.42 s, sys: 3.38 s, total: 11.8 s
Wall time: 37.7 s


In [17]:
# Batch size passed to load_data when the saved DataBunch is reloaded.
bs = 64

In [18]:
# Reload the saved DataBunch with the chosen batch size and bptt=50.
# load_data is referenced through fastai.text because the bare name is never
# imported in this notebook.
data_lm = fastai.text.load_data('', 'data_lm_export.pkl', bs=bs, bptt=50)

In [44]:
# Show the DataBunch repr as the cell output.
data_lm

TextLMDataBunch;

Train: LabelList (55999 items)
x: LMTextList
xxbos xxrep 13 = xxmaj cookiecutter xxrep 13 = .. image : : https : / / badge . fury . io / py / cookiecutter . png : target : http : / / badge . fury . io / py / cookiecutter .. image : : https : / / travis - ci . org / audreyr / cookiecutter . png ? branch = master : target : https : / / travis - ci . org / audreyr / cookiecutter .. image : : https : / / ci . appveyor . com / api / projects / status / github / audreyr / cookiecutter ? branch = master : target : https : / / ci . appveyor . com / project / audreyr / cookiecutter / branch / master .. image : : https : / / pypip . in / d / cookiecutter / badge . png : target : https : / / crate . io / packages / cookiecutter ? version = latest .. image : : https : / / codecov . io / github / audreyr / cookiecutter / coverage . svg ? branch = master : target : https : / / codecov . io / github / audreyr / cookiecutter ? branch = master .. image : : https : / / badges . gitter 