In [1]:
import pandas as pd


In [2]:
# Load the gzip-compressed, line-delimited JSON dump (one record per line)
# into a DataFrame.
repos_df = pd.read_json(
    '../data/github_repos_000000000000.json.gz',
    lines=True,
    compression='gzip',
)

In [3]:
# Summarize the loaded frame: columns, non-null counts, dtypes and memory usage.
repos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272876 entries, 0 to 272875
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   repo_name  272876 non-null  object
 1   content    272847 non-null  object
 2   language   272876 non-null  object
dtypes: object(3)
memory usage: 6.2+ MB


In [4]:
# Replace missing `content` values with an empty string so every entry is a str.
# The filled column is assigned back to the frame rather than written through
# chained indexing (`df[col][mask] = ...`): the chained form may write to a
# temporary copy, triggers SettingWithCopyWarning, and does not update the
# frame at all when pandas Copy-on-Write is enabled.
repos_df['content'] = repos_df['content'].fillna('')

In [5]:
def merge_text_langs(langs):
    """Join the ``'name'`` of every language record into one string.

    Parameters
    ----------
    langs : iterable of mapping
        Language records; each one must provide a ``'name'`` key
        (e.g. ``{'name': 'CSS', 'bytes': '1174'}``).

    Returns
    -------
    str
        The names separated by single spaces, in input order; an empty
        string when ``langs`` is empty.

    Raises
    ------
    KeyError
        If a record has no ``'name'`` key.
    """
    names = [entry['name'] for entry in langs]
    return ' '.join(names)

In [6]:
# Collapse each repository's list of language records into a single
# space-separated string of language names.
repos_df['languages_str'] = repos_df['language'].map(merge_text_langs)

In [7]:
# Flag every repository whose joined language string contains "Python" ...
repos_df['is_python_repo'] = repos_df['languages_str'].str.contains('Python')
# ... and keep only the flagged rows.
python_repos_df = repos_df.loc[repos_df['is_python_repo']]

In [8]:
# Show the subset of repositories flagged as Python repos.
python_repos_df

Unnamed: 0,repo_name,content,language,languages_str,is_python_repo
9,J-cztery/jenkins-inheritance-plugin,"This project contains the source code of the ""...","[{'name': 'CSS', 'bytes': '1174'}, {'name': 'G...",CSS Groovy HTML Java JavaScript Python,True
28,NilsJPWerner/scgbot,# scgbot\nBot for the UChicago senior class gi...,"[{'name': 'HTML', 'bytes': '3933'}, {'name': '...",HTML Python,True
29,FinnOD/text,# terrain\nprocedural terrain testing\n,"[{'name': 'HTML', 'bytes': '10477'}, {'name': ...",HTML JavaScript Python,True
38,niranjan-hanumegowda/chef-repo,Overview\n========\n\nEvery Chef installation ...,"[{'name': 'Perl', 'bytes': '847'}, {'name': 'P...",Perl Python Ruby Shell,True
42,Darmwind/FreeDMS,# FreeDMS\nA structured way to organize your d...,"[{'name': 'Groff', 'bytes': '1335'}, {'name': ...",Groff Python Shell,True
...,...,...,...,...,...
272204,apokellypse/shout-webapp,# shout-webapp\r\n\r\n# What is [sh]out?\r\n\r...,"[{'name': 'CSS', 'bytes': '112'}, {'name': 'HT...",CSS HTML JavaScript Python,True
272214,weidazh/lunchere,Lunchere is a web platform for users to pick w...,"[{'name': 'CSS', 'bytes': '33500'}, {'name': '...",CSS JavaScript PHP Python Shell,True
272215,joshuasnowball/INERT-dmp,dmp\n===\n\nDiscreet visual aggregator\n\nAugu...,"[{'name': 'CSS', 'bytes': '13618'}, {'name': '...",CSS JavaScript PHP Python Shell,True
272217,digitarald/d2g,Distribute to Gecko aka D2G\n=================...,"[{'name': 'CSS', 'bytes': '309795'}, {'name': ...",CSS JavaScript Python Scala Shell,True


In [9]:
import scattertext as st
import spacy

In [10]:
# Draw a 10% random sample of the repositories. A fixed `random_state` makes
# the sample (and every result derived from it below) reproducible across
# kernel restarts; without it each run selects different rows.
repos_df_sample = repos_df.sample(frac=0.1, random_state=42)
# Display (rows, columns) of the sample.
repos_df_sample.shape

(27288, 5)

In [11]:
import dragnet
import markdown2

In [12]:
# Inspect the sampled frame.
repos_df_sample

Unnamed: 0,repo_name,content,language,languages_str,is_python_repo
260946,dcorderoch/c-fizzbuzz-adventures,# FizzBuzz\n\nimplemented in the C programming...,"[{'name': 'C', 'bytes': '3452'}, {'name': 'Mak...",C Makefile,False
158910,dagirard/DataTree,----------------------------------------------...,[],,False
150767,timfallmk/dcos-cli-docker,# dcos-cli-docker\nDCOS CLI preprepared in a D...,[],,False
205835,pbrisbin/airs,"# Airs\n\nThese days, we all keep up with tele...","[{'name': 'Ruby', 'bytes': '6889'}]",Ruby,False
34896,beomagi/cachewho,# cachewho\nA JSON/cmdline interactive key val...,"[{'name': 'HTML', 'bytes': '10217'}, {'name': ...",HTML Python,True
...,...,...,...,...,...
36009,lvicentesanchez/hackapp,HaskApp\n=======\n\nHaskell web application us...,"[{'name': 'Haskell', 'bytes': '2563'}]",Haskell,False
30758,LaudaDev/sep-issuer-bank,#SEP 2015 - Issuer Bank\n\nREST service for au...,"[{'name': 'Java', 'bytes': '31122'}, {'name': ...",Java JavaScript,False
242441,RedBeard0531/ycmd,ycmd: a code-completion & comprehension server...,"[{'name': 'C', 'bytes': '3657826'}, {'name': '...",C C# C++ CMake Go JavaScript Objective-C Perl ...,True
271159,gitawego/cordova-webview-setting,cordova-webview-setting\n====================\...,"[{'name': 'Java', 'bytes': '1659'}, {'name': '...",Java JavaScript,False


In [13]:
def convert_markdown(markdown_text):
    """Convert Markdown source text with ``markdown2``.

    A new ``markdown2.Markdown`` converter with default options is created
    on every call, so calls share no state.

    Parameters
    ----------
    markdown_text : str
        Markdown source to convert.

    Returns
    -------
    The result of ``Markdown.convert`` (rendered HTML text).
    """
    return markdown2.Markdown().convert(markdown_text)

In [14]:
from mlutil import parallel



In [15]:
%%time
# Convert every sampled `content` value with `convert_markdown` and store the
# results in a new `html_content` column.
# NOTE(review): `parallel.mapp` is a project helper (mlutil); presumably a
# parallel map that yields results in input order -- the positional column
# assignment below relies on that ordering. TODO confirm.
repos_df_sample['html_content'] = list(parallel.mapp(convert_markdown, repos_df_sample['content'], chunksize=10))

CPU times: user 1.37 s, sys: 417 ms, total: 1.78 s
Wall time: 2min 9s


In [16]:
# Spot-check the HTML produced for the first sampled repository.
repos_df_sample['html_content'].iloc[0]

'<h1>FizzBuzz</h1>\n\n<p>implemented in the C programming Language (C99)</p>\n\n<p>simply having some fun and reviewing conditional compilation with make and the C pre-processor</p>\n\n<h1>Compilation</h1>\n\n<p><code>make</code> to simply compile</p>\n\n<p><code>make NUMBER=1</code> to enable printing the numbers before fuzz, buzz, fizzbuzz, or the number</p>\n\n<p><code>make SLEEP=1</code> to enable a small delay before each print</p>\n\n<p><code>make DEBUG</code> to enable the <code>-g</code> compile flag to add debug symbols</p>\n\n<p>and they can be combined to add numbers and delay, numbers and debug symbols, etc.</p>\n'

In [17]:
# Check the shape again: same number of rows, one extra column (`html_content`).
repos_df_sample.shape

(27288, 6)

In [None]:
%%time
nlp = spacy.load('en')
corpus = st.CorpusFromPandas(repos_df_sample, 
    category_col='is_python_repo', 
    text_col='content',
    nlp=nlp).build()