In [1]:
# Shell escape: install the GitPython package with conda; -y answers the
# confirmation prompt automatically so the cell runs unattended.
!conda install -y GitPython

Fetching package metadata .........
Solving package specifications: .

# All requested packages already installed.
# packages in environment at /Users/user/anaconda3:
#
gitpython                 2.1.3                    py36_0  


In [2]:
from git import Repo
import pandas as pd
import re

In [3]:
# Local clone to mine; the path is relative to the current user's home directory (~).
REPO_PATH = '~/youtube-dl'
repo = Repo(REPO_PATH)

In [4]:
# Column-oriented accumulator: one empty list per field, filled in commit by commit.
# The patch column is called 'diffs' (plural) so that it does not clash with
# the pd.DataFrame.diff method.
data = {
    column: []
    for column in (
        'author_email',
        'authored_date',
        'summary',
        'diffs',
        'num_add',
        'num_del',
    )
}

# Walk the history of master from the newest commit backwards, always following
# the first parent, and append one record per visited commit to `data`.
this = repo.commit('master')
cnt = 0

# A commit without parents ends the walk. That commit itself is not recorded,
# because there is no parent to diff it against.
while this.parents:
    parent = this.parents[0]

    # Look the stats up once and reuse them for both counters.
    totals = this.stats.total
    # a.diff(b) describes the change going from a to b, so diff from the parent
    # to this commit to get the patch the commit introduced (not its reverse).
    patches = parent.diff(this, create_patch=True)

    record = {
        'author_email': this.author.email,
        'authored_date': this.authored_date,
        'summary': this.summary,
        'diffs': '\r\n\r\n'.join(map(str, patches)),
        'num_add': totals['insertions'],
        'num_del': totals['deletions']
    }

    for key, value in record.items():
        data[key].append(value)

    this = parent

    # Lightweight progress report every 1000 commits.
    cnt += 1
    if cnt % 1000 == 0:
        print('%d commits loaded' % cnt)

1000 commits loaded
2000 commits loaded
3000 commits loaded
4000 commits loaded
5000 commits loaded
6000 commits loaded
7000 commits loaded
8000 commits loaded
9000 commits loaded
10000 commits loaded
11000 commits loaded
12000 commits loaded


In [5]:
# Turn the dict of per-field lists into a DataFrame (one row per commit).
data = pd.DataFrame(data)

# format, clean up, and new feature
blob_header = re.compile(r'\n[lr]hs: \d+ \| [0-9a-f]+')
identifier = re.compile(r'[A-Za-z][A-Za-z0-9]+')

# Epoch seconds -> pandas timestamps.
data['authored_date'] = pd.to_datetime(data['authored_date'], unit='s')
# Strip the "lhs: <mode> | <hash>" / "rhs: <mode> | <hash>" lines from each patch text.
data['diffs'] = data['diffs'].map(lambda patch: blob_header.sub('', patch))
# New feature: every alphanumeric token (letter first, at least two characters) in the patch.
data['words'] = data['diffs'].map(identifier.findall)

data.sample(5)

Unnamed: 0,author_email,authored_date,diffs,num_add,num_del,summary,words
8439,phihag@phihag.de,2014-07-11 13:38:18,youtube_dl/extractor/common.py\n==============...,60,13,[screencast] Add suppot for more video types (...,"[youtube, dl, extractor, common, py, class, In..."
11467,phihag@phihag.de,2012-11-27 23:13:40,youtube_dl/InfoExtractors.py\n================...,2,2,One more except..as,"[youtube, dl, InfoExtractors, py, class, Sound..."
4541,marco.ferragina@gmail.com,2015-10-14 09:11:52,docs/supportedsites.md\n======================...,84,0,[vidto] Add extractor,"[docs, supportedsites, md, VideoTt, video, tt,..."
11145,jaime.marquinez.ferrandiz@gmail.com,2013-04-20 10:50:14,youtube_dl/FileDownloader.py\n================...,21,12,Allows to specify which IE should be used for ...,"[youtube, dl, FileDownloader, py, if, os, name..."
819,remitamine@gmail.com,2017-02-21 13:38:00,youtube_dl/extractor/ninecninemedia.py\n======...,1,0,[ninecninemedia] use geo bypass mechanism,"[youtube, dl, extractor, ninecninemedia, py, c..."


In [6]:
# Training split: every commit authored before 2017-01-01.
is_before_2017 = data['authored_date'] < '2017-01-01'
train = data[is_before_2017]
train.sample(5)

Unnamed: 0,author_email,authored_date,diffs,num_add,num_del,summary,words
4285,dstftw@gmail.com,2015-12-11 15:11:45,youtube_dl/extractor/funimation.py\n==========...,2,1,[funimation] PEP 8,"[youtube, dl, extractor, funimation, py, from,..."
3718,dstftw@gmail.com,2016-02-11 16:02:37,youtube_dl/extractor/pbs.py\n=================...,8,4,[pbs] Fix multi part videos extraction,"[youtube, dl, extractor, pbs, py, class, PBSIE..."
8274,phihag@phihag.de,2014-08-21 23:52:56,youtube_dl/extractor/__init__.py\n============...,148,0,Merge remote-tracking branch 'terminalmage/add...,"[youtube, dl, extractor, init, py, from, orf, ..."
10511,phihag@phihag.de,2013-08-28 10:47:38,youtube_dl/extractor/addanime.py\n============...,0,6,Remove unused imports,"[youtube, dl, extractor, addanime, py, import,..."
4884,dstftw@gmail.com,2015-09-26 19:10:39,youtube_dl/extractor/eagleplatform.py\n=======...,3,1,[eagleplatform] Simplify secure mp4 constructi...,"[youtube, dl, extractor, eagleplatform, py, cl..."


In [7]:
# Test split: commits authored on or after 2017-01-01.
from_2017 = data[data['authored_date'] >= '2017-01-01']

# Keep the commit summaries aside as the expected answers and remove them
# from the frame that is handed out as the test set.
test_ans = from_2017['summary']
test = from_2017.drop('summary', axis='columns')
test.sample(5)

Unnamed: 0,author_email,authored_date,diffs,num_add,num_del,words
485,dstftw@gmail.com,2017-04-15 13:30:51,youtube_dl/extractor/brightcove.py\n==========...,34,25,"[youtube, dl, extractor, brightcove, py, class..."
1093,dstftw@gmail.com,2017-01-18 16:10:00,.github/ISSUE_TEMPLATE.md\n===================...,5,5,"[github, ISSUE, TEMPLATE, md, Make, sure, you,..."
331,remitamine@gmail.com,2017-05-07 07:58:34,youtube_dl/extractor/nbc.py\n=================...,29,65,"[youtube, dl, extractor, nbc, py, from, utils,..."
221,dstftw@gmail.com,2017-05-28 17:33:24,ChangeLog\n===================================...,11,0,"[ChangeLog, version, unreleased, Extractors, y..."
495,dstftw@gmail.com,2017-04-13 17:29:36,ChangeLog\n===================================...,20,0,"[ChangeLog, version, unreleased, Core, downloa..."


In [8]:
# Write each split to a CSV file in the current working directory.
for frame, filename in (
    (train, 'train.csv'),
    (test, 'test.csv'),
    (test_ans, 'test_ans.csv'),
):
    frame.to_csv(filename)