In [1]:
# open-uri is needed by the GitHub profile probe further down; pycall/import
# supplies the pyimport helper used to load Python modules.
require 'open-uri'
require 'pycall/import'
include PyCall::Import

# Python libraries, exposed under their conventional short names.
pyimport :pandas, as: :pd
pyimport :numpy, as: :np
# Handle to PyCall's builtins (not referenced by the other visible cells).
pyblt = PyCall::builtins
# Short aliases for the PyCall wrapper classes.
Dict = PyCall::Dict
List = PyCall::List

<class 'list'>

In [2]:
# Reads a committers log and returns it as a pandas DataFrame.
#
# Every non-blank line is expected to look like "<commits>\t<author> <<addr>>"
# (NOTE(review): this looks like `git shortlog -sne` output -- confirm).
# Blank lines are skipped and trailing whitespace, including the "\r" left by
# CRLF line endings, is removed before a line is parsed.
#
# @param file [String] path of the log file
# @return [Object] DataFrame with the columns commits, author and addr
def read_committers_log(file)
    lines = File.foreach(file).map(&:rstrip).reject(&:empty?)
    committers = lines.map do |line|
        commits, identity = line.split("\t")
        user, addr = identity.split(' <')
        # Remove the closing ">" of the "<addr>" part.
        [commits.to_i, user, addr.chomp('>')]
    end
    pd.DataFrame.new(data: committers, columns: [:commits, :author, :addr])
end

:read_committers_log

In [3]:
# Load the committers log into a DataFrame (columns: commits, author, addr).
df = read_committers_log('ruby_committers.log')

Unnamed: 0,commits,author,addr
0,16567,nobu,nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
1,4746,akr,akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2,4340,svn,svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
3,2728,naruse,naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
4,2562,matz,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
...,...,...,...
279,1,ujihisa,ujihisa@users.noreply.github.com
280,1,y-yagi,yuuji.yaginuma@gmail.com
281,1,yuuji.yaginuma,yuuji.yaginuma@gmail.com
282,1,zsombor,zsombor@b2dd03c8-39d4-4d8f-98ff-823fe69b080e


In [4]:
# Number of distinct values in the author column.
df.author.unique().size

267

In [5]:
# Aggregator: joins all values of a grouped series into one comma-separated string.
addr_join = proc {|s| s.tolist.to_a.join(',')}
# Collapse rows that share an author: commits are summed and addresses joined,
# then rows are ordered by commit count (largest first) and the index is reset.
df_uniq_author = df.groupby(:author).agg(Dict.new({commits: :sum, addr: addr_join}))
    .sort_values(:commits, ascending: false).reset_index

Unnamed: 0,author,commits,addr
0,nobu,16567,nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
1,akr,4746,akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2,svn,4340,svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
3,naruse,2728,naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
4,matz,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
...,...,...,...
262,Yaw Boakye,1,wheresyaw@gmail.com
263,Will Binns,1,will@trek.io
264,Tomoki Aonuma,1,uasi@uasi.jp
265,Thomas Leitner,1,t_leitner@gmx.at


In [6]:
# Derives a login-like name from every e-mail address of each row.
#
# frame.addr must yield comma-joined address strings. The result holds, per
# row and in row order, one comma-joined string of the distinct derived names.
addr_user = proc do |frame|
    frame.addr.tolist.map do |joined|
        names = joined.split(',').map do |address|
            local, host = address.split('@')
            if local == 'mail'
                # Generic "mail@" mailbox: use the first label of the domain instead.
                host.split('.').first
            elsif %w[svn svn-admin].include?(local)
                'matzbot'
            elsif host == 'users.noreply.github.com'
                # Exactly two "+"-separated pieces yield the second one;
                # any other shape yields the first piece.
                pieces = local.split('+')
                pieces.size == 2 ? pieces.last : pieces.first
            else
                local
            end
        end
        names.uniq.join(',')
    end
end

# Add an addr_user column whose values are produced by the addr_user proc.
df_addr_user = df_uniq_author.assign(addr_user: addr_user)

Unnamed: 0,author,commits,addr,addr_user
0,nobu,16567,nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu
1,akr,4746,akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr
2,svn,4340,svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matzbot
3,naruse,2728,naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,naruse
4,matz,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matz
...,...,...,...,...
262,Yaw Boakye,1,wheresyaw@gmail.com,wheresyaw
263,Will Binns,1,will@trek.io,will
264,Tomoki Aonuma,1,uasi@uasi.jp,uasi
265,Thomas Leitner,1,t_leitner@gmx.at,t_leitner


In [7]:
# Aggregator: keeps the longest author string of a group.
max_author =  proc {|s| s.tolist.max_by{|e| e.size}}
# Merge rows that share an addr_user: commits are summed, addresses joined and
# the longest author string kept; rows are then ordered by commit count
# (largest first) and the index is reset.
df_uniq_addr_user = df_addr_user.groupby(:addr_user).agg(Dict.new({commits: :sum, addr: addr_join, author: max_author}))
    .sort_values(:commits, ascending: false).reset_index

Unnamed: 0,addr_user,commits,addr,author
0,nobu,17733,"nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu...",Nobuyoshi Nakada
1,akr,4759,"akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr@f...",Tanaka Akira
2,matzbot,4686,"svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,svn@b...",svn
3,naruse,2760,"naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,na...","NARUSE, Yui"
4,matz,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matz
...,...,...,...,...
232,richard.schneeman+foo,1,richard.schneeman+foo@gmail.com,schneems
233,rafaelmfranca,1,rafaelmfranca@gmail.com,Rafael Mendonça França
234,pvalena,1,pvalena@redhat.com,Pavel Valena
235,poketo7878,1,poketo7878@gmail.com,Pocket7878


In [8]:
# Reports whether https://github.com/<user>/ can be fetched.
#
# URI.open (from open-uri) is used instead of Kernel#open: opening URLs through
# Kernel#open was deprecated in Ruby 2.7 and removed in 3.0. The block form
# closes the response as soon as the request has succeeded.
#
# Any StandardError -- HTTP error status, invalid URI, network failure -- is
# reported as false, so a false result does not prove the account is missing.
#
# @param user [String] candidate GitHub login
# @return [Boolean] true when the profile URL could be opened
def check_github_user(user)
    URI.open("https://github.com/#{user}/") { true }
rescue StandardError
    false
end

# Probe every addr_user in row order, pausing one second before each request
# (presumably to stay clear of rate limiting -- confirm).
is_github_user = df_uniq_addr_user.addr_user.tolist.map do |e|
    sleep(1)
    check_github_user(e)
end

[true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, true, true, true, true, false, false, true, true, false, true, true, true, true, true, true, true, true, true, false, true, true, true, true, true, true, false, false, true, true, true, false, true, false, true, true, true, false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, true, true, false, true, true, true, false, true, true, true, true, false, false, false, true, true, true, true, true, true, false, false, true, false, false, false, true, false, true, true, false, false, false, true, true, true, true, true, false, false, true, true, true, true, true, true, true, true, true, false, false, true, false, true, true, false, true, false, true, true, false, false, false, false, false, false, false, false, true, true,

In [9]:
# Attach the probe results as an is_github_user column.
df_github_user = df_uniq_addr_user.assign(is_github_user: is_github_user)

Unnamed: 0,addr_user,commits,addr,author,is_github_user
0,nobu,17733,"nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu...",Nobuyoshi Nakada,True
1,akr,4759,"akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr@f...",Tanaka Akira,True
2,matzbot,4686,"svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,svn@b...",svn,True
3,naruse,2760,"naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,na...","NARUSE, Yui",True
4,matz,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matz,True
...,...,...,...,...,...
232,richard.schneeman+foo,1,richard.schneeman+foo@gmail.com,schneems,False
233,rafaelmfranca,1,rafaelmfranca@gmail.com,Rafael Mendonça França,False
234,pvalena,1,pvalena@redhat.com,Pavel Valena,True
235,poketo7878,1,poketo7878@gmail.com,Pocket7878,False


In [10]:
# Builds the tmp_user values for a frame: the addr_user itself when the row's
# is_github_user flag is truthy, otherwise the addr_user prefixed with "XXXX_".
add_tmp_user = proc do |frame|
    flags = frame.is_github_user.tolist
    names = frame.addr_user.tolist
    flags.zip(names).map do |on_github, name|
        if on_github
            name
        else
            "XXXX_" + name
        end
    end
end

# Add a tmp_user column whose values are produced by the add_tmp_user proc.
df_tmp_user = df_github_user.assign(tmp_user: add_tmp_user)

Unnamed: 0,addr_user,commits,addr,author,is_github_user,tmp_user
0,nobu,17733,"nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu...",Nobuyoshi Nakada,True,nobu
1,akr,4759,"akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr@f...",Tanaka Akira,True,akr
2,matzbot,4686,"svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,svn@b...",svn,True,matzbot
3,naruse,2760,"naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,na...","NARUSE, Yui",True,naruse
4,matz,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matz,True,matz
...,...,...,...,...,...,...
232,richard.schneeman+foo,1,richard.schneeman+foo@gmail.com,schneems,False,XXXX_richard.schneeman+foo
233,rafaelmfranca,1,rafaelmfranca@gmail.com,Rafael Mendonça França,False,XXXX_rafaelmfranca
234,pvalena,1,pvalena@redhat.com,Pavel Valena,True,pvalena
235,poketo7878,1,poketo7878@gmail.com,Pocket7878,False,XXXX_poketo7878


In [11]:
# Remove the addr_user column; tmp_user carries the name from here on.
df_mod_drop = df_tmp_user.drop(columns: [:addr_user])

Unnamed: 0,commits,addr,author,is_github_user,tmp_user
0,17733,"nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu...",Nobuyoshi Nakada,True,nobu
1,4759,"akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr@f...",Tanaka Akira,True,akr
2,4686,"svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,svn@b...",svn,True,matzbot
3,2760,"naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,na...","NARUSE, Yui",True,naruse
4,2562,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,matz,True,matz
...,...,...,...,...,...
232,1,richard.schneeman+foo@gmail.com,schneems,False,XXXX_richard.schneeman+foo
233,1,rafaelmfranca@gmail.com,Rafael Mendonça França,False,XXXX_rafaelmfranca
234,1,pvalena@redhat.com,Pavel Valena,True,pvalena
235,1,poketo7878@gmail.com,Pocket7878,False,XXXX_poketo7878


In [12]:
# Put the columns in their output order, sort by commits (largest first) and
# then by tmp_user (ascending), and replace the index with a fresh one.
df_tmp_out = df_mod_drop.reindex(columns: [:commits, :tmp_user, :author, :is_github_user, :addr])
    .sort_values([:commits, :tmp_user], ascending: [false, true]).reset_index(drop: true)

Unnamed: 0,commits,tmp_user,author,is_github_user,addr
0,17733,nobu,Nobuyoshi Nakada,True,"nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,nobu..."
1,4759,akr,Tanaka Akira,True,"akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,akr@f..."
2,4686,matzbot,svn,True,"svn@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,svn@b..."
3,2760,naruse,"NARUSE, Yui",True,"naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e,na..."
4,2562,matz,matz,True,matz@b2dd03c8-39d4-4d8f-98ff-823fe69b080e
...,...,...,...,...,...
232,1,uasi,Tomoki Aonuma,True,uasi@uasi.jp
233,1,ujihisa,ujihisa,True,ujihisa@users.noreply.github.com
234,1,will,Will Binns,True,will@trek.io
235,1,ybiquitous,Masafumi Koba,True,473530+ybiquitous@users.noreply.github.com


In [13]:
# Write the result to ruby_committers_tmp.csv without the index column.
df_tmp_out.to_csv("ruby_committers_tmp.csv", index: false)