In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
from plotly.graph_objs import *
from altair import *

In [10]:
# Load the per-hour pull-event / total-event counts. The "Unnamed: 0" column
# (presumably an index written out by an earlier to_csv) is discarded.
data = (
    pd.read_csv("/share/GithubAnalysis/pullFiles/Pulls_VS_Total.csv")
    .drop(columns=["Unnamed: 0"])
)

In [37]:

# Preview the first five rows to confirm the columns loaded as expected.
data.head(5)

Unnamed: 0,day,hour,pull_events,total_events,percent_pulls
0,1,0,3552,39026,0.091016
1,1,1,3651,40997,0.089055
2,1,2,2601,41247,0.063059
3,1,3,2590,39555,0.065478
4,1,4,2206,32467,0.067946


In [42]:
# Plot the number of pull events per row of `data` together with pull events
# as a share of all events, on a shared x-axis (row index of `data`).

# percent_pulls is a fraction of total events; it is multiplied by this factor
# only so that the curve is visible on the same y-axis as the raw pull counts.
PERCENT_SCALE = 10000

traces = [
    Scatter(
        x=data.index,
        y=data.pull_events,
        name="# Pull Events",
    ),
    Scatter(
        x=data.index,
        y=data.percent_pulls * PERCENT_SCALE,
        name="% Pull Events",
    ),
]

# Tick styling shared by both axes. `tickmode='linear'` replaces the removed
# `autotick=False` flag: ticks are placed at tick0 + k * dtick, and current
# plotly releases reject `autotick` as an invalid axis property.
tick_style = dict(
    tickmode='linear',
    ticks='outside',
    tick0=0,
    ticklen=5,
    tickwidth=2,
    tickcolor='#000',
)

layout = Layout(
    title="Pulls Over Time",
    # One x tick every 24 rows (one per day, assuming one row per hour).
    xaxis=dict(title='Time in hours across December', dtick=24, **tick_style),
    yaxis=dict(title="# of Pulls", dtick=1000, **tick_style),
)
fig = Figure(data=traces, layout=layout)
py.iplot(fig, filename='axes-ticks')

The % Pull Events line shows that the number of pull events is actually an accurate representation of GitHub activity as a whole.

From this graph we can see a weekday-versus-weekend pattern: notice the sequence of 5 large spikes followed by 2 smaller spikes. The first two weeks are fairly even, but the third week drops slightly and the last week (Christmas) shows about half the activity. Saturdays appear fairly regular, but Sunday nights stay fairly active. From these trends, it appears that most of GitHub's users are located in the same or similar time zones (probably the USA and Western Europe).

Note: Since time is kept in UTC, hour 0 corresponds to 5 PM SLO time, so most spikes occur around 11 AM.

In [44]:
# Load the combined pull-request records, discarding the "Unnamed: 0" column
# (presumably a saved index), then preview the first rows.
allData = (
    pd.read_csv("/share/GithubAnalysis/pullFiles/combined/combined.csv")
    .drop(columns=["Unnamed: 0"])
)
allData.head(5)

Unnamed: 0,additions,body,changed_files,created_at,deletions,description,event_time,forks_count,has_wiki,language,name,number_commits,open_issues_count,secondary_user,size,user_login,watchers_count
0,81.0,,13.0,2015-10-20T18:21:36Z,178.0,"News, and upcoming releases from the Adventure...",2016-12-12 00:32:15,0,False,CSS,adventurerscodex.github.io,4.0,7,,33379,Sonictherocketman,1
1,366.0,Issue #316\r\n\r\nTorre Agbar:\r\nModel in ble...,13.0,2016-09-27T19:26:16Z,0.0,Video Game project 2016 Software Engineering. ...,2016-12-12 00:21:50,0,True,C#,ES2016B,5.0,17,,271098,jdomenmi7,8
2,104.0,"Updated ""Migration from older versions"" chapte...",2.0,2011-12-09T15:15:43Z,0.0,REST and JAXRS,2016-12-12 00:15:40,652,False,Java,Resteasy,1.0,46,,39365,ronsigal,546
3,4380.0,,4.0,2016-11-20T19:14:17Z,0.0,,2016-12-12 00:10:56,0,True,Jupyter Notebook,Renju,3.0,2,,1241,storandrew,0
4,1.0,,1.0,2016-12-11T16:27:13Z,0.0,,2016-12-12 00:45:07,0,True,,proyect,1.0,1,,0,jlog1994,0


In [48]:
# Show a bounded sample instead of rendering the entire frame: the table
# carries free-text columns (body, description), so a full dump bloats the
# notebook without adding information.
allData.head(10)

Unnamed: 0,additions,body,changed_files,created_at,deletions,description,event_time,forks_count,has_wiki,language,name,number_commits,open_issues_count,secondary_user,size,user_login,watchers_count
0,81.0,,13.0,2015-10-20T18:21:36Z,178.0,"News, and upcoming releases from the Adventure...",2016-12-12 00:32:15,0,False,CSS,adventurerscodex.github.io,4.0,7,,33379,Sonictherocketman,1
1,366.0,Issue #316\r\n\r\nTorre Agbar:\r\nModel in ble...,13.0,2016-09-27T19:26:16Z,0.0,Video Game project 2016 Software Engineering. ...,2016-12-12 00:21:50,0,True,C#,ES2016B,5.0,17,,271098,jdomenmi7,8
2,104.0,"Updated ""Migration from older versions"" chapte...",2.0,2011-12-09T15:15:43Z,0.0,REST and JAXRS,2016-12-12 00:15:40,652,False,Java,Resteasy,1.0,46,,39365,ronsigal,546
3,4380.0,,4.0,2016-11-20T19:14:17Z,0.0,,2016-12-12 00:10:56,0,True,Jupyter Notebook,Renju,3.0,2,,1241,storandrew,0
4,1.0,,1.0,2016-12-11T16:27:13Z,0.0,,2016-12-12 00:45:07,0,True,,proyect,1.0,1,,0,jlog1994,0
5,213.0,Fixed validation edge case with application sc...,7.0,2011-12-09T15:15:43Z,1.0,REST and JAXRS,2016-12-12 00:15:28,652,False,Java,Resteasy,1.0,47,,39365,ronsigal,546
6,10.0,,2.0,2016-05-06T09:11:07Z,2.0,Upload ISO image to SAKURA CLOUD.,2016-12-12 00:47:50,1,True,Go,sacloud-upload-image,1.0,0,,69,hnakamur,1
7,1.0,https://greenkeeper.io/,1.0,2016-11-28T00:14:43Z,1.0,Funkster is a compositional server library. Th...,2016-12-12 00:00:45,0,True,TypeScript,funkster-http-json,1.0,2,,96,Bomret,0
8,10527.0,,66.0,2016-11-17T16:07:24Z,0.0,,2016-12-12 00:58:29,16,True,PHP,blog,1.0,3,,2743,nima-vova,0
9,5.0,…he api,2.0,2016-11-14T22:53:26Z,3.0,Feeder - All You Need to Know,2016-12-12 00:15:24,0,True,HTML,feeder,1.0,1,,3608,npranto,0


__Analysis of Languages__

In [53]:
# Make sure the two columns used for the issue-density analysis are numeric
# before aggregating. Re-running this cell is safe: to_numeric leaves
# already-numeric columns unchanged.
allData["size"] = pd.to_numeric(allData["size"])
allData["open_issues_count"] = pd.to_numeric(allData["open_issues_count"])

# Per-language totals, smallest total size first. numeric_only=True restricts
# the sum to numeric columns; without it pandas >= 2.0 also tries to sum the
# text columns (body, description, name, ...) instead of silently dropping them.
# NOTE(review): has_wiki is kept only if it was parsed as bool/numeric — confirm dtype.
allData.groupby("language").sum(numeric_only=True).sort_values("size")

Unnamed: 0_level_0,additions,changed_files,deletions,forks_count,has_wiki,number_commits,open_issues_count,size,watchers_count,issuesBySize
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DTrace,1.0,1.0,0.0,0,1.0,1.0,0,2,0,0.000000
MQL4,10.0,1.0,5.0,0,1.0,1.0,1,11,0,0.090909
Papyrus,148.0,7.0,0.0,3,1.0,1.0,4,25,9,0.160000
GAMS,0.0,0.0,0.0,2,1.0,1.0,1,53,0,0.018868
LookML,21.0,3.0,4.0,6,2.0,1.0,3,103,2,0.270202
Cirru,5.0,1.0,1.0,11,1.0,1.0,3,110,52,0.027273
Squirrel,18.0,2.0,15.0,20,10.0,8.0,29,134,0,2.363636
CLIPS,1040.0,18.0,1322.0,8,3.0,15.0,10,162,12,0.131579
Diff,158.0,2.0,201.0,1,1.0,1.0,3,176,4,0.017045
Lex,847.0,19.0,1030.0,3,1.0,10.0,0,246,0,0.000000


In [67]:
# Build the table plotted below: the 25 languages with the largest total
# `size`, with open issues expressed per unit of size (times 100).
#
# - numeric_only=True keeps the sum to numeric columns; pandas >= 2.0 no
#   longer drops text columns automatically.
# - .assign() returns a new frame each time, so no column is written onto the
#   result of .head(), which is the pattern behind SettingWithCopyWarning.
temp = (
    allData.groupby("language")
    .sum(numeric_only=True)
    .sort_values("size", ascending=False)
    .assign(
        issuesBySize=lambda totals: totals["open_issues_count"] / totals["size"] * 100
    )
    .head(25)
    # Duplicate the index as a regular column so the plotting cell can filter
    # and label by language.
    .assign(language=lambda top: top.index)
)

temp

Unnamed: 0_level_0,additions,changed_files,deletions,forks_count,has_wiki,number_commits,open_issues_count,size,watchers_count,issuesBySize,language
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Java,123333578.0,1654397.0,55010442.0,10747161,40294.0,669983.0,2605519,14290571719,25999397,0.018232,Java
C++,72796641.0,551639.0,59126713.0,12660397,17582.0,330736.0,3244656,4264845262,37961881,0.076079,C++
JavaScript,215497495.0,2128466.0,106887722.0,26547035,79785.0,531182.0,6175883,3252415658,124521909,0.189886,JavaScript
Ruby,33522557.0,439716.0,9719095.0,12012490,29461.0,192783.0,4184286,2482043106,27357626,0.168582,Ruby
Python,88047775.0,573121.0,24449952.0,19108534,42070.0,333285.0,7594696,2460809269,51188786,0.308626,Python
DM,8239542.0,46846.0,8062869.0,1130512,1834.0,31850.0,2063338,2312505343,372117,0.089225,DM
HTML,127119625.0,1060892.0,59932619.0,14744305,22882.0,118521.0,1671885,1413611925,5202015,0.11827,HTML
PHP,63539328.0,743003.0,27588680.0,7533810,21340.0,183174.0,2083640,1325922718,17087943,0.157146,PHP
C#,73815911.0,505995.0,21399455.0,4059843,13587.0,95628.0,2864474,1280085439,13375985,0.223772,C#
Go,34854156.0,281416.0,16308600.0,13381478,11728.0,113281.0,7358320,1221862122,49844564,0.602222,Go


In [80]:
# Scatter of total size against issuesBySize, one marker trace per language so
# that each language gets its own legend entry and colour.
layout = Layout(
    title='Size (bytes) vs % Open Issues by Language (top 25 most common by size)',
    xaxis={'title': 'Size'},
    yaxis={'title': '% Open Issues'},
)

# Each trace holds the row(s) of `temp` whose language column equals `lang`.
split_traces = [
    Scatter(
        x=temp.loc[temp.language == lang, "size"],
        y=temp.loc[temp.language == lang, "issuesBySize"],
        mode='markers',
        name=lang,
    )
    for lang in temp.language
]

split_fig = Figure(data=split_traces, layout=layout)

py.iplot(split_fig)

Based on the above graph, it is clear that Java is the most commonly used programming language on GitHub when measured by total repository size. In addition, it is also the language that exhibited the lowest percentage of open issues relative to its size. This is likely because Java has been explored to a large extent and is very commonly used; common problems can therefore be solved with a Google search, so issues are less likely to occur. Contrast this with Julia (a new language for numerical computing that is similar to Python and MATLAB), which has quite a few more issues per size (1.25%). This is likely because Julia is a relatively new language and is mostly unexplored, so issues would be common even at a basic level.