In [1]:
import configparser
import pprint
from urllib.parse import quote_plus

from bson.json_util import loads
from IPython.display import clear_output
from pymongo import MongoClient, InsertOne, UpdateOne

In [2]:
# Load the connection settings from the local `mongo.cfg` file and open a
# handle on the collection queried by every cell below.
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so fail here with a clear message instead of an opaque
# KeyError on config['CLUSTER'] further down.
if not config.read('mongo.cfg'):
    raise FileNotFoundError("Could not read 'mongo.cfg' from the current working directory")

# The [CLUSTER] values fill the URI placeholders positionally:
# user, password, host, default database.
cluster_values = list(config['CLUSTER'].values())

# User name and password must be percent-escaped before they are embedded in
# a MongoDB URI; otherwise reserved characters such as '@', ':' or '/' break
# URI parsing.
# NOTE(review): this assumes mongo.cfg stores the raw, unescaped credentials — confirm.
cluster_values[:2] = [quote_plus(value) for value in cluster_values[:2]]

client = MongoClient('mongodb+srv://{}:{}@{}/{}?retryWrites=true&w=majority'.format(*cluster_values))

# The first value of the [DB] / [COLL] section names the database / collection.
db = '{}'.format(*config['DB'].values())
coll = '{}'.format(*config['COLL'].values())
github_coll = client[db][coll]

Events on a given day

In [5]:
# Day to inspect; compared against the 10-character prefix of `created_at`.
day = '2018-02-01'

# Keep only the actor/repo identification fields and derive the day prefix.
project_stage = {
    '$project': {
        '_id': 0,
        'created_at': 1,
        'actor_id': 1,
        'actor_login': 1,
        'repo_id': 1,
        'repo_name': 1,
        'daySubstring': {'$substr': ['$created_at', 0, 10]},
    }
}
# Retain the events whose day prefix equals `day` ...
match_stage = {'$match': {'daySubstring': day}}
# ... and list them in ascending `created_at` order.
sort_stage = {'$sort': {'created_at': 1}}

pipeline = [project_stage, match_stage, sort_stage]

clear_output()
day_events = list(github_coll.aggregate(pipeline))
pprint.pprint(day_events)

Number of events per month by user

In [7]:
# Flag field -> event type it marks. Each flag is 1 when `type` matches and 0
# otherwise, so summing a flag counts the events of that type.
flag_by_event_type = {
    'Issue': 'IssuesEvent',
    'PR': 'PullRequestEvent',
}

# Annotate every event with its month (7-character prefix of `created_at`)
# and the 0/1 flags above.
annotate_stage = {
    '$addFields': {
        'month': {'$substr': ['$created_at', 0, 7]},
        **{
            flag: {'$cond': [{'$eq': ['$type', event_type]}, 1, 0]}
            for flag, event_type in flag_by_event_type.items()
        },
    }
}

# One result document per (month, actor_login) with the summed flags.
group_stage = {
    '$group': {
        '_id': {'month': '$month', 'actor_login': '$actor_login'},
        'SUM_PullRequestEvent': {'$sum': '$PR'},
        'SUM_IssuesEvent': {'$sum': '$Issue'},
    }
}

# Order by the composite key: month first, then actor_login.
sort_stage = {'$sort': {'_id': 1}}

pipeline = [annotate_stage, group_stage, sort_stage]

clear_output()
events_by_user = list(github_coll.aggregate(pipeline))
pprint.pprint(events_by_user)

Number of events per month by repo

In [8]:
# Flag field -> event type it marks. Each flag is 1 when `type` matches and 0
# otherwise, so summing a flag counts the events of that type.
flag_by_event_type = {
    'Issue': 'IssuesEvent',
    'PR': 'PullRequestEvent',
    'Fork': 'ForkEvent',
}

# Annotate every event with its month (7-character prefix of `created_at`)
# and the 0/1 flags above.
annotate_stage = {
    '$addFields': {
        'month': {'$substr': ['$created_at', 0, 7]},
        **{
            flag: {'$cond': [{'$eq': ['$type', event_type]}, 1, 0]}
            for flag, event_type in flag_by_event_type.items()
        },
    }
}

# One result document per (month, repo_name) with the summed flags.
group_stage = {
    '$group': {
        '_id': {'month': '$month', 'repo_name': '$repo_name'},
        'SUM_PullRequestEvent': {'$sum': '$PR'},
        'SUM_IssuesEvent': {'$sum': '$Issue'},
        'SUM_ForkEvent': {'$sum': '$Fork'},
    }
}

# Order by the composite key: month first, then repo_name.
sort_stage = {'$sort': {'_id': 1}}

pipeline = [annotate_stage, group_stage, sort_stage]

clear_output()
events_by_repo = list(github_coll.aggregate(pipeline))
pprint.pprint(events_by_repo)

Number of active users by month

In [3]:
# Derive the month as the 7-character prefix of `created_at`.
month_stage = {'$addFields': {'month': {'$substr': ['$created_at', 0, 7]}}}

# Collapse the events to one document per (month, actor_id); SUM holds that
# actor's event count for the month.
per_user_stage = {
    '$group': {
        '_id': {'month': '$month', 'actor_id': '$actor_id'},
        'SUM': {'$sum': 1},
    }
}

# Count the per-actor documents of each month, i.e. the number of distinct
# actor_id values seen in that month.
per_month_stage = {
    '$group': {
        '_id': {'month': '$_id.month'},
        'active_users': {'$sum': 1},
    }
}

# Chronological output.
sort_stage = {'$sort': {'_id': 1}}

pipeline = [month_stage, per_user_stage, per_month_stage, sort_stage]

clear_output()
active_users_by_month = list(github_coll.aggregate(pipeline))
pprint.pprint(active_users_by_month)

Number of active repos by month (repos with at least one pull request event)

In [4]:
# Only pull request events are considered.
match_stage = {'$match': {'type': 'PullRequestEvent'}}

# Derive the month as the 7-character prefix of `created_at`.
month_stage = {'$addFields': {'month': {'$substr': ['$created_at', 0, 7]}}}

# Collapse the events to one document per (month, repo_id); SUM holds that
# repo's pull request event count for the month.
per_repo_stage = {
    '$group': {
        '_id': {'month': '$month', 'repo_id': '$repo_id'},
        'SUM': {'$sum': 1},
    }
}

# Count the per-repo documents of each month, i.e. the number of distinct
# repo_id values with a pull request event in that month.
per_month_stage = {
    '$group': {
        '_id': {'month': '$_id.month'},
        'active_repos_count': {'$sum': 1},
    }
}

# Chronological output.
sort_stage = {'$sort': {'_id': 1}}

pipeline = [match_stage, month_stage, per_repo_stage, per_month_stage, sort_stage]

clear_output()
active_repos_by_month = list(github_coll.aggregate(pipeline))
pprint.pprint(active_repos_by_month)

Top 20 repos by pull request events per month

In [6]:
# The 20 (month, repo_name) pairs with the most pull request events.
pipeline = [
    # Only pull request events are counted.
    {
        '$match': {'type': 'PullRequestEvent'}
    },
    # Derive the month as the 7-character prefix of `created_at`.
    {
        '$addFields': {'month': {'$substr': ["$created_at", 0, 7]}}
    },
    # One document per (month, repo_name) with its pull request event count.
    {
        '$group': {
            '_id': {
                "month": "$month",
                "repo_name": "$repo_name",
            },
            'sum_pull_request': {'$sum': 1}
        }
    },
    # Highest counts first. `sum_pull_request` is not unique, so the grouped
    # `_id` (unique after $group) is added as a tie-breaker: without it the
    # order of equal counts is unspecified and the documents kept by $limit
    # could differ from one run to the next.
    {
        '$sort': {'sum_pull_request': -1, '_id': 1}
    },
    {
        '$limit': 20
    }
]

clear_output()
pprint.pprint(list(github_coll.aggregate(pipeline)))

Top 20 users by pull request events per month

In [5]:
# The 20 (month, actor_login) pairs with the most pull request events.
pipeline = [
    # Only pull request events are counted.
    {
        '$match': {'type': 'PullRequestEvent'}
    },
    # Derive the month as the 7-character prefix of `created_at`.
    {
        '$addFields': {'month': {'$substr': ["$created_at", 0, 7]}}
    },
    # One document per (month, actor_login) with its pull request event count.
    {
        '$group': {
            '_id': {
                "month": "$month",
                "actor_login": "$actor_login",
            },
            'sum_pull_request': {'$sum': 1}
        }
    },
    # Highest counts first. `sum_pull_request` is not unique, so the grouped
    # `_id` (unique after $group) is added as a tie-breaker: without it the
    # order of equal counts is unspecified and the documents kept by $limit
    # could differ from one run to the next.
    {
        '$sort': {'sum_pull_request': -1, '_id': 1}
    },
    {
        '$limit': 20
    }
]

clear_output()
pprint.pprint(list(github_coll.aggregate(pipeline)))