# Prerequisites

In [1]:
import datalake
import datetime
import github
import logging
import pymongo
import os
import yaml

# Configure logging: DEBUG for this notebook's own code, WARNING for third-party libraries
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)-6s - %(name)15s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
for library_name in ("requests", "urllib3", "hdfs"):
    logging.getLogger(library_name).setLevel(logging.WARNING)

# Read credentials from a local YAML file (credentials.yaml is never checked in to GitHub!)
with open('credentials.yaml') as fh:
    credentials = yaml.safe_load(fh)

# Build the (self-authored) GitHub API client, then log the current rate limits
github_credentials = credentials['github']
g = github.GitHub(github_credentials['username'], github_credentials['token'])
g.log_limits()

# Build the (self-authored) DataLake tool for a given zone, source, and data type.
# NOTE: this rebinds the name `datalake` from the imported module to the instance;
# later cells rely on `datalake` being the instance.
datalake = datalake.DataLake('raw', 'api.github.com', 'repository')

# Connect to MongoDB (which is running in another container/host called 'mongodb')
mongo = pymongo.MongoClient(host='mongodb')

# Keep a handle on the 'repositories' collection within the 'github' database
repositories = mongo['github']['repositories']

2021-09-27 01:17:32 - DEBUG  -          GitHub - Remaining core: 4992, remaining search: 30


# Retrieve from GitHub API, Store in HDFS

In [2]:
# Search query applied once per language; {lang} is filled in inside the loop
QUERY_TEMPLATE = 'is:public stars:>=9000 archived:False language:{lang}'


def store_repository(results):
    """Store one search result in the data lake as JSON, named by its GitHub 'id'."""
    return datalake.store_json(results['id'],
                               results,
                               contentType=github.GitHub.CONTENT_TYPE)


# Run the search for every language the GitHub client knows about, handing
# store_repository to the client as the callback
for language in github.GitHub.LANGUAGES:
    g.search_repositories(store_repository, QUERY_TEMPLATE.format(lang=language))

2021-09-27 01:17:32 - INFO   -          GitHub - Searching GitHub repositories: is:public stars:>=9000 archived:False language:JavaScript
2021-09-27 01:17:35 - DEBUG  -          GitHub - Total count: 594.  Page 1 of 6
2021-09-27 01:17:49 - DEBUG  -          GitHub - Total count: 594.  Page 2 of 6
2021-09-27 01:18:16 - DEBUG  -          GitHub - Total count: 594.  Page 3 of 6
2021-09-27 01:18:52 - DEBUG  -          GitHub - Total count: 594.  Page 4 of 6
2021-09-27 01:19:29 - DEBUG  -          GitHub - Total count: 594.  Page 5 of 6
2021-09-27 01:20:01 - DEBUG  -          GitHub - Total count: 594.  Page 6 of 6
2021-09-27 01:20:22 - INFO   -          GitHub - Searching GitHub repositories: is:public stars:>=9000 archived:False language:Rust
2021-09-27 01:20:23 - DEBUG  -          GitHub - Total count: 35.  Page 1 of 1
2021-09-27 01:20:29 - INFO   -          GitHub - Searching GitHub repositories: is:public stars:>=9000 archived:False language:Python
2021-09-27 01:20:32 - DEBUG  -       

# Verify HDFS Contents (and Metadata)

In [3]:
# List the files stored in the data lake; showAttrs=True also shows each file's attributes
datalake.list(showAttrs=True)

/raw/api.github.com/repository/2021/09/26/100060912.json                    {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/100401612.json                    {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/10054176.json                     {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/10057936.json                     {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/10064545.json                     {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/100982449.json                    {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/101033179.json                    {'contentType': 'application/vnd.github.v3+json'}
/raw/api.github.com/repository/2021/09/26/101109137.json                    {'contentType': 'application/vnd.github.v3

# Retrieve from HDFS, Store in MongoDB

In [4]:
# Timestamp fields to convert, and the string format they are parsed with
DATE_FIELDS = ('created_at', 'updated_at', 'pushed_at')
GITHUB_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

# Walk every file in the data lake and load each JSON document into MongoDB
for path, dirs, files in datalake.walk():
    for f in files:
        content = datalake.get_json(os.path.join(path, f))

        # Convert date fields to datetime (ISODate in MongoDB) to facilitate searching.
        # Fields that are missing or null are left untouched so that one incomplete
        # document does not abort the whole load partway through.
        # (assumes get_json returns a dict -- TODO confirm)
        for k in DATE_FIELDS:
            if content.get(k):
                content[k] = datetime.datetime.strptime(content[k], GITHUB_DATE_FORMAT)

        # Upsert keyed on the GitHub ID so that re-running this cell updates the
        # existing document instead of inserting a duplicate.
        # NOTE(review): no index on 'id' is created in the visible cells; consider
        # adding a unique index if this collection grows.
        repositories.update_one({'id': content['id']},
                                {'$set': content},
                                upsert=True)


# Verify Contents of MongoDB

In [5]:
%%bash

# Connection string for the 'github' database on the 'mongodb' host
MONGODB_URI="mongodb://mongodb:27017/github"

# Print the number of documents in 'repositories', then one sample document
mongosh --quiet --eval "db.repositories.countDocuments()" "$MONGODB_URI"
mongosh --quiet --eval "db.repositories.findOne()" "$MONGODB_URI"


2103
{
  _id: ObjectId("61510278ed66e53e62a2884e"),
  id: 100060912,
  allow_forking: true,
  archive_url: 'https://api.github.com/repos/microsoft/terminal/{archive_format}{/ref}',
  archived: false,
  assignees_url: 'https://api.github.com/repos/microsoft/terminal/assignees{/user}',
  blobs_url: 'https://api.github.com/repos/microsoft/terminal/git/blobs{/sha}',
  branches_url: 'https://api.github.com/repos/microsoft/terminal/branches{/branch}',
  clone_url: 'https://github.com/microsoft/terminal.git',
  collaborators_url: 'https://api.github.com/repos/microsoft/terminal/collaborators{/collaborator}',
  comments_url: 'https://api.github.com/repos/microsoft/terminal/comments{/number}',
  commits_url: 'https://api.github.com/repos/microsoft/terminal/commits{/sha}',
  compare_url: 'https://api.github.com/repos/microsoft/terminal/compare/{base}...{head}',
  contents_url: 'https://api.github.com/repos/microsoft/terminal/contents/{+path}',
  contributors_url: 'https://api.github.com/repos/mi