In [13]:
import os, errno
import csv
import json
import time
import datetime

from pathlib import Path
from datetime import date
from dateutil.relativedelta import relativedelta

In [14]:
# CONFIGURATION
# Locations are relative paths; the folder values end with "/" because the
# cells below build file paths by plain string concatenation.
path_to_bronze = "Datalake/bronze/"   # folder holding the raw JSON files that get consolidated
path_to_silver = "Datalake/silver/"   # folder receiving the consolidated output
gh_data_staging = "gh-stg.csv"        # name of the staging CSV inside the silver folder

In [15]:
# Start from a clean slate: drop any staging file left over from a previous run.
try:
    os.remove(path_to_silver + gh_data_staging)
    print('Staging file detected. Removing...')
except FileNotFoundError:
    # A missing file (ENOENT) simply means there is nothing to clean up.
    # Every other OSError (permissions, path is a directory, ...) is not
    # handled here and propagates to the caller.
    print('Staging file not detected.')

Staging file not detected.


In [16]:
# Consolidate every raw JSON file in bronze into `commit_list`, keeping these
# data items per commit: sha | url | message | name | email | date
# (login / id are not extracted here, so each record matches the CSV headers
# used when the staging file is written).
commit_list = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and a
# stable order keeps positions inside commit_list reproducible between runs.
for file in sorted(os.listdir(path_to_bronze)):
    file_path = os.path.join(path_to_bronze, file)
    # Skip anything that is not a regular .json file (for example the
    # .ipynb_checkpoints folder Jupyter creates, or OS metadata files).
    if not (file.endswith('.json') and os.path.isfile(file_path)):
        continue

    print('Processing file: '+file)
    try:
        # Decode explicitly as UTF-8: commit messages contain non-ASCII text
        # (emoji etc.) that the platform default encoding may not handle.
        with open(file_path, encoding='utf-8') as json_file:
            # Deserialize to a list of commit objects
            commits = json.load(json_file)

        for commit in commits:
            author = commit['commit']['author']
            # Extract data items into a dictionary
            commitDict = {
                "sha": commit['sha'],
                "url": commit['url'],
                "message": commit['commit']['message'],
                "name": author['name'],
                "email": author['email'],
                "date": author['date'],
            }
            commit_list.append(commitDict)
    except (OSError, ValueError, KeyError, TypeError) as err:
        # ValueError also covers json.JSONDecodeError and UnicodeDecodeError;
        # KeyError/TypeError cover records that lack the expected structure.
        # Report the offending file, then re-raise so that later cells never
        # run against a partially filled commit_list.
        print('Something went wrong when processing raw files.')
        print('Failed on '+file+': '+repr(err))
        raise

commit_count = str(len(commit_list))
print(commit_count+' commits collected.')


Processing file: gh-data-1.json
Processing file: gh-data-10.json
Processing file: gh-data-11.json
Processing file: gh-data-12.json
Processing file: gh-data-13.json
Processing file: gh-data-14.json
Processing file: gh-data-15.json
Processing file: gh-data-16.json
Processing file: gh-data-17.json
Processing file: gh-data-18.json
Processing file: gh-data-19.json
Processing file: gh-data-2.json
Processing file: gh-data-20.json
Processing file: gh-data-21.json
Processing file: gh-data-22.json
Processing file: gh-data-23.json
Processing file: gh-data-24.json
Processing file: gh-data-3.json
Processing file: gh-data-4.json
Processing file: gh-data-5.json
Processing file: gh-data-6.json
Processing file: gh-data-7.json
Processing file: gh-data-8.json
Processing file: gh-data-9.json
2375 commits collected.


In [19]:
# Write the consolidated dataset into a csv file in Silver layer
try:
    # newline='' hands line-ending control to the csv module. Without it,
    # platforms that translate "\n" on write produce "\r\r\n" row endings and
    # alter newlines embedded in multi-line commit messages.
    with open(path_to_silver+gh_data_staging, 'w', encoding="utf-8", newline='') as f:
        headers = ['sha','url','message','name','email','date']
        # QUOTE_ALL keeps messages containing commas, quotes or newlines intact.
        w = csv.DictWriter(f, fieldnames=headers, quoting=csv.QUOTE_ALL)
        w.writeheader()
        # DictWriter raises ValueError if a record carries a key that is not
        # listed in `headers`.
        w.writerows(commit_list)
except (OSError, ValueError, csv.Error) as err:
    # Surface the cause and stop, instead of reporting a generic failure and
    # leaving a truncated staging file to be picked up unnoticed.
    print('Failed writing to file: '+path_to_silver+gh_data_staging)
    print('Reason: '+repr(err))
    raise
else:
    print('Successfully wrote to file: '+path_to_silver+gh_data_staging)

Successfully wrote to file: Datalake/silver/gh-stg.csv


In [64]:
# Spot-check a single collected record by position. The index 1466 is
# hard-coded, so this raises IndexError when fewer than 1467 commits were
# collected.
# NOTE(review): the saved output of this cell shows 'login' and 'id' keys that
# the extraction cell visible in this notebook does not add, and the execution
# count is out of sequence — looks like stale kernel state; re-run from a
# fresh kernel to confirm.
print(commit_list[1466])

{'sha': '54ac9234a8154325b13202e558c5fd8798319c7a', 'url': 'https://api.github.com/repos/apache/airflow/commits/54ac9234a8154325b13202e558c5fd8798319c7a', 'message': '📝 Update release documentation (#31631)', 'name': 'GitHub', 'email': 'noreply@github.com', 'date': '2023-05-31T18:58:14Z', 'login': 'web-flow', 'id': 19864447}
