# Setup

We first need to import packages, load secret credentials, and set up a connection to MySQL.

In [1]:
import mysql.connector
import os
from github import Github
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
mysql_user = os.getenv("MYSQL_USER")
mysql_pass = os.getenv("MYSQL_PASS")
gh_token = os.getenv("GH_TOKEN")

In [3]:
cnx = mysql.connector.connect(user=mysql_user, password=mysql_pass,
                              host='127.0.0.1',
                              database='final_proj')

# Get data

In [4]:
g = Github(gh_token)
for repo in g.get_user().get_repos():
    print(repo.name)

aicoe-newsletter
ai-ci
AI-Stacks-pipeline
aicoe-cd
aicoe-ci
aicoe-sre
anomaly-detection-demo-app
Awesome-Data-Science-with-Python
benchmarks-tekton
clai
common
content-pipeline
data-driven-development
disk-failure-prediction
donkeycar
edge-tekton-model
edge-tekton-pipeline
elyra-aidevsecops-tutorial
EthicalML-awesome-production-machine-learning
experiment-tracking
experiment-tracking-template
frauddetection-producer-consumer
idh-manifests
inference
inference_results_v0.5
instrumented-app-go
instrumented-app-prometheus
internal-data-hub
jpegio
jupyter-notebooks
jupyterhub-operator
kubectl-container
kubeflow
learn-katacoda
log-anomaly-detector
ludus
manage-dependencies-tutorial
matmul-pipeline-cpu
matmul-pipeline-gpu
meteor
meteor-operator
mlflow-tracking-operator
mlperf-inference-tekton-pipeline
mlperf-tekton
mlperf-training
mlperf_metric_collection
okr
olm-testing-example
olm-testing-example-fork
OpenShiftKubeAudit
openshift_kubeflow_workshop
overlays-for-ai-pipeline-tutorial
peak
peak

In [5]:
# store the repos into a list
repos = list(g.get_user().get_repos())

In [6]:
test_repo = repos[0]
print(repo.raw_data)

{'id': 366816677, 'node_id': 'MDEwOlJlcG9zaXRvcnkzNjY4MTY2Nzc=', 'name': 'sandiego', 'full_name': 'sandiego-rh/sandiego', 'private': False, 'owner': {'login': 'sandiego-rh', 'id': 88001610, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjg4MDAxNjEw', 'avatar_url': 'https://avatars.githubusercontent.com/u/88001610?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/sandiego-rh', 'html_url': 'https://github.com/sandiego-rh', 'followers_url': 'https://api.github.com/users/sandiego-rh/followers', 'following_url': 'https://api.github.com/users/sandiego-rh/following{/other_user}', 'gists_url': 'https://api.github.com/users/sandiego-rh/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/sandiego-rh/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/sandiego-rh/subscriptions', 'organizations_url': 'https://api.github.com/users/sandiego-rh/orgs', 'repos_url': 'https://api.github.com/users/sandiego-rh/repos', 'events_url': 'https://api.github.com/users/sandiego-rh

In [7]:
# get the 'main' branch and print the raw data
branch = repo.get_branch("main")
#print(branch.raw_data)
# get the first commit item form commits
commit = None
for c in repo.get_commits():
    commit = c
    break # we break here to only have the first iteration
# print the commit raw data
#print(commit.raw_data)

In [8]:
commits_data = {
"repo_id": int(repo.id),
"commit_sha": commit.sha,
"commit_message": commit.commit.message,
"commit_author_name": commit.commit.author.name,
"commit_author_email": commit.commit.author.email,
"commit_author_date": commit.commit.author.date,
"commit_committer_name": commit.commit.committer.name,
"commit_committer_email": commit.commit.committer.email,
"commit_committer_date": commit.commit.committer.date,
"author_login": commit.author.login,
"author_id": commit.author.id,
"author_avatar_url": commit.author.avatar_url,
"author_type": commit.author.type,
"committer_login": commit.committer.login,
"committer_id": commit.committer.id,
"committer_avatar_url": commit.committer.avatar_url,
"committer_type": commit.committer.type,
"stats_addtions": commit.stats.additions,
"stats_deletions": commit.stats.deletions,
"stats_total": commit.stats.total
}

In [9]:
#file_data = {
#"id": ,
#"commit_id": ,
#"repo_id": ,
#"file_name": ,
#"addtions": ,
#"deletions": ,
#"changes": ,
#"status":
#}

In [10]:
repo_data = {
"id": repo.id,
"name": repo.name,
"owner": repo.owner.login,
"fullname": repo.full_name,
"description": repo.description,
"url": repo.url,
"pushed_date": int(repo.pushed_at.timestamp()),
"created_date": int(repo.created_at.timestamp()),
"updated_date": int(repo.updated_at.timestamp()),
"size": repo.size,
"stars": repo.stargazers_count,
"forks": repo.forks_count,
"watchers": repo.watchers_count,
"language": repo.language,
"topics": ",".join(repo.get_topics()),
}

In [11]:
branch_data = {
"id": repo.id,
"repo_id": int(repo.id),
"name": branch.name,
"commit_sha": branch.commit.sha,
"protected": int(branch.protected)
}

In [12]:
ddl = """
CREATE TABLE IF NOT EXISTS repo(
id integer PRIMARY KEY,
name VARCHAR(255) NOT NULL,
owner VARCHAR(255) NOT NULL,
fullname VARCHAR(511) NOT NULL,
description TEXT NOT NULL,
url VARCHAR(511) NOT NULL,
pushed_date INTEGER NOT NULL,
created_date INTEGER NOT NULL,
updated_date INTEGER NOT NULL,
size INTEGER NOT NULL,
stars INTEGER NOT NULL,
forks INTEGER NOT NULL,
watchers INTEGER NOT NULL,
language VARCHAR(255) NOT NULL,
topics TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS branch(
id integer PRIMARY KEY,
repo_id INTEGER NOT NULL,
name VARCHAR(255) NOT NULL,
commit_sha VARCHAR(255) NOT NULL,
protected  INTEGER NOT NULL,
FOREIGN KEY (repo_id) REFERENCES repo (id)
);
CREATE TABLE IF NOT EXISTS commits(
id integer PRIMARY KEY,
repo_id INTEGER NOT NULL,
commit_sha VARCHAR(255) NOT NULL,
commit_message TEXT NOT NULL,
commit_author_name VARCHAR(255) NOT NULL,
commit_author_email VARCHAR(255) NOT NULL,
commit_author_date INTEGER NOT NULL,
commit_committer_name VARCHAR(255) NOT NULL,
commit_committer_email VARCHAR(255) NOT NULL,
commit_committer_date INTEGER NOT NULL,
author_login VARCHAR(255)  NULL,
author_id INTEGER NULL,
author_avatar_url VARCHAR(255) NULL,
author_type VARCHAR(255) NULL,
committer_login VARCHAR(255) NULL,
committer_id INTEGER NULL,
committer_avatar_url VARCHAR(255) NULL,
committer_type VARCHAR(255) NULL,
stats_addtions INTEGER NOT NULL,
stats_deletions INTEGER NOT NULL,
stats_total INTEGER NOT NULL,
FOREIGN KEY (repo_id) REFERENCES repo (id)
);
CREATE TABLE IF NOT EXISTS commit_file(
id integer PRIMARY KEY,
commit_id INTEGER NOT NULL,
repo_id INTEGER NOT NULL,
file_name VARCHAR(255) NOT NULL,
addtions INTEGER NOT NULL,
deletions INTEGER NOT NULL,
changes INTEGER NOT NULL,
status VARCHAR(255) NOT NULL,
FOREIGN KEY (commit_id) REFERENCES commits (id),
FOREIGN KEY (repo_id) REFERENCES repo (id)
);
"""

In [13]:
cursor = cnx.cursor(dictionary=True)
cursor.execute(ddl)

In [14]:
repo_columns = list(repo_data.keys())
branch_columns = list(branch_data.keys())
#file_columns = list(file_data.keys())
commits_columns = list(commits_data.keys())

In [15]:
values = [repo_data[k] for k in repo_columns]

In [16]:
query = """INSERT INTO repo {0} VALUES ({1})""".format(repo_columns, values)
print(query)

INSERT INTO repo ['id', 'name', 'owner', 'fullname', 'description', 'url', 'pushed_date', 'created_date', 'updated_date', 'size', 'stars', 'forks', 'watchers', 'language', 'topics'] VALUES ([366816677, 'sandiego', 'sandiego-rh', 'sandiego-rh/sandiego', None, 'https://api.github.com/repos/sandiego-rh/sandiego', 1637218396, 1620858824, 1635799369, 25597, 2, 7, 2, 'Jupyter Notebook', ''])


In [17]:
cursor.execute(query,list)

cursor.commit()

ValueError: Could not process parameters

In [None]:
cnx.close()