# Load data

Since we closed the connection to our database in the [first notebook](./01-ddl.ipynb), we will first start by creating this connection again

To reproduce this step locally, you will need to generate a GitHub personal access token. Find a tutorial on how to do this [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token).

In [1]:
import mysql.connector
import os
from github import Github
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
mysql_user = os.getenv("MYSQL_USER")
mysql_pass = os.getenv("MYSQL_PASS")
gh_token = os.getenv("GH_TOKEN")

In [3]:
cnx = mysql.connector.connect(user=mysql_user, password=mysql_pass,
                              host='127.0.0.1',
                              database='final_proj')
g = Github(gh_token)

# Get data

In [4]:
for repo in g.get_user().get_repos():
    print(repo.name)

aicoe-newsletter
ai-ci
AI-Stacks-pipeline
aicoe-cd
aicoe-ci
aicoe-sre
anomaly-detection-demo-app
Awesome-Data-Science-with-Python
benchmarks-tekton
clai
common
content-pipeline
data-driven-development
disk-failure-prediction
donkeycar
edge-tekton-model
edge-tekton-pipeline
elyra-aidevsecops-tutorial
EthicalML-awesome-production-machine-learning
experiment-tracking
experiment-tracking-template
frauddetection-producer-consumer
idh-manifests
inference
inference_results_v0.5
instrumented-app-go
instrumented-app-prometheus
internal-data-hub
jpegio
jupyter-notebooks
jupyterhub-operator
kubectl-container
kubeflow
learn-katacoda
log-anomaly-detector
ludus
manage-dependencies-tutorial
matmul-pipeline-cpu
matmul-pipeline-gpu
meteor
meteor-operator
mlflow-tracking-operator
mlperf-inference-tekton-pipeline
mlperf-tekton
mlperf-training
mlperf_metric_collection
okr
olm-testing-example
olm-testing-example-fork
OpenShiftKubeAudit
openshift_kubeflow_workshop
overlays-for-ai-pipeline-tutorial
peak
peak

In [5]:
# store the repos into a list
repos = list(g.get_user().get_repos())

In [6]:
repo = repos[1]
print(repo.raw_data)

{'id': 269347258, 'node_id': 'MDEwOlJlcG9zaXRvcnkyNjkzNDcyNTg=', 'name': 'ai-ci', 'full_name': 'AICoE/ai-ci', 'private': True, 'owner': {'login': 'AICoE', 'id': 33660954, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjMzNjYwOTU0', 'avatar_url': 'https://avatars.githubusercontent.com/u/33660954?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/AICoE', 'html_url': 'https://github.com/AICoE', 'followers_url': 'https://api.github.com/users/AICoE/followers', 'following_url': 'https://api.github.com/users/AICoE/following{/other_user}', 'gists_url': 'https://api.github.com/users/AICoE/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/AICoE/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/AICoE/subscriptions', 'organizations_url': 'https://api.github.com/users/AICoE/orgs', 'repos_url': 'https://api.github.com/users/AICoE/repos', 'events_url': 'https://api.github.com/users/AICoE/events{/privacy}', 'received_events_url': 'https://api.github.com/users/AICoE/

In [7]:
# get the 'main' branch and print the raw data
branch = repo.get_branch("master")
print(branch.raw_data)
# get the first commit item form commits
commit = None
for c in repo.get_commits():
    commit = c
    break # we break here to only have the first iteration
# print the commit raw data
print(commit)

{'name': 'master', 'commit': {'sha': '9e6126b2d656a5b4d10be766c2ac26de8f0b5970', 'node_id': 'MDY6Q29tbWl0MjY5MzQ3MjU4OjllNjEyNmIyZDY1NmE1YjRkMTBiZTc2NmMyYWMyNmRlOGYwYjU5NzA=', 'commit': {'author': {'name': 'Christoph Görn', 'email': 'goern@redhat.com', 'date': '2020-06-04T12:35:39Z'}, 'committer': {'name': 'GitHub', 'email': 'noreply@github.com', 'date': '2020-06-04T12:35:39Z'}, 'message': 'Create README.md', 'tree': {'sha': 'be9ce2a6023143021a97ea7870b36e2778820d77', 'url': 'https://api.github.com/repos/AICoE/ai-ci/git/trees/be9ce2a6023143021a97ea7870b36e2778820d77'}, 'url': 'https://api.github.com/repos/AICoE/ai-ci/git/commits/9e6126b2d656a5b4d10be766c2ac26de8f0b5970', 'comment_count': 0, 'verification': {'verified': True, 'reason': 'valid', 'signature': '-----BEGIN PGP SIGNATURE-----\n\nwsBcBAABCAAQBQJe2OqbCRBK7hj4Ov3rIwAAdHIIAG9YRUGNxylsSclKatAXkiX8\n+wDS4B5naoRoARIJ/ZbAAhzXpfOJy4lOY8u+PYjYHSaBYbqhuwMX61O5sw9l3zRl\ngBbfu5EkLCduT1xQhZwBfP2IwhZ/bRSa+HPwidqkbLrVJIAiuRdphmxaHdEhIXE4\nm

In [8]:
commits_data = {
"repo_id": int(repo.id),
"commit_sha": commit.sha,
"commit_message": commit.commit.message,
"commit_author_name": commit.commit.author.name,
"commit_author_email": commit.commit.author.email,
"commit_author_date": commit.commit.author.date,
"commit_committer_name": commit.commit.committer.name,
"commit_committer_email": commit.commit.committer.email,
"commit_committer_date": commit.commit.committer.date,
"author_login": commit.author.login,
"author_id": commit.author.id,
"author_avatar_url": commit.author.avatar_url,
"author_type": commit.author.type,
"committer_login": commit.committer.login,
"committer_id": commit.committer.id,
"committer_avatar_url": commit.committer.avatar_url,
"committer_type": commit.committer.type,
"stats_addtions": commit.stats.additions,
"stats_deletions": commit.stats.deletions,
"stats_total": commit.stats.total
}

In [9]:
#file_data = {
#"id": ,
#"commit_id": ,
#"repo_id": ,
#"file_name": ,
#"addtions": ,
#"deletions": ,
#"changes": ,
#"status":
#}

In [10]:
repo_data = {
"id": repo.id,
"name": repo.name,
"owner": repo.owner.login,
"fullname": repo.full_name,
"description": repo.description,
"url": repo.url,
"pushed_date": int(repo.pushed_at.timestamp()),
"created_date": int(repo.created_at.timestamp()),
"updated_date": int(repo.updated_at.timestamp()),
"size": repo.size,
"stars": repo.stargazers_count,
"forks": repo.forks_count,
"watchers": repo.watchers_count,
"language": repo.language,
"topics": ",".join(repo.get_topics()),
}

In [11]:
branch_data = {
"id": repo.id,
"repo_id": int(repo.id),
"name": branch.name,
"commit_sha": branch.commit.sha,
"protected": int(branch.protected)
}

In [12]:
repo_columns = list(repo_data.keys())
branch_columns = list(branch_data.keys())
#file_columns = list(file_data.keys())
commits_columns = list(commits_data.keys())

In [13]:
values = [repo_data[k] for k in repo_columns]

In [14]:
query = """INSERT INTO repo {0} VALUES ({1})""".format(repo_columns, values)
print(query)

INSERT INTO repo ['id', 'name', 'owner', 'fullname', 'description', 'url', 'pushed_date', 'created_date', 'updated_date', 'size', 'stars', 'forks', 'watchers', 'language', 'topics'] VALUES ([269347258, 'ai-ci', 'AICoE', 'AICoE/ai-ci', 'This is a continuous integration of a machine learning stack.', 'https://api.github.com/repos/AICoE/ai-ci', 1591288539, 1591286255, 1592505073, 18, 0, 0, 0, None, ''])


In [15]:
#cursor.execute(query,list)

#cursor.commit()

In [16]:
cnx.close()