# Find the CNCF Repos Available in the Augur Database

## Connect to your database

In [13]:
import psycopg2
import pandas as pd 
import sqlalchemy as salc
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import datetime
import json
warnings.filterwarnings('ignore')

with open("config.json") as config_file:
    config = json.load(config_file)

database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

dbschema='augur_data'
engine = salc.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

### Retrieve Available Respositories

In [14]:
repolist = pd.DataFrame()

repo_query = salc.sql.text(f"""
             SELECT a.rg_name,
                a.repo_group_id,
                b.repo_name,
                b.repo_id,
                b.forked_from,
                b.repo_archived 
            FROM
                repo_groups a,
                repo b 
            WHERE
                a.repo_group_id = b.repo_group_id 
            ORDER BY
                rg_name,
                repo_name;   

    """)

repolist = pd.read_sql(repo_query, con=engine)

display(repolist)

repolist.dtypes

Unnamed: 0,rg_name,repo_group_id,repo_name,repo_id,forked_from,repo_archived
0,-,26301,test,257882,Parent not available,0.0
1,18f,25602,10x-apis-xtravaganza,125279,Parent not available,1.0
2,18f,25602,10x-dux-app,125423,Parent not available,1.0
3,18f,25602,10x-dux-vuls-eval,125425,Parent not available,1.0
4,18f,25602,10x-mel,125410,Parent not available,1.0
...,...,...,...,...,...,...
103861,zotero,25444,zotero-standalone-build,27218,Parent not available,0.0
103862,zotero,25444,zotero-word-for-mac-integration,27192,Parent not available,0.0
103863,zotero,25444,zotero-word-for-windows-integration,27198,Parent not available,0.0
103864,zotero,25444,,27230,Parent not available,0.0


rg_name           object
repo_group_id      int64
repo_name         object
repo_id            int64
forked_from       object
repo_archived    float64
dtype: object

### Extracting label data

In [15]:
sandbox = pd.DataFrame()

sandbox_query = salc.sql.text(f"""
        
        select repo_id from augur_operations.user_groups, augur_operations.user_repos where 
        augur_operations.user_groups.user_id = 2 and
        augur_operations.user_repos.group_id=augur_operations.user_groups.group_id 
        and 
        augur_operations.user_groups.group_id=166; -- sandbox 
        
    """)

sandbox = pd.read_sql(sandbox_query, con=engine)

sandbox['label_name'] = "sandbox"

sandbox.to_csv(path_or_buf='data/sandbox_label.csv')

display(sandbox)

sandbox.dtypes


incubating = pd.DataFrame()

incubating_query = salc.sql.text(f"""
        
        select repo_id from augur_operations.user_groups, augur_operations.user_repos where 
        augur_operations.user_groups.user_id = 2 and
        augur_operations.user_repos.group_id=augur_operations.user_groups.group_id 
        and 
        augur_operations.user_groups.group_id=167; -- incubating 
        
    """)

incubating = pd.read_sql(incubating_query, con=engine)

incubating['label_name'] = "incubating"

incubating.to_csv(path_or_buf='data/incubating_label.csv')

display(incubating)

incubating.dtypes

grads = pd.DataFrame()

grads_query = salc.sql.text(f"""
        
        select repo_id from augur_operations.user_groups, augur_operations.user_repos where 
        augur_operations.user_groups.user_id = 2 and
        augur_operations.user_repos.group_id=augur_operations.user_groups.group_id 
        and 
        augur_operations.user_groups.group_id=168; -- supported  
        
    """)

grads = pd.read_sql(grads_query, con=engine)

grads['label_name'] = "grads"

grads.to_csv(path_or_buf='data/grads_label.csv')

display(grads)

grads.dtypes

data_frame_list = [sandbox, incubating, grads]

label_dataset = pd.concat(data_frame_list)

display(label_dataset)

label_dataset.to_csv(path_or_buf='data/label_dataset.csv')

Unnamed: 0,repo_id,label_name
0,36656,sandbox
1,191178,sandbox
2,191179,sandbox
3,191180,sandbox
4,191181,sandbox
...,...,...
105,191283,sandbox
106,191284,sandbox
107,191285,sandbox
108,191286,sandbox


Unnamed: 0,repo_id,label_name
0,140424,incubating
1,150726,incubating
2,191146,incubating
3,191147,incubating
4,191148,incubating
5,191149,incubating
6,191151,incubating
7,191152,incubating
8,191153,incubating
9,191154,incubating


Unnamed: 0,repo_id,label_name
0,123948,grads
1,139311,grads
2,165414,grads
3,165440,grads
4,165470,grads
5,165501,grads
6,191124,grads
7,191125,grads
8,191126,grads
9,191128,grads


Unnamed: 0,repo_id,label_name
0,36656,sandbox
1,191178,sandbox
2,191179,sandbox
3,191180,sandbox
4,191181,sandbox
...,...,...
18,191139,grads
19,191140,grads
20,191141,grads
21,191142,grads
