In [1]:
import os
from github import Github
import re
import sqlparse
import json
import time

In [2]:
def _find_closing_paren(sql_code, start_idx):
    """Return the index of the ')' that closes a '(' located just before start_idx.

    The scan starts at ``start_idx`` with a nesting depth of one. Parentheses
    inside single- or double-quoted text, ``--`` line comments and
    ``/* ... */`` block comments are ignored. A backslash causes the character
    that follows it to be skipped.

    Returns:
        The index of the matching ')' or None when the text ends (or a comment
        is left unterminated) before the parenthesis is closed.
    """
    depth = 1
    in_single_quote = False
    in_double_quote = False
    idx = start_idx
    length = len(sql_code)

    while idx < length:
        char = sql_code[idx]

        # A backslash hides the next character (e.g. an escaped quote).
        if char == '\\':
            idx += 2
            continue

        # Inside quoted text only the matching quote character matters.
        if in_single_quote or in_double_quote:
            if in_single_quote and char == "'":
                in_single_quote = False
            elif in_double_quote and char == '"':
                in_double_quote = False
            idx += 1
            continue

        # Skip comments so apostrophes/parentheses in them (e.g. "-- don't")
        # cannot disturb the quote or depth tracking.
        if sql_code.startswith('--', idx):
            newline = sql_code.find('\n', idx)
            if newline == -1:
                return None
            idx = newline + 1
            continue
        if sql_code.startswith('/*', idx):
            comment_end = sql_code.find('*/', idx + 2)
            if comment_end == -1:
                return None
            idx = comment_end + 2
            continue

        if char == "'":
            in_single_quote = True
        elif char == '"':
            in_double_quote = True
        elif char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            if depth == 0:
                return idx
        idx += 1

    return None


def extract_ctes(sql_code):
    """Extract every ``<name> AS ( ... )`` definition found in a SQL string.

    Args:
        sql_code: SQL text to scan.

    Returns:
        A list of ``{'name': ..., 'sql': ...}`` dicts in order of appearance.
        ``sql`` is the text between the opening parenthesis and its matching
        closing parenthesis, stripped of surrounding whitespace. Definitions
        whose parenthesis is never closed are left out.

    Note:
        Candidates are located with a regular expression, so any
        ``<word> AS (`` occurrence is reported, not only those that follow a
        WITH keyword.
    """
    # Captures the name and stops right after the opening parenthesis.
    cte_regex = re.compile(r"(\w+)\s+AS\s+\(", re.IGNORECASE)

    ctes = []
    for match in cte_regex.finditer(sql_code):
        start_idx = match.end()  # first character after 'AS ('
        end_idx = _find_closing_paren(sql_code, start_idx)
        if end_idx is None:
            continue
        ctes.append({
            'name': match.group(1),
            'sql': sql_code[start_idx:end_idx].strip(),  # closing ')' excluded
        })

    return ctes

In [8]:
# Build the GitHub client from the token held in the GITHUB_TOKEN environment variable
token = os.environ.get('GITHUB_TOKEN')
g = Github(token)

def _collect_repo_metadata(file):
    """Fetch the repository-level fields stored alongside every CTE of `file`.

    Each ``totalCount`` lookup and the ``get_repo`` call go to the GitHub API,
    so callers should invoke this once per repository rather than once per CTE.
    """
    file_repo = file.repository
    repo = g.get_repo(file_repo.full_name)
    return {
        'repo_path': file_repo.html_url,
        'repo_description': repo.description,
        'repo_size': repo.size,
        'num_commits': repo.get_commits().totalCount,
        'num_branches': repo.get_branches().totalCount,
        'num_prs': repo.get_pulls(state='all').totalCount,
        'repo_contributors': repo.get_contributors().totalCount,
        'repo_releases': repo.get_releases().totalCount,
        'repo_stars': file_repo.stargazers_count,
        'repo_forks': file_repo.forks_count,
    }


def search_github_for_ctes(query, language="SQL", sort="indexed", order="desc", limit=1000):
    """Search GitHub code and collect the CTEs found in the matching files.

    Args:
        query: Search string forwarded to ``g.search_code``.
        language: Language qualifier forwarded to ``g.search_code``.
        sort: Sort field forwarded to ``g.search_code``.
        order: Sort direction forwarded to ``g.search_code``.
        limit: Maximum number of result files to inspect.

    Returns:
        A list with one dict per extracted CTE, holding the CTE name and SQL
        plus file- and repository-level fields. Whatever was gathered is
        returned even when the search stops early (rate limit reached or an
        exception while iterating the results).

    Progress is printed as one ``*`` per processed file; a file that raises is
    reported and skipped.
    """
    # Search GitHub for code matching the query
    results = g.search_code(query=query, language=language, sort=sort, order=order)

    ctes_list = []
    # Repository statistics are identical for every CTE (and every file) of a
    # repository, so they are fetched once and reused instead of issuing the
    # same API requests for each match.
    repo_metadata_cache = {}

    try:
        print("searching for CTEs:")
        print("[", end='')
        for file in results[:limit]:
            try:
                # Check rate limit
                rate_limit = g.get_rate_limit().search
                if rate_limit.remaining == 0:
                    print("Rate limit exceeded. Returning results gathered so far.")
                    break

                # Get the contents of the file
                content = file.decoded_content.decode('utf-8')
                repo_full_name = file.repository.full_name

                # Split into statements with sqlparse, then extract the CTEs of each one
                file_ctes = [
                    cte
                    for statement in sqlparse.parse(content)
                    for cte in extract_ctes(str(statement))
                ]

                # Only spend API requests on repositories that actually yield CTEs
                if file_ctes:
                    if repo_full_name not in repo_metadata_cache:
                        repo_metadata_cache[repo_full_name] = _collect_repo_metadata(file)
                    repo_metadata = repo_metadata_cache[repo_full_name]

                    for cte in file_ctes:
                        ctes_list.append(
                            {
                                'cte_name': cte['name'].strip(),
                                'SQL': cte['sql'].strip(),
                                'Repo': repo_full_name,
                                'file_name': file.name,
                                'file_path': file.html_url,
                                'file_size': file.size,
                                **repo_metadata,
                            }
                        )

            except Exception as e:
                print(f"Error processing file {file.html_url}: {e}")
            print("*", end='')
        print("]")

    except Exception as e:
        print(f"Exception occurred: {e}")

    return ctes_list


schemas = [
    "Chinook",
    "Northwind",
    "AdventureWorks",
    "Sakila",
    "HR",
    "Pagila",
    "DVD Rental",
    "World",
    "Employees",
    "TPC-H",
    "IMDB",
    "FoodMart",
    "WideWorldImporters",
    "ClassicModels",
    "Airline"
]

# Prepare information for retrieval: one search request per schema.
# 'schema' holds the plain name; wrapping it in braces would build a set, which
# is not JSON-serializable and never compares equal to a value read back from JSON.
query_json = [{'query': f'language:SQL WITH {schema}', 'schema': schema} for schema in schemas]


In [27]:
# Quick check of the search on a single query, inspecting at most 10 result files
list_to_test = search_github_for_ctes('WITH IMDB', limit=10)

searching for CTEs:
[**********]


In [None]:
# Display the CTE records collected by the test search
list_to_test

In [4]:
def make_hashable(d):
    """Return a copy of `d` whose list and set values are replaced by tuples.

    Only the top-level values are inspected; every other value is carried over
    unchanged. The result can be turned into a hashable tuple of items as long
    as the remaining values are themselves hashable.
    """
    converted = {}
    for key, value in d.items():
        if isinstance(value, (list, set)):
            value = tuple(value)
        converted[key] = value
    return converted

In [5]:
def _schema_key(value):
    """Reduce a schema label to a tuple that compares equal across container types.

    A label wrapped in a list, tuple or set (for example a set that was written
    to JSON and came back as a list) yields the same key as its elements in any
    other container; a bare value yields a one-element tuple.
    """
    if isinstance(value, (list, tuple, set, frozenset)):
        # Sort by repr so the order is stable regardless of the container type.
        return tuple(sorted(value, key=repr))
    return (value,)


# Function to check if schema data is already in the file
def schema_exists_in_file(schema, file_name='combined_data.json'):
    """Tell whether any record stored in `file_name` belongs to `schema`.

    Args:
        schema: Schema label to look for; a plain value or a list/tuple/set
            wrapping it.
        file_name: Path of the JSON file holding a list of record dicts.

    Returns:
        True when a record's 'schema' entry matches `schema`, False when none
        does or when the file does not exist.
    """
    if not os.path.exists(file_name):
        return False

    with open(file_name, 'r') as file:
        data = json.load(file)

    # Compare normalized keys: JSON has no set/tuple type, so a label saved
    # from such a container is read back as a list and a strict == would
    # never match it.
    wanted = _schema_key(schema)
    return any(
        'schema' in item and _schema_key(item['schema']) == wanted
        for item in data
    )

# Function to save the combined data to the JSON file
def save_to_json(data, file_name='combined_data.json'):
    """Merge `data` into the JSON file at `file_name`, dropping duplicate records.

    Args:
        data: List of record dicts to add.
        file_name: Path of the JSON file; it is read first when it exists and
            then rewritten with the merged records.

    Records are compared through their sorted items after `make_hashable`, so
    the stored dicts have sorted keys and list/set values become JSON arrays.
    Existing records keep their position and new ones are appended after them.
    """
    if os.path.exists(file_name):
        with open(file_name, 'r') as file:
            existing_data = json.load(file)
    else:
        existing_data = []

    # Combine new and existing data, keeping the first occurrence of each
    # record so the file order stays stable from one save to the next.
    unique_data = []
    seen = set()
    for record in existing_data + data:
        key = tuple(sorted(make_hashable(record).items()))
        if key not in seen:
            seen.add(key)
            unique_data.append(dict(key))

    # The target may live in a directory that does not exist yet; without
    # this the write below raises FileNotFoundError.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the unique data to the JSON file
    with open(file_name, 'w') as file:
        json.dump(unique_data, file, indent=4)

In [6]:
def generate_github_dataset(name_of_file: str):
    """Collect CTEs for every entry of `query_json` and store them in `name_of_file`.

    For each schema that has no record in the file yet, the search is run in
    both sort directions, every record is tagged with the schema, and the
    records are merged into the file through `save_to_json`. The loop stops at
    the first schema that raises; data saved for earlier schemas stays in the
    file.

    Args:
        name_of_file: Path of the JSON file used both to detect schemas that
            were already processed and to store the results.
    """
    for query_schema_object in query_json:
        schema = query_schema_object["schema"]

        # The label may arrive wrapped in a one-element collection; unwrap it
        # so it is printed, stored and compared as the plain value.
        if isinstance(schema, (set, frozenset, list, tuple)) and len(schema) == 1:
            schema = next(iter(schema))

        # Check if the schema data is already in the file
        if schema_exists_in_file(schema, name_of_file):
            print(f'Schema {schema} already processed, skipping...')
            continue

        print(f'\nGetting data for {schema} schema')

        try:
            ctes_desc = search_github_for_ctes(query_schema_object['query'])
            ctes_asc = search_github_for_ctes(query_schema_object['query'], order='asc')
            ctes = ctes_desc + ctes_asc

            # Add the schema attribute
            for obj in ctes:
                obj['schema'] = schema

            # Save progress to JSON file; save_to_json merges with what is
            # already stored and removes duplicates (including records found
            # by both the ascending and the descending search).
            save_to_json(ctes, name_of_file)

        except Exception as e:
            print(f'Error occurred for schema {schema}: {e}')
            break  # Stop at the first failure; earlier schemas are already saved

In [None]:
# NOTE(review): in the visible cells `combined` and `name_of_file` exist only as
# locals of generate_github_dataset, so this cell would raise NameError on a
# fresh kernel unless they are defined elsewhere — confirm, or remove the cell.
# Remove duplicates by converting the list to a set of tuples
unique_list = [dict(t) for t in {tuple(sorted(make_hashable(d).items())) for d in combined}]

# Saving the CTEs to a JSON file (opening with 'w' replaces any existing content)
with open(name_of_file, 'w') as json_file:
    json.dump(unique_list, json_file, indent=4)

In [9]:
# Run the full crawl; results are merged into this JSON file after each schema
file_name = 'github/github_ctes.json'
generate_github_dataset(file_name)


Getting data for {'Chinook'} schema
searching for CTEs:
[******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/fvgm-spec/learn_dbt/pulls?state=all&per_page=1 failed with 403: Forbidden
Setting next backoff to 881.021136s


*****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'N

Request GET /repos/jasonwnc/ds2002F23/pulls?state=all&per_page=1 failed with 403: Forbidden
Setting next backoff to 785.586909s


*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/andradelucas/EstudoDBT/releases?per_page=1 failed with 403: Forbidden
Setting next backoff to 1301.749414s


************************************************************************************]
searching for CTEs:
[****************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/JakubWorek/introduction_to_databases_course/pulls?state=all&per_page=1 failed with 403: Forbidden
Setting next backoff to 1292.505598s


*************************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'AdventureWorks'} schema
searching for CTEs:
[*********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/Ann-su/bazy_danych/commits?per_page=1 failed with 403: Forbidden
Setting next backoff to 1433.588327s


*******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[*******************************************************************************************************************************************************************************************************************************************

Request GET /repos/AdrianBobowski/Bazy-danych/releases?per_page=1 failed with 403: Forbidden
Setting next backoff to 1475.311606s


***********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'Sakila'} schema
searching for CTEs:
[*****************

Request GET /repos/TimmyTonyY/DVD-Rental-Store-Analysis/branches?per_page=1 failed with 403: Forbidden
Setting next backoff to 1503.929322s


***********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repositories/17225609/contents/sql2jooq/src/test/resources/sql/mysql-sakila-schema.sql?ref=ef9a8d809661a707b3ee41860fc258d01d272579 failed with 403: Forbidden
Setting next backoff to 1429.233784s


************************************************************************]
searching for CTEs:
[**********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repositories/817157074/contents/src/main/resources/db/migration/V1__init_tables.sql?ref=1e77f32292d65670091ae3292d30e6eab098667c failed with 403: Forbidden
Setting next backoff to 1479.553845s


********************************************************************************************************************************************************************]

Getting data for {'HR'} schema
searching for CTEs:
[*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repositories/544709994/contents/week-05/sql-challenge-answer/statements.sql?ref=9db88a52a3a9191aa9981899a0383735c868e899 failed with 403: Forbidden
Setting next backoff to 1308.063573s


***************************************************************************************************************************************************************************************************]
searching for CTEs:
[**************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/WillemW-01/der-swiper failed with 403: Forbidden
Setting next backoff to 1237.790246s


***************************************************************************************************************************************************************************************************]

Getting data for {'Pagila'} schema
searching for CTEs:
[**************************************************************]
searching for CTEs:
[**************************************************************]

Getting data for {'DVD Rental'} schema
searching for CTEs:
[******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/pvanand07/Programming-for-Data-Science-with-Python-Nanodegree-Udacity/pulls?state=all&per_page=1 failed with 403: Forbidden
Setting next backoff to 1335.294652s


*********************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/mahab-22/supercarousel failed with 403: Forbidden
Setting next backoff to 980.109444s


**************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[***************************************************

Request GET /repositories/423974140/contents/schema/generate/12-schema_update_14.sql?ref=3a253df3838f2103ce6ed0de0a87e1bea4904820 failed with 403: Forbidden
Setting next backoff to 998.469713s


***********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'Employees'} schema
searching for CTEs:
[********************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/thirstler/benchgo/branches?per_page=1 failed with 403: Forbidden
Setting next backoff to 1311.617073s


*******************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/vinaydanidhariya/Marwadi-uni failed with 403: Forbidden
Setting next backoff to 1633.251376s


********************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/thirstler/benchgo/contributors?per_page=1 failed with 403: Forbidden
Setting next backoff to 1610.362611s


*****************************************************************************************************]
searching for CTEs:
[**************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/thirstler/benchgo/branches?per_page=1 failed with 403: Forbidden
Setting next backoff to 1612.101471s


**********************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/ktopcuoglu/gitlab-dbt-analytics/releases?per_page=1 failed with 403: Forbidden
Setting next backoff to 1601.117393s


************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/thirstler/benchgo/commits?per_page=1 failed with 403: Forbidden
Setting next backoff to 1627.707804s


******************************************************************************]

Getting data for {'TPC-H'} schema
searching for CTEs:
[*****************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[*****************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'IMDB'} schema
searching for CTEs:
[****************************************************

Request GET /repos/sushmitafordata/IMDB-assignment_SQL-/commits?per_page=1 failed with 403: Forbidden
Setting next backoff to 1473.128396s


*****************************************************************************************************************************************************************************************************************

Request GET /repos/PrakNew/UpGrad/commits?per_page=1 failed with 403: Forbidden
Setting next backoff to 1687.708218s


*******************************************

Request GET /repos/umbra-db/diamond-vldb2024/releases?per_page=1 failed with 403: Forbidden
Setting next backoff to 1785.147588s


************************************************************Error processing file https://github.com/Sahnawaj27/IMDb-Movies-Analysis/blob/0c91bfdc72f3387ecb5b46247469b6a7a34968e9/Segment%207;%20%20Recommendations.sql: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/contents#get-repository-content", "status": "404"}
*******************************************

Request GET /repos/umbra-db/diamond-vldb2024/branches?per_page=1 failed with 403: Forbidden
Setting next backoff to 1722.92976s


*******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[*****************************************************************************************

Request GET /repositories/314808046/contents/hw-4/imdb.sql?ref=0103b2d08a6173d2b84e64685d7e45732947a3ec failed with 403: Forbidden
Setting next backoff to 1083.99336s


***************************************************************************************************************************************************************************************

Request GET /repos/RUC-MSc-CS-CIT-2024/portfolio_subproject_1/contributors?per_page=1 failed with 403: Forbidden
Setting next backoff to 1681.376922s


********************************************************************************************Error processing file https://github.com/Sahnawaj27/IMDb-Movies-Analysis/blob/0c91bfdc72f3387ecb5b46247469b6a7a34968e9/Segment%207;%20%20Recommendations.sql: 404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/contents#get-repository-content", "status": "404"}
*******************************************

Request GET /repos/umbra-db/diamond-vldb2024/pulls?state=all&per_page=1 failed with 403: Forbidden
Setting next backoff to 1715.669835s


*******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'FoodMart'} schema
searching for CTEs:
[***********]
searching for CTEs:
[***********]

Getting data for {'WideWorldImporters'} schema
searching for CTEs:
[******************************************************************************************************************************************

Request GET /repositories/821849437/contents/HW5%20-%20OVER/hw_window_functions_tasks-188-334649.sql?ref=6e64aaca32b6d8e586275f7da10f4fab16735bce failed with 403: Forbidden
Setting next backoff to 1073.392335s


************************************************************************************************************************************************]
searching for CTEs:
[******************************************************************************************************************************************************************************************************************************************************************************************]

Getting data for {'ClassicModels'} schema
searching for CTEs:
[******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repos/prasitstk/terraform-aws-devops failed with 403: Forbidden
Setting next backoff to 1225.496838s


**********************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[*******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************

Request GET /repositories/727882504/contents/blog-akasa.sql?ref=dcfce379db0a97603a5e4c6afa98e1c703ab4e21 failed with 403: Forbidden
Setting next backoff to 33.500035s


******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
searching for CTEs:
[***********************************************

Request GET /repos/Yona-p/sql_cs50 failed with 403: Forbidden
Setting next backoff to 290.092441s


***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************]
