<a href="https://colab.research.google.com/github/ipeirotis/introduction-to-databases/blob/master/schemas/Generate_ER_Diagrams.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate ER Diagrams from BigQuery Schema

This notebook queries BigQuery's `INFORMATION_SCHEMA` to extract table structures, primary keys, and foreign keys, then generates:

1. **Mermaid** syntax (renders directly in GitHub markdown)
2. **dbdiagram.io** syntax (for drag-and-drop positioning and PNG export)

## Setup

In [None]:
# Authenticate the current Colab user with Google Cloud.
# Run this cell first: the BigQuery calls later in the notebook are made
# after this sign-in step.
from google.colab import auth
auth.authenticate_user()

In [None]:
from google.cloud import bigquery
import pandas as pd

# Specify your Google Cloud project ID (passed to the BigQuery client below)
PROJECT_ID = 'nyu-datasets'  # <-- Replace with your project ID

# Shared BigQuery client; run_query() below sends every query through it.
client = bigquery.Client(project=PROJECT_ID)

def run_query(sql):
    """
    Execute a SQL string through the notebook's BigQuery client.

    Args:
        sql: The query text to submit.

    Returns:
        The query result converted to a pandas DataFrame.
    """
    query_job = client.query(sql)
    result_frame = query_job.to_dataframe()
    return result_frame

## Extract Schema Information from BigQuery

We query `INFORMATION_SCHEMA` to get:
- Table columns and data types
- Primary key constraints
- Foreign key constraints

In [None]:
def get_schema_info(dataset_id):
    """
    Extract schema information from a BigQuery dataset.

    Args:
        dataset_id: Dataset reference that is interpolated into the
            `INFORMATION_SCHEMA` view paths (e.g., 'project.dataset').

    Returns:
        A tuple `(columns_df, pk_df, fk_df)` of pandas DataFrames:
        - columns_df: table_name, column_name, data_type, is_nullable,
          ordinal_position (ordered by table and column position)
        - pk_df: table_name, column_name
        - fk_df: table_name, column_name, foreign_table, foreign_column

    Raises:
        Whatever `run_query` raises for the columns query. Failures of the
        primary-key and foreign-key queries are reported with a printed
        warning and replaced by an empty DataFrame with the expected columns.
    """

    def query_or_empty(sql, description, fallback_columns):
        """Run a constraint query; on failure warn and return an empty frame."""
        try:
            return run_query(sql)
        except Exception as exc:
            # Constraint lookup stays best-effort, but we no longer use a
            # bare `except:` (which also traps KeyboardInterrupt/SystemExit)
            # and we surface the reason instead of hiding it.
            print(f"Warning: could not read {description} for {dataset_id}: {exc}")
            return pd.DataFrame(columns=fallback_columns)

    # Get all columns
    columns_query = f"""
    SELECT
        table_name,
        column_name,
        data_type,
        is_nullable,
        ordinal_position
    FROM `{dataset_id}.INFORMATION_SCHEMA.COLUMNS`
    ORDER BY table_name, ordinal_position
    """
    columns_df = run_query(columns_query)

    # Get primary keys
    pk_query = f"""
    SELECT
        tc.table_name,
        kcu.column_name
    FROM `{dataset_id}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` tc
    JOIN `{dataset_id}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` kcu
        ON tc.constraint_name = kcu.constraint_name
        AND tc.table_name = kcu.table_name
    WHERE tc.constraint_type = 'PRIMARY KEY'
    """
    pk_df = query_or_empty(
        pk_query,
        'primary keys',
        ['table_name', 'column_name'],
    )

    # Get foreign keys
    # NOTE(review): ccu is joined on constraint_name only, so a multi-column
    # foreign key would pair every referencing column with every referenced
    # column. Confirm whether composite keys need positional matching.
    fk_query = f"""
    SELECT
        tc.table_name,
        kcu.column_name,
        ccu.table_name AS foreign_table,
        ccu.column_name AS foreign_column
    FROM `{dataset_id}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` tc
    JOIN `{dataset_id}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` kcu
        ON tc.constraint_name = kcu.constraint_name
        AND tc.table_name = kcu.table_name
    JOIN `{dataset_id}.INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE` ccu
        ON tc.constraint_name = ccu.constraint_name
    WHERE tc.constraint_type = 'FOREIGN KEY'
    """
    fk_df = query_or_empty(
        fk_query,
        'foreign keys',
        ['table_name', 'column_name', 'foreign_table', 'foreign_column'],
    )

    return columns_df, pk_df, fk_df

In [None]:
def simplify_data_type(data_type):
    """
    Convert BigQuery data types to simplified types for diagrams.

    Only the leading type keyword is classified, so parameters and element
    types (e.g. `NUMERIC(10, 2)`, `ARRAY<STRING>`, `STRUCT<a INT64>`) do not
    influence the result and the returned value is always a single token.

    Args:
        data_type: Type string as reported in the `data_type` column.

    Returns:
        One of 'INT', 'FLOAT', 'BOOL', 'DATE', 'TIMESTAMP', 'VARCHAR', or the
        upper-cased leading type keyword when no simplification applies.
    """
    # Reduce e.g. "STRUCT<a INT64, b STRING>" or "NUMERIC(10, 2)" to the
    # leading keyword; the remainder describes parameters or nested fields.
    base_type = data_type.upper().strip()
    for delimiter in ('<', '(', ' '):
        base_type = base_type.split(delimiter, 1)[0]

    # These would otherwise be misread by the substring checks below
    # ("INTERVAL" contains "INT") or by their element types.
    if base_type in ('ARRAY', 'STRUCT', 'RANGE', 'INTERVAL'):
        return base_type

    if 'INT' in base_type:
        return 'INT'
    if 'FLOAT' in base_type or 'NUMERIC' in base_type or 'DECIMAL' in base_type:
        return 'FLOAT'
    if 'BOOL' in base_type:
        return 'BOOL'
    # Check TIME before DATE so DATETIME (which carries a time component)
    # is not reduced to a plain DATE.
    if 'TIME' in base_type:
        return 'TIMESTAMP'
    if 'DATE' in base_type:
        return 'DATE'
    if 'STRING' in base_type or 'VARCHAR' in base_type:
        return 'VARCHAR'
    return base_type

## Generate Mermaid ER Diagram

Creates syntax that renders directly in GitHub markdown files.

In [None]:
def generate_mermaid(columns_df, pk_df, fk_df, title=None):
    """
    Generate Mermaid ER diagram syntax from schema information.

    Args:
        columns_df: DataFrame with `table_name`, `column_name` and
            `data_type` columns; one entity is emitted per table.
        pk_df: DataFrame with `table_name` and `column_name` columns
            identifying primary key columns.
        fk_df: DataFrame with `table_name`, `column_name` and
            `foreign_table` columns; one relationship line is emitted per row.
        title: Optional title. When given, it is written as a Mermaid
            comment line (`%% ...`) directly under `erDiagram`.

    Returns:
        The diagram as a single string wrapped in a ```mermaid fenced block.
    """
    # Create sets for quick lookup
    pk_set = set(zip(pk_df['table_name'], pk_df['column_name']))
    fk_set = set(zip(fk_df['table_name'], fk_df['column_name']))

    lines = ['```mermaid', 'erDiagram']
    if title:
        # Previously `title` was accepted but ignored; emit it as a comment,
        # mirroring the `// title` line that generate_dbdiagram writes.
        lines.append(f'    %% {title}')

    # Generate entity definitions, one per table
    for table_name, group in columns_df.groupby('table_name'):
        lines.append(f'    {table_name} {{')
        for col_name, raw_type in zip(group['column_name'], group['data_type']):
            data_type = simplify_data_type(raw_type)

            # A column can be both a primary key and a foreign key;
            # Mermaid accepts a comma-separated key list for that case.
            keys = []
            if (table_name, col_name) in pk_set:
                keys.append('PK')
            if (table_name, col_name) in fk_set:
                keys.append('FK')
            key_marker = f" {', '.join(keys)}" if keys else ''

            lines.append(f'        {data_type} {col_name}{key_marker}')
        lines.append('    }')
        lines.append('')

    # Generate relationships from foreign keys
    for _, row in fk_df.iterrows():
        from_table = row['table_name']
        to_table = row['foreign_table']
        # Label the relationship after the referencing column
        label = f"fk_{row['column_name']}"
        # Referenced table on the "exactly one" side, referencing table on
        # the "zero or more" side.
        lines.append(f'    {to_table} ||--o{{ {from_table} : "{label}"')

    lines.append('```')

    return '\n'.join(lines)

## Generate dbdiagram.io Syntax

Creates syntax for [dbdiagram.io](https://dbdiagram.io), where you can position tables by drag-and-drop and export the diagram to PNG.

In [None]:
def generate_dbdiagram(columns_df, pk_df, fk_df, title=None):
    """
    Build dbdiagram.io table definitions from schema information.

    Args:
        columns_df: DataFrame with `table_name`, `column_name` and
            `data_type` columns.
        pk_df: DataFrame with `table_name` and `column_name` columns
            identifying primary key columns.
        fk_df: DataFrame whose rows give `table_name`, `column_name`,
            `foreign_table` and `foreign_column`.
        title: Optional title, written as a leading `// ...` comment.

    Returns:
        The dbdiagram.io source as a single newline-joined string.
    """
    primary_keys = set(zip(pk_df['table_name'], pk_df['column_name']))

    # (table, column) -> (foreign_table, foreign_column); later rows win
    references = {
        (fk_row['table_name'], fk_row['column_name']):
            (fk_row['foreign_table'], fk_row['foreign_column'])
        for _, fk_row in fk_df.iterrows()
    }

    output = [f'// {title}', ''] if title else []

    # One "Table name { ... }" section per table
    for table_name, table_columns in columns_df.groupby('table_name'):
        output.append(f'Table {table_name} {{')

        for col_name, raw_type in zip(table_columns['column_name'],
                                      table_columns['data_type']):
            data_type = simplify_data_type(raw_type)
            column_key = (table_name, col_name)

            # Collect the bracketed column settings, if any
            settings = []
            if column_key in primary_keys:
                settings.append('pk')
            if column_key in references:
                ref_table, ref_column = references[column_key]
                settings.append(f'ref: > {ref_table}.{ref_column}')

            suffix = f" [{', '.join(settings)}]" if settings else ''
            output.append(f'  {col_name} {data_type}{suffix}')

        output.extend(['}', ''])

    return '\n'.join(output)

## Generate Diagrams for IMDB Database

In [None]:
# Extract schema for IMDB: column metadata, primary keys and foreign keys
imdb_columns, imdb_pk, imdb_fk = get_schema_info('nyu-datasets.imdb')

# List the distinct table names found in the column metadata
print("Tables found:")
print(imdb_columns['table_name'].unique())

Tables found:
['actors' 'directors' 'directors_genres' 'movies' 'movies_directors'
 'movies_genres' 'roles']


In [None]:
# View the schema (one row per table column, as returned by get_schema_info)
imdb_columns

Unnamed: 0,table_name,column_name,data_type,is_nullable,ordinal_position
0,actors,id,INT64,YES,1
1,actors,first_name,STRING,YES,2
2,actors,last_name,STRING,YES,3
3,actors,gender,STRING,YES,4
4,directors,id,INT64,YES,1
5,directors,first_name,STRING,YES,2
6,directors,last_name,STRING,YES,3
7,directors_genres,director_id,INT64,YES,1
8,directors_genres,genre,STRING,YES,2
9,directors_genres,prob,FLOAT64,YES,3


In [None]:
# View primary keys (table_name / column_name pairs)
print("Primary Keys:")
imdb_pk

Primary Keys:


Unnamed: 0,table_name,column_name
0,directors,id
1,actors,id


In [None]:
# View foreign keys (referencing table/column and the table/column referenced)
print("Foreign Keys:")
imdb_fk

Foreign Keys:


Unnamed: 0,table_name,column_name,foreign_table,foreign_column
0,roles,actor_id,actors,id
1,directors_genres,director_id,directors,id
2,movies_directors,director_id,directors,id


In [None]:
# Generate Mermaid diagram for IMDB and print the fenced markdown block
imdb_mermaid = generate_mermaid(imdb_columns, imdb_pk, imdb_fk)
print(imdb_mermaid)

```mermaid
erDiagram
    actors {
        INT id PK
        VARCHAR first_name
        VARCHAR last_name
        VARCHAR gender
    }

    directors {
        INT id PK
        VARCHAR first_name
        VARCHAR last_name
    }

    directors_genres {
        INT director_id FK
        VARCHAR genre
        FLOAT prob
    }

    movies {
        INT id
        VARCHAR name
        INT year
        FLOAT rating
    }

    movies_directors {
        INT director_id FK
        INT movie_id
    }

    movies_genres {
        INT movie_id
        VARCHAR genre
    }

    roles {
        INT actor_id FK
        INT movie_id
        VARCHAR role
    }

    actors ||--o{ roles : "fk_actor_id"
    directors ||--o{ directors_genres : "fk_director_id"
    directors ||--o{ movies_directors : "fk_director_id"
```


In [None]:
# Generate dbdiagram.io syntax for IMDB; the title becomes a leading // comment
imdb_dbdiagram = generate_dbdiagram(imdb_columns, imdb_pk, imdb_fk, title='IMDB Database')
print(imdb_dbdiagram)

// IMDB Database

Table actors {
  id INT [pk]
  first_name VARCHAR
  last_name VARCHAR
  gender VARCHAR
}

Table directors {
  id INT [pk]
  first_name VARCHAR
  last_name VARCHAR
}

Table directors_genres {
  director_id INT [ref: > directors.id]
  genre VARCHAR
  prob FLOAT
}

Table movies {
  id INT
  name VARCHAR
  year INT
  rating FLOAT
}

Table movies_directors {
  director_id INT [ref: > directors.id]
  movie_id INT
}

Table movies_genres {
  movie_id INT
  genre VARCHAR
}

Table roles {
  actor_id INT [ref: > actors.id]
  movie_id INT
  role VARCHAR
}



## Generate Diagrams for Facebook Database

In [None]:
# Extract schema for Facebook: column metadata, primary keys and foreign keys
fb_columns, fb_pk, fb_fk = get_schema_info('nyu-datasets.facebook')

# List the distinct table names found in the column metadata
print("Tables found:")
print(fb_columns['table_name'].unique())

Tables found:
['Concentration' 'FavoriteBooks' 'FavoriteMovies' 'FavoriteMusic'
 'FavoriteTVShows' 'Hobbies' 'LookingFor' 'Orientation' 'Profiles'
 'Relationship' 'user_growth_and_churn']


In [None]:
# Generate Mermaid diagram for Facebook and print the fenced markdown block
fb_mermaid = generate_mermaid(fb_columns, fb_pk, fb_fk)
print(fb_mermaid)

```mermaid
erDiagram
    Concentration {
        INT ProfileID FK
        VARCHAR Concentration
    }

    FavoriteBooks {
        INT ProfileID FK
        VARCHAR Book
    }

    FavoriteMovies {
        INT ProfileID FK
        VARCHAR Movie
    }

    FavoriteMusic {
        INT ProfileID FK
        VARCHAR Music
    }

    FavoriteTVShows {
        INT ProfileID FK
        VARCHAR TVShow
    }

    Hobbies {
        INT ProfileID FK
        VARCHAR Hobby
    }

    LookingFor {
        INT ProfileID FK
        VARCHAR LookingFor
    }

    Orientation {
        INT ProfileID FK
        VARCHAR InterestedIn
    }

    Profiles {
        INT ProfileID PK
        VARCHAR Name
        TIMESTAMP MemberSince
        TIMESTAMP LastUpdate
        VARCHAR School
        VARCHAR Status
        VARCHAR Sex
        TIMESTAMP Birthday
        VARCHAR AIM
        VARCHAR Website
        VARCHAR PoliticalViews
        VARCHAR Geography
        VARCHAR HighSchool
        VARCHAR HomeTown
        V

In [None]:
# Generate dbdiagram.io syntax for Facebook; the title becomes a leading // comment
fb_dbdiagram = generate_dbdiagram(fb_columns, fb_pk, fb_fk, title='Facebook Database')
print(fb_dbdiagram)

// Facebook Database

Table Concentration {
  ProfileID INT [ref: > Profiles.ProfileID]
  Concentration VARCHAR
}

Table FavoriteBooks {
  ProfileID INT [ref: > Profiles.ProfileID]
  Book VARCHAR
}

Table FavoriteMovies {
  ProfileID INT [ref: > Profiles.ProfileID]
  Movie VARCHAR
}

Table FavoriteMusic {
  ProfileID INT [ref: > Profiles.ProfileID]
  Music VARCHAR
}

Table FavoriteTVShows {
  ProfileID INT [ref: > Profiles.ProfileID]
  TVShow VARCHAR
}

Table Hobbies {
  ProfileID INT [ref: > Profiles.ProfileID]
  Hobby VARCHAR
}

Table LookingFor {
  ProfileID INT [ref: > Profiles.ProfileID]
  LookingFor VARCHAR
}

Table Orientation {
  ProfileID INT [ref: > Profiles.ProfileID]
  InterestedIn VARCHAR
}

Table Profiles {
  ProfileID INT [pk]
  Name VARCHAR
  MemberSince TIMESTAMP
  LastUpdate TIMESTAMP
  School VARCHAR
  Status VARCHAR
  Sex VARCHAR
  Birthday TIMESTAMP
  AIM VARCHAR
  Website VARCHAR
  PoliticalViews VARCHAR
  Geography VARCHAR
  HighSchool VARCHAR
  HomeTown VARCHAR


## Save to Files

In [None]:
def save_markdown(content, filename, title):
    """
    Save Mermaid diagram to a markdown file with title.

    The file is written as UTF-8 regardless of the platform's default
    encoding, so table or column names with non-ASCII characters are
    stored the same way everywhere.

    Args:
        content: Markdown body to write after the heading.
        filename: Path of the file to create or overwrite.
        title: Text for the level-1 heading written at the top of the file.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f'# {title}\n\n')
        f.write(content)
    print(f"Saved: {filename}")

def save_dbdiagram(content, filename):
    """
    Save dbdiagram.io syntax to a text file.

    The file is written as UTF-8 regardless of the platform's default
    encoding, so non-ASCII identifiers are stored the same way everywhere.

    Args:
        content: dbdiagram.io source text to write.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Saved: {filename}")

In [None]:
# Save IMDB diagrams: Mermaid markdown (with heading) and dbdiagram.io text
save_markdown(imdb_mermaid, 'imdb_er_diagram.md', 'IMDB Database Schema')
save_dbdiagram(imdb_dbdiagram, 'imdb_dbdiagram.txt')

Saved: imdb_er_diagram.md
Saved: imdb_dbdiagram.txt


In [None]:
# Save Facebook diagrams: Mermaid markdown (with heading) and dbdiagram.io text
save_markdown(fb_mermaid, 'facebook_er_diagram.md', 'Facebook Database Schema')
save_dbdiagram(fb_dbdiagram, 'facebook_dbdiagram.txt')

Saved: facebook_er_diagram.md
Saved: facebook_dbdiagram.txt


## Usage

### Mermaid (GitHub)
1. Copy the generated `.md` files to your GitHub repo
2. GitHub will automatically render the ER diagrams

### dbdiagram.io (for slides)
1. Go to [dbdiagram.io](https://dbdiagram.io)
2. Paste the contents of the `.txt` file
3. Drag tables to arrange the layout
4. Export as PNG for your slides

## Bonus: Generate for Any Dataset

Use this function to generate diagrams for any BigQuery dataset:

In [None]:
def generate_all_diagrams(dataset_id, output_prefix):
    """
    Extract a dataset's schema and write both diagram files for it.

    Args:
        dataset_id: Full dataset ID (e.g., 'project.dataset')
        output_prefix: Prefix for output files
            (e.g., 'imdb' -> 'imdb_er_diagram.md' and 'imdb_dbdiagram.txt')

    Returns:
        A tuple `(mermaid, dbdiagram)` with the two generated strings.
    """
    print(f"Extracting schema from {dataset_id}...")
    columns, pk, fk = get_schema_info(dataset_id)

    # Report what was found
    table_count = len(columns['table_name'].unique())
    print(f"Found {table_count} tables")
    print(f"Found {len(pk)} primary key columns")
    print(f"Found {len(fk)} foreign key relationships")

    # Title is derived from the last dotted component of the dataset ID
    dataset_name = dataset_id.split('.')[-1]
    title = dataset_name.title() + ' Database Schema'
    markdown_path = f'{output_prefix}_er_diagram.md'
    dbdiagram_path = f'{output_prefix}_dbdiagram.txt'

    # Mermaid: generate, then save with the title as heading
    mermaid = generate_mermaid(columns, pk, fk)
    save_markdown(mermaid, markdown_path, title)

    # dbdiagram.io: generate with the same title, then save
    dbdiagram = generate_dbdiagram(columns, pk, fk, title=title)
    save_dbdiagram(dbdiagram, dbdiagram_path)

    return mermaid, dbdiagram

In [None]:
# Example: Generate for a custom dataset
# mermaid, dbdiagram = generate_all_diagrams('your-project.your_dataset', 'your_dataset')