Warn about catalog manifest discrepancy #91

Open · wants to merge 3 commits into base: main
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,15 @@

Recent and upcoming changes to dbt2looker

## Unreleased
### Added
- support ephemeral models (#57)
- warnings if there is a discrepancy between manifest and catalog (#5)
- more descriptive error message when a column's data type can't be inferred due to not being in the catalog

### Changed
- only non-ephemeral models _selected by tag logic_ are checked to ensure the model files are not empty (instead of all models) (#57)

## 0.11.0
### Added
- support label and hidden fields (#49)
5 changes: 3 additions & 2 deletions dbt2looker/models.py
@@ -1,5 +1,5 @@
from enum import Enum
from typing import Union, Dict, List, Optional
from typing import Any, Union, Dict, List, Optional
try:
from typing import Literal
except ImportError:
@@ -144,6 +144,7 @@ class DbtModelColumn(BaseModel):
class DbtNode(BaseModel):
unique_id: str
resource_type: str
config: Dict[str, Any]


class Dbt2LookerExploreJoin(BaseModel):
@@ -224,4 +225,4 @@ def case_insensitive_column_names(cls, v: Dict[str, DbtCatalogNodeColumn]):


class DbtCatalog(BaseModel):
nodes: Dict[str, DbtCatalogNode]
nodes: Dict[str, DbtCatalogNode]
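
The new `config` field on `DbtNode` carries the node's `config` block from `manifest.json`, which is what lets the parser skip ephemeral models in the next file. A minimal sketch of how that field is used (illustrative values, and assuming `DbtNode` has no other required fields):

```python
from dbt2looker import models

# Hypothetical node, as it would be parsed out of manifest.json
node = models.DbtNode(
    unique_id='model.my_project.stg_orders',
    resource_type='model',
    config={'materialized': 'ephemeral'},  # the node's config block from the manifest
)

# Mirrors the new filter in parse_models: ephemeral models are dropped up front
keep = node.resource_type == 'model' and node.config['materialized'] != 'ephemeral'
print(keep)  # False
```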
60 changes: 53 additions & 7 deletions dbt2looker/parser.py
@@ -31,21 +31,24 @@ def tags_match(query_tag: str, model: models.DbtModel) -> bool:

def parse_models(raw_manifest: dict, tag=None) -> List[models.DbtModel]:
manifest = models.DbtManifest(**raw_manifest)
all_models: List[models.DbtModel] = [
materialized_models: List[models.DbtModel] = [
node
for node in manifest.nodes.values()
if node.resource_type == 'model'
if node.resource_type == 'model' and node.config['materialized'] != 'ephemeral'
]

if tag is None:
selected_models = materialized_models
else:
selected_models = [model for model in materialized_models if tags_match(tag, model)]

# Empty model files have many missing parameters
for model in all_models:
for model in selected_models:
if not hasattr(model, 'name'):
logging.error('Cannot parse model with id: "%s" - is the model file empty?', model.unique_id)
raise SystemExit('Failed')

if tag is None:
return all_models
return [model for model in all_models if tags_match(tag, model)]
return selected_models


def check_models_for_missing_column_types(dbt_typed_models: List[models.DbtModel]):
@@ -54,6 +57,33 @@ def check_models_for_missing_column_types(dbt_typed_models: List[models.DbtModel]):
logging.debug('Model %s has no typed columns, no dimensions will be generated. %s', model.unique_id, model)


def compare_model_vs_node_columns(model: models.DbtModel, node: models.DbtCatalogNode):
model_columns = set(model.columns.keys()) # as defined in YML config
catalogued_columns = set(node.columns.keys()) # as defined in SQL

# if the YML and SQL columns exactly match, return early
if not model_columns.symmetric_difference(catalogued_columns):
return

if model_columns.issubset(catalogued_columns):
for undocumented_column in sorted(catalogued_columns.difference(model_columns)):
logging.warning(
f'Column {model.unique_id}.{undocumented_column} has not been documented in YML, '
'but is present in the catalog. You should add it to your YML config, '
'or (if it is not required) remove it from the model SQL file, run the model, '
'and run `dbt docs generate` again')
# after warning the user, return early
return

# otherwise, there are columns defined in YML that don't match what's defined in SQL
for missing_column in sorted(model_columns.difference(catalogued_columns)):
logging.warning(
f'Column {model.unique_id}.{missing_column} is documented in YML, '
'but is not defined in the dbt catalog. Check the model SQL file '
'and ensure you have run the model and `dbt docs generate`')
return # final return explicitly included for clarity


def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] = None):
catalog_nodes = parse_catalog_nodes(raw_catalog)
dbt_models = parse_models(raw_manifest, tag=tag)
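
The new `compare_model_vs_node_columns` check above boils down to set comparisons between the columns documented in YML and the columns dbt recorded in the catalog. A minimal sketch of the three cases using plain sets (illustrative column names rather than real model objects):

```python
model_columns = {'id', 'created_at'}                 # documented in the model's YML
catalogued_columns = {'id', 'created_at', 'amount'}  # present in catalog.json (from SQL)

if not model_columns.symmetric_difference(catalogued_columns):
    print('exact match - no warnings')
elif model_columns.issubset(catalogued_columns):
    # columns in the catalog that are not documented in YML
    for column in sorted(catalogued_columns - model_columns):
        print(f'warn: {column} is in the catalog but undocumented in YML')
else:
    # columns documented in YML that are missing from the catalog
    for column in sorted(model_columns - catalogued_columns):
        print(f'warn: {column} is documented in YML but missing from the catalog')
```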
@@ -74,6 +104,11 @@ def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] = None):
logging.warning(
f'Model {model.unique_id} not found in catalog. No looker view will be generated. '
f'Check if model has materialized in {adapter_type} at {model.relation_name}')
else:
# we know that the model is included in the catalog - extract it
corresponding_catalog_node = catalog_nodes[model.unique_id]
# issue warnings if the catalog columns (defined via SQL) don't match what's documented in YML
compare_model_vs_node_columns(model, corresponding_catalog_node)

# Update dbt models with data types from catalog
dbt_typed_models = [
@@ -92,7 +127,18 @@ def parse_typed_models(raw_manifest: dict, raw_catalog: dict, tag: Optional[str] = None):
return dbt_typed_models


class ColumnNotInCatalogError(Exception):
def __init__(self, model_id: str, column_name: str):
super().__init__(
f'Column {column_name} not found in catalog for model {model_id}, '
'cannot find a data type for Looker. Is the column selected in the model SQL file, '
'and have you run the model since adding the column to it?')


def get_column_type_from_catalog(catalog_nodes: Dict[str, models.DbtCatalogNode], model_id: str, column_name: str):
node = catalog_nodes.get(model_id)
column = None if node is None else node.columns.get(column_name)
return None if column is None else column.type
if column:
return column.type
# without a catalog entry there is no data type to map to a Looker type, so fail now with a clear message
raise ColumnNotInCatalogError(model_id, column_name)
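
With this change, a documented column that is missing from the catalog fails with a descriptive error instead of a `None` type surfacing later in Looker type mapping. A minimal sketch of the new failure path (hypothetical model and column ids; `catalog_nodes` would normally come from `parse_catalog_nodes(raw_catalog)`):

```python
from dbt2looker import parser

catalog_nodes = {}  # empty for illustration, so the lookup is guaranteed to miss

try:
    parser.get_column_type_from_catalog(
        catalog_nodes, 'model.my_project.orders', 'amount')
except parser.ColumnNotInCatalogError as exc:
    print(exc)  # names the model and column, and points at re-running the model and `dbt docs generate`
```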