Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(telemetry): Unique User IDs in kedro-telemetry - merge only for kedro-telemetry release 0.4.0 #596

Merged
merged 18 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion kedro-telemetry/RELEASE.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Upcoming release
# Upcoming release 0.4.0
* Updated the plugin to generate a unique UUID for each user of `kedro-telemetry`.
astrojuanlu marked this conversation as resolved.
Show resolved Hide resolved

# Release 0.3.2
* Updated plugin to share if a project is being run in a CI environment.
Expand Down
70 changes: 51 additions & 19 deletions kedro-telemetry/kedro_telemetry/plugin.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
"""Kedro Telemetry plugin for collecting Kedro usage data."""

import getpass
import hashlib
import json
import logging
import os
import sys
import uuid
from copy import deepcopy
from datetime import datetime
from pathlib import Path
Expand All @@ -15,6 +15,7 @@
import requests
import toml
import yaml
from appdirs import user_config_dir
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a new dependency?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this a new dependency?

yes, we added it to the dependencies list

from kedro import __version__ as KEDRO_VERSION
from kedro.framework.cli.cli import KedroCLI
from kedro.framework.cli.hooks import cli_hook_impl
Expand All @@ -41,6 +42,7 @@
"BUILDKITE", # https://buildkite.com/docs/pipelines/environment-variables
}
TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
CONFIG_FILENAME = "telemetry.toml"

logger = logging.getLogger(__name__)

Expand All @@ -49,15 +51,45 @@ def _hash(string: str) -> str:
return hashlib.sha512(bytes(string, encoding="utf8")).hexdigest()


def _get_hashed_username():
def _get_or_create_uuid() -> str:
"""
Reads a UUID from a configuration file or generates and saves a new one if not present.
"""
config_path = user_config_dir("kedro")
astrojuanlu marked this conversation as resolved.
Show resolved Hide resolved
full_path = os.path.join(config_path, CONFIG_FILENAME)

try:
username = getpass.getuser()
return _hash(username)
except Exception as exc:
logger.warning(
"Something went wrong with getting the username. Exception: %s",
exc,
)
if os.path.exists(full_path):
with open(full_path) as f:
config = toml.load(f)

if "telemetry" in config and "uuid" in config["telemetry"]:
return uuid.UUID(config["telemetry"]["uuid"]).hex

# Generate a new UUID and save it to the config file
new_uuid = _generate_new_uuid(full_path)

return new_uuid

except Exception as e:
logging.error(f"Failed to retrieve UUID: {e}")
return ""


def _generate_new_uuid(full_path: str) -> str:
    """Create a fresh UUID and persist it to the telemetry config file.

    The file is written as TOML with a single ``[telemetry]`` table holding
    the ``uuid`` key. Any existing file at ``full_path`` is overwritten.

    Args:
        full_path: Path of the TOML config file to write. Missing parent
            directories are created.

    Returns:
        The new UUID as a hex string, or an empty string if the UUID could
        not be written (the failure is logged, not raised).
    """
    try:
        new_uuid = uuid.uuid4().hex
        config = {"telemetry": {"uuid": new_uuid}}

        # A bare filename has an empty dirname, and os.makedirs("") raises;
        # only create the parent directory when there is one.
        parent_dir = os.path.dirname(full_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(full_path, "w", encoding="utf-8") as f:
            toml.dump(config, f)

        return new_uuid
    except Exception as e:
        # Errors are reported through the module logger and signalled to the
        # caller with an empty string instead of propagating.
        logger.error("Failed to create UUID: %s", e)
        return ""


Expand All @@ -82,25 +114,25 @@ def before_command_run(
return

# get KedroCLI and its structure from actual project root
cli = KedroCLI(project_path=Path.cwd())
cli = KedroCLI(project_path=project_metadata.project_path)
cli_struct = _get_cli_structure(cli_obj=cli, get_help=False)
masked_command_args = _mask_kedro_cli(
cli_struct=cli_struct, command_args=command_args
)
main_command = masked_command_args[0] if masked_command_args else "kedro"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't comment on the line above since it hasn't been changed but could line 117 be changed to -
cli = KedroCLI(project_path=metadata.project_path)
Following the changes in kedro-org/kedro#3683, I think this will also allow us to capture commands called from within subdirectories and mask them properly.


logger.debug("You have opted into product usage analytics.")
hashed_username = _get_hashed_username()
user_uuid = _get_or_create_uuid()
project_properties = _get_project_properties(
hashed_username, project_metadata.project_path
user_uuid, project_metadata.project_path
)
cli_properties = _format_user_cli_data(
project_properties, masked_command_args
)

_send_heap_event(
event_name=f"Command run: {main_command}",
identity=hashed_username,
identity=user_uuid,
properties=cli_properties,
)

Expand All @@ -109,7 +141,7 @@ def before_command_run(
generic_properties["main_command"] = main_command
_send_heap_event(
event_name="CLI command",
identity=hashed_username,
identity=user_uuid,
properties=generic_properties,
)
except Exception as exc:
Expand Down Expand Up @@ -141,16 +173,16 @@ def after_catalog_created(self, catalog):
logger.debug("You have opted into product usage analytics.")

default_pipeline = pipelines.get("__default__") # __default__
hashed_username = _get_hashed_username()
user_uuid = _get_or_create_uuid()

project_properties = _get_project_properties(hashed_username, self.project_path)
project_properties = _get_project_properties(user_uuid, self.project_path)

project_statistics_properties = _format_project_statistics_data(
project_properties, catalog, default_pipeline, pipelines
)
_send_heap_event(
event_name="Kedro Project Statistics",
identity=hashed_username,
identity=user_uuid,
properties=project_statistics_properties,
)

Expand All @@ -163,10 +195,10 @@ def _is_known_ci_env(known_ci_env_var_keys=KNOWN_CI_ENV_VAR_KEYS):
return any(os.getenv(key) for key in known_ci_env_var_keys)


def _get_project_properties(hashed_username: str, project_path: str) -> Dict:
def _get_project_properties(user_uuid: str, project_path: str) -> Dict:
hashed_package_name = _hash(PACKAGE_NAME) if PACKAGE_NAME else "undefined"
properties = {
"username": hashed_username,
"username": user_uuid,
"package_name": hashed_package_name,
"project_version": KEDRO_VERSION,
"telemetry_version": TELEMETRY_VERSION,
Expand Down
1 change: 1 addition & 0 deletions kedro-telemetry/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ license = {text = "Apache Software License (Apache 2.0)"}
dependencies = [
"kedro>=0.18.0",
"requests~=2.20",
"appdirs>=1.4.4",
]
dynamic = ["readme", "version"]

Expand Down
56 changes: 30 additions & 26 deletions kedro-telemetry/tests/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,16 +131,16 @@ def test_before_command_run(self, mocker, fake_metadata):
mocked_anon_id.return_value = "digested"
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_hashed_username",
return_value="hashed_username",
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
command_args = ["--version"]
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "hashed_username",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -157,12 +157,12 @@ def test_before_command_run(self, mocker, fake_metadata):
expected_calls = [
mocker.call(
event_name="Command run: --version",
identity="hashed_username",
identity="user_uuid",
properties=expected_properties,
),
mocker.call(
event_name="CLI command",
identity="hashed_username",
identity="user_uuid",
properties=generic_properties,
),
]
Expand All @@ -177,8 +177,8 @@ def test_before_command_run_with_tools(self, mocker, fake_metadata):
mocked_anon_id.return_value = "digested"
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_hashed_username",
return_value="hashed_username",
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
Expand All @@ -188,7 +188,7 @@ def test_before_command_run_with_tools(self, mocker, fake_metadata):
command_args = ["--version"]
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "hashed_username",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -207,12 +207,12 @@ def test_before_command_run_with_tools(self, mocker, fake_metadata):
expected_calls = [
mocker.call(
event_name="Command run: --version",
identity="hashed_username",
identity="user_uuid",
properties=expected_properties,
),
mocker.call(
event_name="CLI command",
identity="hashed_username",
identity="user_uuid",
properties=generic_properties,
),
]
Expand All @@ -226,13 +226,17 @@ def test_before_command_run_empty_args(self, mocker, fake_metadata):
mocked_anon_id = mocker.patch("kedro_telemetry.plugin._hash")
mocked_anon_id.return_value = "digested"
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
command_args = []
telemetry_hook.before_command_run(fake_metadata, command_args)
expected_properties = {
"username": "digested",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -249,12 +253,12 @@ def test_before_command_run_empty_args(self, mocker, fake_metadata):
expected_calls = [
mocker.call(
event_name="Command run: kedro",
identity="digested",
identity="user_uuid",
properties=expected_properties,
),
mocker.call(
event_name="CLI command",
identity="digested",
identity="user_uuid",
properties=generic_properties,
),
]
Expand Down Expand Up @@ -296,7 +300,7 @@ def test_before_command_run_anonymous(self, mocker, fake_metadata):
mocked_anon_id = mocker.patch("kedro_telemetry.plugin._hash")
mocked_anon_id.return_value = "digested"
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch("getpass.getuser", side_effect=Exception)
mocker.patch("builtins.open", side_effect=Exception)

mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
telemetry_hook = KedroTelemetryCLIHooks()
Expand Down Expand Up @@ -474,8 +478,8 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913
mocker.patch("kedro_telemetry.plugin._hash", return_value="digested")
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_hashed_username",
return_value="hashed_username",
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("kedro_telemetry.plugin.open")
Expand All @@ -487,7 +491,7 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913
telemetry_hook.after_catalog_created(fake_catalog)

project_properties = {
"username": "hashed_username",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -504,7 +508,7 @@ def test_after_context_created_without_kedro_run( # noqa: PLR0913

expected_call = mocker.call(
event_name="Kedro Project Statistics",
identity="hashed_username",
identity="user_uuid",
properties=expected_properties,
)

Expand All @@ -530,8 +534,8 @@ def test_after_context_created_with_kedro_run( # noqa: PLR0913
mocker.patch("kedro_telemetry.plugin._hash", return_value="digested")
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_hashed_username",
return_value="hashed_username",
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("kedro_telemetry.plugin.toml.load")
Expand All @@ -546,7 +550,7 @@ def test_after_context_created_with_kedro_run( # noqa: PLR0913
telemetry_hook.after_catalog_created(fake_catalog)

project_properties = {
"username": "hashed_username",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -563,7 +567,7 @@ def test_after_context_created_with_kedro_run( # noqa: PLR0913

expected_call = mocker.call(
event_name="Kedro Project Statistics",
identity="hashed_username",
identity="user_uuid",
properties=expected_properties,
)

Expand All @@ -589,8 +593,8 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913
mocker.patch("kedro_telemetry.plugin._hash", return_value="digested")
mocker.patch("kedro_telemetry.plugin.PACKAGE_NAME", "spaceflights")
mocker.patch(
"kedro_telemetry.plugin._get_hashed_username",
return_value="hashed_username",
"kedro_telemetry.plugin._get_or_create_uuid",
return_value="user_uuid",
)
mocked_heap_call = mocker.patch("kedro_telemetry.plugin._send_heap_event")
mocker.patch("builtins.open", mocker.mock_open(read_data=MOCK_PYPROJECT_TOOLS))
Expand All @@ -607,7 +611,7 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913
telemetry_hook.after_catalog_created(fake_catalog)

project_properties = {
"username": "hashed_username",
"username": "user_uuid",
"package_name": "digested",
"project_version": kedro_version,
"telemetry_version": TELEMETRY_VERSION,
Expand All @@ -626,7 +630,7 @@ def test_after_context_created_with_kedro_run_and_tools( # noqa: PLR0913

expected_call = mocker.call(
event_name="Kedro Project Statistics",
identity="hashed_username",
identity="user_uuid",
properties=expected_properties,
)

Expand Down