/
datamasking.sql
154 lines (126 loc) · 4.58 KB
/
datamasking.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
--
-- Copyright 2018, Andrew Lindesay
-- Distributed under the terms of the MIT License.
--
-- -------------
--
-- This script is designed to be run on an HDS database in order to scramble
-- some data so that it contains less personal data and no password hashes.
--
-- -------------
-- helper functions
CREATE OR REPLACE FUNCTION
hds_scramble_chars(str TEXT)
RETURNS TEXT
LANGUAGE PLPGSQL
AS $$
DECLARE
replace TEXT := '0123456789'
|| 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|| 'abcdefghijklmnopqrstuvwxyz';
replace_len INTEGER := length(replace);
result TEXT := '';
str_len INTEGER := length(str);
i INTEGER := 1;
BEGIN
WHILE i <= str_len LOOP
result := result ||
CASE
WHEN substr(str, i, 1) ~ '\W' THEN ' '
ELSE substr(
replace,
floor(random() * replace_len)::INTEGER,
1)
END;
i := i + 1;
END LOOP;
RETURN result;
END; $$;
CREATE OR REPLACE FUNCTION
hds_swap_user_rating_users_for_pkg_version_id(in_pkg_version_id BIGINT)
RETURNS VOID
LANGUAGE PLPGSQL
AS
$$
DECLARE
a_record RECORD;
a_user_id BIGINT;
candidate_user_count INT;
BEGIN
SELECT COUNT(*)
INTO candidate_user_count
FROM haikudepot.user_rating ur
WHERE ur.pkg_version_id = in_pkg_version_id;
-- Create a table of the possible users that might be able to replace
-- the users who have previously commented on this PkgVersion. Users
-- who have previously commented but not on this PkgVersion are
-- chosen.
CREATE TEMP TABLE candidate_users AS
SELECT u.id
FROM haikudepot.user u
WHERE 1 = 1
AND EXISTS(SELECT ur.id FROM haikudepot.user_rating ur WHERE ur.user_id = u.id)
AND NOT EXISTS(SELECT ur.id
FROM haikudepot.user_rating ur
WHERE ur.user_id = u.id AND ur.pkg_version_id = in_pkg_version_id)
LIMIT candidate_user_count;
-- In some cases it could come to pass that there are not enough
-- users who match the above criteria. In this case, choose some
-- random users from the user pool.
WHILE (SELECT COUNT(*) FROM candidate_users) < candidate_user_count
LOOP
INSERT INTO candidate_users (id)
SELECT id
FROM haikudepot.user u
WHERE 1 = 1
AND NOT EXISTS(SELECT ur2.id
FROM haikudepot.user_rating ur2
WHERE ur2.user_id = u.id
AND ur2.pkg_version_id = in_pkg_version_id)
ORDER BY random()
LIMIT 1;
END LOOP;
FOR a_record IN SELECT ur.id FROM haikudepot.user_rating ur WHERE ur.pkg_version_id = in_pkg_version_id
LOOP
SELECT id INTO a_user_id FROM candidate_users ORDER BY random() LIMIT 1;
UPDATE haikudepot.user_rating ur SET user_id = a_user_id WHERE ur.id = a_record.id;
DELETE FROM candidate_users WHERE id = a_user_id;
END LOOP;
DROP TABLE candidate_users;
END;
$$;
-- remove any captchas
DELETE FROM captcha.response;
-- remove any password change tokens
DELETE FROM haikudepot.user_password_reset_token;
-- reset all users' passwords to 'zimmer' and emails to some non-routable email
-- addresses.
UPDATE haikudepot.user SET
nickname = 'user' || substr(md5(nickname || password_salt), 1, 6)
WHERE nickname <> 'root';
UPDATE haikudepot.user SET
password_hash = 'f925e5a026bb425cc5691e101605eacaf5f05f71a10cf5a9ffb2f96828f8a0c6',
password_salt = 'cad3422ea02761f8';
UPDATE haikudepot.user SET
email = nickname || '@example.com'
WHERE email IS NOT NULL;
-- remove special rights for all users except for root so that anybody getting
-- this database dump is not able to ascertain who might have root access.
UPDATE haikudepot.user SET
is_root = false, can_manage_users = false
WHERE nickname <> 'root';
-- now scramble the text of the comments as they are also effectively user-data.
UPDATE haikudepot.user_rating SET
comment = hds_scramble_chars(comment);
-- now scramble the authors of the user rating comments by replacing the existing
-- author of a comment by another author of a different comment.
SELECT hds_swap_user_rating_users_for_pkg_version_id(pv.id)
FROM haikudepot.pkg_version pv
WHERE EXISTS(SELECT ur.id FROM haikudepot.user_rating ur WHERE ur.pkg_version_id = pv.id);
-- scramble the user who has made an edit to package data
UPDATE haikudepot.pkg_supplement_modification SET
user_description = hds_scramble_chars(user_description)
WHERE user_description IS NOT NULL;
-- clean up
DROP FUNCTION hds_scramble_chars;
DROP FUNCTION hds_swap_user_rating_users_for_pkg_version_id;