-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_cleaning.sql
125 lines (106 loc) · 2.87 KB
/
data_cleaning.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
CREATE DATABASE projects;
USE projects;
SELECT
*
FROM
hr;
-- Change the column name "id" to "emp_id" in the "hr" table
-- Set the data type of the "emp_id" column to VARCHAR(20)
-- Allow NULL values in the "emp_id" column
ALTER TABLE hr
CHANGE id emp_id VARCHAR(20) NULL;
-- check all data types
DESCRIBE hr;
-- change birthdate data type since some date are 06-04-91 or 9/14/1982
-- Change the data type of the "birthdate" column in the "hr" table to DATE
ALTER TABLE hr
MODIFY COLUMN birthdate DATE;
SELECT
birthdate
FROM
hr;
SET sql_safe_updates = 0;
-- Update the "birthdate" column in the "hr" table
-- Use a CASE statement to handle two different date formats: MM/DD/YYYY and MM-DD-YYYY
-- Convert the date values to the YYYY-MM-DD format using the DATE_FORMAT function
UPDATE hr
SET
birthdate = CASE
WHEN
birthdate LIKE '%/%'
THEN
DATE_FORMAT(STR_TO_DATE(birthdate, '%m/%d/%Y'),
'%Y-%m-%d')
WHEN
birthdate LIKE '%-%'
THEN
DATE_FORMAT(STR_TO_DATE(birthdate, '%m-%d-%Y'),
'%Y-%m-%d')
ELSE NULL
END;
-- Do the same thing for the hire date
SELECT
hire_date
FROM
hr;
UPDATE hr
SET
hire_date = CASE
WHEN
hire_date LIKE '%/%'
THEN
DATE_FORMAT(STR_TO_DATE(hire_date, '%m/%d/%Y'),
'%Y-%m-%d')
WHEN
hire_date LIKE '%-%'
THEN
DATE_FORMAT(STR_TO_DATE(hire_date, '%m-%d-%Y'),
'%Y-%m-%d')
ELSE NULL
END;
ALTER TABLE hr
MODIFY COLUMN hire_date DATE;
-- Update term date
SELECT
termdate
FROM
hr;
-- Update the "termdate" column in the "hr" table
-- Convert empty strings to NULL using the IF function
-- Convert non-empty strings to dates using the STR_TO_DATE function
-- Filter only non-NULL values using the WHERE clause
UPDATE hr
SET
termdate = IF(termdate = '',
NULL,
STR_TO_DATE(termdate, '%Y-%m-%d %H:%i:%s UTC'))
WHERE
termdate IS NOT NULL;
-- Change the data type of the "termdate" column in the "hr" table to DATE
ALTER TABLE hr
MODIFY COLUMN termdate DATE;
-- Add age column to data
ALTER TABLE hr ADD COLUMN age INT;
-- This SQL statement updates the 'age' column in the 'hr' table to reflect the current age of each employee.
-- It uses the TIMESTAMPDIFF function to calculate the number of years between the 'birthdate' column and the current date,
-- and sets the result as the new value of the 'age' column.
UPDATE hr
SET
age = TIMESTAMPDIFF(YEAR,
birthdate,
CURDATE());
SELECT
birthdate, age
FROM
hr;
SELECT
MIN(age) AS youngest, MAX(age) AS oldest
FROM
hr;
-- We only want to count poeple over 18 years old so this WHERE statement will be included in most of our queires
SELECT
COUNT(*)
FROM
hr
WHERE
age < 18;