# Charting the Course for Maji Ndogo's Water Future

## Introduction

In this third part of the integrated project, we will pull data from many different tables and apply some statistical analyses to examine the consequences of an audit report that cross-references a random sample of records.

## Notebook setup

In [1]:
# Load the sql extension
%load_ext sql

Deploy Shiny apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup


In [2]:
# Create a connection to the mysql 'md_water_services' database
# NOTE(review): the username and password are hard-coded in this URL; consider
# supplying them from outside the notebook before sharing it — TODO confirm
%sql mysql+pymysql://root:password@localhost:3306/md_water_services

From the ERD above, we can see that the visits table is the central table connecting other tables together.

- `location_id` is the **PRIMARY KEY** in the `location` table and a **FOREIGN KEY** in the `visits` table.
- `source_id` is the **PRIMARY KEY** in the `water_source` table and a **FOREIGN KEY** in the `visits` table.
- `assigned_employee_id` is the **PRIMARY KEY** in the `employee` table and a **FOREIGN KEY** in the `visits` table.

In a nutshell, the `visits` table logs **multiple** instances in which a unique `location` was visited by a unique `employee` in relation to a particular `water_source`; hence each of these three tables has a **one-to-many** relationship with the `visits` table.

However, according to the ERD, the relationship between the `visits` table and the `water_quality` table is **one-to-many**, yet by our initial understanding each record in the `visits` table should have exactly one corresponding record in the `water_quality` table. This points to a potential error in how the relationship between the two tables is represented, so we need to correct it.

![The Updated Maji Ndogo Water Services ERD!](../assets/updated_md_water_services_erd.png)

In [3]:
%%sql
# Preview the rows (visit_count = 1) that feed the combined analysis view; sources without a well_pollution row keep a NULL `results`
SELECT
    loc.province_name,
    loc.town_name,
    src.type_of_water_source,
    loc.location_type,
    src.number_of_people_served,
    vis.time_in_queue,
    pol.results
FROM visits AS vis
INNER JOIN location AS loc
    ON loc.location_id = vis.location_id
INNER JOIN water_source AS src
    ON src.source_id = vis.source_id
LEFT JOIN well_pollution AS pol
    ON pol.source_id = vis.source_id
WHERE vis.visit_count = 1;

province_name,town_name,type_of_water_source,location_type,number_of_people_served,time_in_queue,results
Sokoto,Ilanga,river,Urban,402,15,
Kilimani,Rural,well,Rural,252,0,Contaminated
Hawassa,Rural,shared_tap,Rural,542,62,
Akatsi,Lusaka,well,Urban,210,0,Contaminated
Akatsi,Rural,shared_tap,Rural,2598,28,
Kilimani,Rural,river,Rural,862,9,
Akatsi,Rural,tap_in_home_broken,Rural,496,0,
Kilimani,Rural,tap_in_home,Rural,562,0,
Hawassa,Zanzibar,well,Urban,308,0,Contaminated: Chemical
Amanzi,Dahabu,tap_in_home,Urban,556,0,


In [5]:
%%sql
# Combine visits, location, water_source and well_pollution into one view for simplified analysis.
# OR REPLACE keeps this cell re-runnable: a plain CREATE VIEW fails once the view already exists.
CREATE OR REPLACE VIEW combined_analysis_table AS
SELECT
    l.province_name,
    l.town_name,
    ws.type_of_water_source AS source_type,
    l.location_type,
    ws.number_of_people_served AS people_served,
    v.time_in_queue,
    wp.results
FROM
    visits AS v
LEFT JOIN
    well_pollution AS wp
    ON wp.source_id = v.source_id
INNER JOIN
    location AS l
    ON v.location_id = l.location_id
INNER JOIN
    water_source AS ws
    ON ws.source_id = v.source_id
WHERE
    v.visit_count = 1;

In [6]:
%%sql
# Percentage of each province's served population that relies on each water source type (province total is the denominator)
WITH province_population AS (
    SELECT
        province_name,
        SUM(people_served) AS total_ppl_serv
    FROM combined_analysis_table
    GROUP BY province_name
)
SELECT
    cat.province_name,
    ROUND(SUM(CASE cat.source_type WHEN 'river' THEN cat.people_served ELSE 0 END) * 100.0 / pp.total_ppl_serv) AS river,
    ROUND(SUM(CASE cat.source_type WHEN 'shared_tap' THEN cat.people_served ELSE 0 END) * 100.0 / pp.total_ppl_serv) AS shared_tap,
    ROUND(SUM(CASE cat.source_type WHEN 'tap_in_home' THEN cat.people_served ELSE 0 END) * 100.0 / pp.total_ppl_serv) AS tap_in_home,
    ROUND(SUM(CASE cat.source_type WHEN 'tap_in_home_broken' THEN cat.people_served ELSE 0 END) * 100.0 / pp.total_ppl_serv) AS tap_in_home_broken,
    ROUND(SUM(CASE cat.source_type WHEN 'well' THEN cat.people_served ELSE 0 END) * 100.0 / pp.total_ppl_serv) AS well
FROM combined_analysis_table AS cat
INNER JOIN province_population AS pp
    ON pp.province_name = cat.province_name
GROUP BY cat.province_name
ORDER BY cat.province_name;

province_name,river,shared_tap,tap_in_home,tap_in_home_broken,well
Akatsi,5,49,14,10,23
Amanzi,3,38,28,24,7
Hawassa,4,43,15,15,24
Kilimani,8,47,13,12,20
Sokoto,21,38,16,10,15


In [7]:
%%sql
# Percentage of each town's served population that relies on each water source type; towns are keyed by (province, town) because town names repeat across provinces
WITH town_population AS (
    SELECT
        province_name,
        town_name,
        SUM(people_served) AS total_ppl_serv
    FROM combined_analysis_table
    GROUP BY
        province_name,
        town_name
)
SELECT
    cat.province_name,
    cat.town_name,
    ROUND(SUM(CASE cat.source_type WHEN 'river' THEN cat.people_served ELSE 0 END) * 100.0 / tp.total_ppl_serv) AS river,
    ROUND(SUM(CASE cat.source_type WHEN 'shared_tap' THEN cat.people_served ELSE 0 END) * 100.0 / tp.total_ppl_serv) AS shared_tap,
    ROUND(SUM(CASE cat.source_type WHEN 'tap_in_home' THEN cat.people_served ELSE 0 END) * 100.0 / tp.total_ppl_serv) AS tap_in_home,
    ROUND(SUM(CASE cat.source_type WHEN 'tap_in_home_broken' THEN cat.people_served ELSE 0 END) * 100.0 / tp.total_ppl_serv) AS tap_in_home_broken,
    ROUND(SUM(CASE cat.source_type WHEN 'well' THEN cat.people_served ELSE 0 END) * 100.0 / tp.total_ppl_serv) AS well
FROM combined_analysis_table AS cat
INNER JOIN town_population AS tp
    ON tp.province_name = cat.province_name
    AND tp.town_name = cat.town_name
GROUP BY
    cat.province_name,
    cat.town_name
ORDER BY cat.town_name;

province_name,town_name,river,shared_tap,tap_in_home,tap_in_home_broken,well
Amanzi,Abidjan,2,53,22,19,4
Kilimani,Amara,8,22,25,16,30
Hawassa,Amina,2,14,19,24,42
Amanzi,Amina,8,24,3,56,9
Amanzi,Asmara,3,49,24,20,4
Sokoto,Bahari,21,11,36,12,20
Amanzi,Bello,3,53,20,22,3
Sokoto,Cheche,19,16,35,12,18
Amanzi,Dahabu,3,37,55,1,4
Hawassa,Deka,3,16,23,21,38


In [9]:
%%sql
# Materialise the per-town source-type percentages as a temporary table.
# The later cells query town_aggregated_water_access, and a temporary table only lives for the
# current session, so this cell has to run in every new session rather than stay commented out.
# Dropping the table first keeps the cell re-runnable within the same session.
DROP TEMPORARY TABLE IF EXISTS town_aggregated_water_access;

CREATE TEMPORARY TABLE town_aggregated_water_access AS
WITH town_totals AS (
    SELECT
        province_name,
        town_name,
        SUM(people_served) AS total_ppl_serv
    FROM
        combined_analysis_table
    GROUP BY
        province_name,
        town_name
)
SELECT
    ct.province_name,
    ct.town_name,
    ROUND(SUM(CASE WHEN ct.source_type = 'river' THEN ct.people_served ELSE 0 END) * 100.0 / tt.total_ppl_serv) AS river,
    ROUND(SUM(CASE WHEN ct.source_type = 'shared_tap' THEN ct.people_served ELSE 0 END) * 100.0 / tt.total_ppl_serv) AS shared_tap,
    ROUND(SUM(CASE WHEN ct.source_type = 'tap_in_home' THEN ct.people_served ELSE 0 END) * 100.0 / tt.total_ppl_serv) AS tap_in_home,
    ROUND(SUM(CASE WHEN ct.source_type = 'tap_in_home_broken' THEN ct.people_served ELSE 0 END) * 100.0 / tt.total_ppl_serv) AS tap_in_home_broken,
    ROUND(SUM(CASE WHEN ct.source_type = 'well' THEN ct.people_served ELSE 0 END) * 100.0 / tt.total_ppl_serv) AS well
FROM
    combined_analysis_table AS ct
INNER JOIN
    town_totals AS tt
    ON ct.province_name = tt.province_name
    AND ct.town_name = tt.town_name
GROUP BY
    ct.province_name,
    ct.town_name
ORDER BY
    ct.town_name;

In [11]:
%%sql
# List the per-town access percentages, highest river share first
SELECT
    tawa.province_name,
    tawa.town_name,
    tawa.river,
    tawa.shared_tap,
    tawa.tap_in_home,
    tawa.tap_in_home_broken
FROM town_aggregated_water_access AS tawa
ORDER BY tawa.river DESC;

province_name,town_name,river,shared_tap,tap_in_home,tap_in_home_broken
Sokoto,Rural,22,49,8,8
Sokoto,Bahari,21,11,36,12
Sokoto,Kofi,20,16,34,10
Sokoto,Cheche,19,16,35,12
Sokoto,Majengo,18,14,36,12
Sokoto,Marang,17,19,31,13
Sokoto,Ilanga,16,12,36,15
Kilimani,Rural,9,55,8,9
Kilimani,Zuri,8,71,6,11
Kilimani,Amara,8,22,25,16


In [12]:
%%sql
# List the per-town access percentages sorted alphabetically by province
SELECT
    tawa.province_name,
    tawa.town_name,
    tawa.river,
    tawa.shared_tap,
    tawa.tap_in_home,
    tawa.tap_in_home_broken
FROM town_aggregated_water_access AS tawa
ORDER BY tawa.province_name;

province_name,town_name,river,shared_tap,tap_in_home,tap_in_home_broken
Akatsi,Kintampo,2,15,31,26
Akatsi,Rural,6,59,9,5
Akatsi,Lusaka,2,17,28,28
Akatsi,Harare,2,17,28,27
Amanzi,Asmara,3,49,24,20
Amanzi,Bello,3,53,20,22
Amanzi,Dahabu,3,37,55,1
Amanzi,Amina,8,24,3,56
Amanzi,Abidjan,2,53,22,19
Amanzi,Pwani,3,53,20,21


In [15]:
%%sql
# For each town in Amanzi: of the population with an in-home tap installed, the percentage
# whose tap is broken (taps but no running water), highest first.
# NULLIF turns a zero denominator (no in-home taps at all) into an explicit NULL result
# instead of relying on the server's division-by-zero handling.
SELECT
    province_name,
    town_name,
    ROUND(tap_in_home_broken / NULLIF(tap_in_home + tap_in_home_broken, 0) * 100.0) AS pct_broken_taps
FROM
    town_aggregated_water_access
WHERE
    province_name = 'Amanzi'
ORDER BY
    pct_broken_taps DESC;

province_name,town_name,pct_broken_taps
Amanzi,Amina,95
Amanzi,Bello,52
Amanzi,Pwani,51
Amanzi,Rural,50
Amanzi,Abidjan,46
Amanzi,Asmara,45
Amanzi,Dahabu,2


In [None]:
# %%sql
# # Create a table to keep track of engineers' progress.
# # The foreign key is declared as a table-level constraint because MySQL parses but
# # ignores an inline REFERENCES clause written on the column definition itself, so the
# # inline form would silently create no foreign key at all.
# # (Cell left disabled, as before; run it once when the table does not exist yet.)
# CREATE TABLE project_progress(
#     project_id SERIAL PRIMARY KEY,
#     source_id VARCHAR(20) NOT NULL,
#     address VARCHAR(50),
#     town VARCHAR(30),
#     source_type VARCHAR(50),
#     improvement VARCHAR(50),
#     source_status VARCHAR(50) DEFAULT 'Backlog' CHECK (source_status IN ('Backlog', 'In progress', 'Complete')),
#     date_of_completion DATE,
#     comments TEXT,
#     CONSTRAINT project_progress_source_id_fk
#         FOREIGN KEY (source_id) REFERENCES water_source(source_id)
#         ON DELETE CASCADE ON UPDATE CASCADE
# );

In [17]:
%%sql
# Preview the sources that qualify for an improvement project, with their address and pollution result
SELECT
    loc.address,
    loc.province_name,
    loc.town_name,
    src.source_id,
    src.type_of_water_source,
    pol.results
FROM water_source AS src
INNER JOIN visits AS vis
    ON vis.source_id = src.source_id
INNER JOIN location AS loc
    ON loc.location_id = vis.location_id
LEFT JOIN well_pollution AS pol
    ON pol.source_id = src.source_id
WHERE vis.visit_count = 1
    AND (
        src.type_of_water_source IN ('river', 'tap_in_home_broken')
        OR (src.type_of_water_source = 'shared_tap' AND vis.time_in_queue >= 30)
        OR pol.results <> 'Clean'
    );

address,province_name,town_name,source_id,type_of_water_source,results
36 Pwani Mchangani Road,Sokoto,Ilanga,SoIl32582224,river,
129 Ziwa La Kioo Road,Kilimani,Rural,KiRu28935224,well,Contaminated
18 Mlima Tazama Avenue,Hawassa,Rural,HaRu19752224,shared_tap,
100 Mogadishu Road,Akatsi,Lusaka,AkLu01628224,well,Contaminated
26 Bahari Ya Faraja Road,Kilimani,Rural,KiRu29315224,river,
104 Kenyatta Street,Akatsi,Rural,AkRu05234224,tap_in_home_broken,
117 Kampala Road,Hawassa,Zanzibar,HaZa21742224,well,Contaminated: Chemical
55 Fennec Way,Sokoto,Rural,SoRu35008224,shared_tap,
52 Moroni Avenue,Sokoto,Rural,SoRu35703224,well,Contaminated
51 Addis Ababa Road,Akatsi,Harare,AkHa00070224,well,Contaminated: Chemical


In [19]:
# %%sql
# # View of the sources that qualify for an improvement project; the project_progress
# # INSERT further down reads from it. Rows are limited to visit_count = 1 and to sources
# # that are a well whose result is not "Clean", a river, a tap_in_home_broken, or a
# # shared_tap with time_in_queue >= 30. Unlike the preview query, it also exposes time_in_queue.
# # NOTE(review): the cell is disabled — presumably because CREATE VIEW fails once the
# # view already exists; confirm before re-enabling.
# CREATE VIEW progress_table_values AS
# SELECT
# 	location.address,
#     location.province_name,
#     location.town_name,
#     water_source.source_id,
#     water_source.type_of_water_source,
#     visits.time_in_queue,
#     well_pollution.results
# FROM
# 	water_source
# LEFT JOIN
# 	well_pollution
#     ON water_source.source_id = well_pollution.source_id

# INNER JOIN
# 	visits
#     ON water_source.source_id = visits.source_id
# INNER JOIN
# 	location
#     ON location.location_id = visits.location_id
# WHERE
# 	visits.visit_count = 1
#     AND (
# 		well_pollution.results != "Clean"
#         OR water_source.type_of_water_source IN ("river", "tap_in_home_broken")
#         OR (water_source.type_of_water_source = "shared_tap" AND visits.time_in_queue >= 30)
#     );

In [None]:
# %%sql
# # Insert relevant records in the project_progress table, deriving the recommended
# # improvement from the source type, the pollution result and the queue time.
# # For shared taps, one tap is recommended per full 30 minutes of queue time.
# # (Cell left disabled: the INSERT is not idempotent, so re-running it would add the rows again.)
# # NOTE(review): wells whose result matches neither value below (e.g. plain "Contaminated")
# # fall through to a NULL improvement — confirm that is intended.
# INSERT INTO project_progress (source_id, address, town, source_type, improvement)
# SELECT
#     source_id,
#     address,
#     town_name,
#     type_of_water_source,
#     CASE
#         WHEN type_of_water_source = "well" AND results = "Contaminated: Biological" THEN "Install UV filter"
#         WHEN type_of_water_source = "well" AND results = "Contaminated: Chemical" THEN "Install RO filter"
#         WHEN type_of_water_source = "river" THEN "Drill well"
#         WHEN type_of_water_source = "shared_tap" AND time_in_queue >= 30 THEN CONCAT("Install ", FLOOR(time_in_queue / 30), " taps nearby")
#         WHEN type_of_water_source = "tap_in_home_broken" THEN "Diagnose local infrastructure"
#         ELSE NULL
#     END AS improvement
# FROM progress_table_values;