# Drinking water filtration recommendation by region

The objective is to recommend one of four types of filters based on recorded contaminants and water qualities:

1. Activated Carbon Filter
2. Ion Exchange
3. Distillation
4. Reverse Osmosis

Note: Recommendations for Orange County and San Diego are not available in the data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Lift pandas' display truncation so every column and row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Read the prepared dataset and preview its first five rows.
base = pd.read_csv('base.csv')
base.head(5)

Unnamed: 0,SAMPLE_YEAR,MCL,filters,Superior_California,North_Coast,San_Francisco,Northern_San_Joaquin_Valley,Central_Coast,Southern_San_Joaquin_Valley,Inland_Empire,Los_Angeles,"1,1-Dichloroethane","1,1,1-Trichloroethane","1,1,2-Trichloroethane","1,1,2,2-Tetrachloroethane","1,2-Dibromo-3-chloropropane (DBCP)","1,2-Dichlorobenzene","1,2-Dichloroethane","1,2-Dichloropropane","1,2,3-Trichloropropane","1,2,4-Trichlorobenzene","1,4-Dichlorobenzene","2,4,5-TP (Silvex)",Alachlor,Aluminum,Antimony,Arsenic,Atrazine,Barium,Benzene,Beryllium,BHC-gamma (Lindane),Cadmium,Carbofuran,Carbon tetrachloride,Chlordane,Chromium,"cis-1,3-Dichloropropene",Copper,Dinoseb (DNPB),Endrin,Fluoride,Glyphosate,Heptachlor,Heptachlor epoxide,Lead,Mercury,Methoxychlor,Molinate,Nickel,Nitrate,Nitrate + Nitrite,Nitrite,Oxamyl,Pentachlorophenol (PCP),Picloram,Selenium,Simazine,Strontium,Styrene,Thallium,Thiobencarb,Toluene,Toxaphene,"trans-1,3-Dichloropropene",Trichlorofluoromethane,Vinyl chloride,Groundwater,Other,Surface Water
0,2016,0.2,Activated Carbon Filter,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
1,2012,0.2,Activated Carbon Filter,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
2,2016,0.2,Activated Carbon Filter,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
3,2019,0.2,Activated Carbon Filter,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1
4,2012,0.2,Activated Carbon Filter,0,0,1,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,1


In [12]:
# Distinct filter recommendations present in the target column.
base['filters'].unique()

array(['Activated Carbon Filter', 'Ion Exchange', 'Distillation',
       'Reverse Osmosis'], dtype=object)

In [3]:
# Build the model inputs (`features`) and the target (`filters`) from `base`.

# The sampling year is not used by the model. errors='ignore' keeps this cell
# safe to re-run after the column has already been removed.
base = base.drop(columns='SAMPLE_YEAR', errors='ignore')

# Indicator (0/1) columns: region flags followed by water-source flags.
indicator_columns = [
    'Superior_California', 'North_Coast', 'San_Francisco',
    'Northern_San_Joaquin_Valley', 'Central_Coast',
    'Southern_San_Joaquin_Valley', 'Inland_Empire', 'Los_Angeles',
    'Groundwater', 'Other', 'Surface Water',
]
cats = base.filter(items=indicator_columns)

# Everything that is neither an indicator nor the target is treated as a
# continuous feature (MCL plus the per-contaminant columns).
conts = base.drop(columns=list(cats.columns) + ['filters'])

# 'Unnamed: 0' is what pandas calls a header-less CSV column; here it holds
# 0, 1, 2, ... and looks like a saved row index (TODO confirm against how
# base.csv was written). It is not a measurement, so it must not be a feature:
# left in, it leaks row order into the model and swamps distance-based
# learners. Scores computed with it included are not comparable.
conts = conts.drop(columns='Unnamed: 0', errors='ignore')

filters = base['filters']
features = pd.concat([cats, conts], axis=1)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    filters,
    test_size=0.2,
    random_state=13,
)

## KNN

In [5]:
# K-nearest-neighbours baseline using scikit-learn's default hyperparameters.
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training chain.
# NOTE(review): the inputs are passed in unscaled; KNN is distance-based, so
# confirm the feature columns are on comparable scales.
clf = KNeighborsClassifier().fit(X_train, y_train)

# Predicted filter type for every held-out row.
test_preds = clf.predict(X_test)

In [10]:
# Evaluation helpers from scikit-learn.
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score


def print_metrics(filters, preds):
    """Print precision, recall, accuracy and F1 for a set of predictions.

    Precision, recall and F1 use average='weighted'; accuracy is computed
    without an averaging argument.

    Parameters
    ----------
    filters : true labels.
    preds : predicted labels, aligned with ``filters``.
    """
    weighted = {'average': 'weighted'}
    # Each score is computed immediately before it is printed.
    print(f"Precision Score: {precision_score(filters, preds, **weighted)}")
    print(f"Recall Score: {recall_score(filters, preds, **weighted)}")
    print(f"Accuracy Score: {accuracy_score(filters, preds)}")
    print(f"F1 Score: {f1_score(filters, preds, **weighted)}")


print_metrics(y_test, test_preds)

Precision Score: 0.8964188473966851
Recall Score: 0.9045293881864228
Accuracy Score: 0.9045293881864228
F1 Score: 0.8994205647263812


## Random Forest

## SVM