In [1]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
import matplotlib.pyplot as plt

In [2]:
# Load model
with open('rf.pkl', 'rb') as file:
    rf = pickle.load(file)

In [3]:
# Load dataset
df=pd.read_csv('Data/90contaminants.csv')

In [4]:
# Remove asterisk in the value column
df['value'] = df['value'].str.replace('*','')

  """Entry point for launching an IPython kernel.


In [5]:
# Filter for the non-detects for testing data set
ND = df[df['value'] == 'ND']

In [6]:
# Reset index for the key to merge on
ND=ND.reset_index()

In [7]:
# Read in dependent variables
fips=pd.read_csv('Data/pwsid2fips.csv')
census = pd.read_csv("Data/Census_1982_2018.csv")

In [8]:
# Drop columns we dont need
census.drop(columns=['white','median_household_income','CPI_annual_2015'],inplace=True)

In [9]:
# Merge with utility characteristics
ND_fips = ND.merge(fips,left_on='utilityid',right_on='pwsid')

In [10]:
# Drop columns we dont need
ND_fips.drop(columns=['primacytype','duplicates','new_fips','countyname','state_fips','county_fips','fips_FIMS_MA','pwsname','eparegion','primacyagency','pwstype','activitystatus','deactivationdate','zip_code','l','pwsid','zipcode2'],inplace=True)

In [11]:
# Merge with demographics data
merged=ND_fips.merge(census,left_on=['year','fips'],right_on=['year','fips'])

In [12]:
# Observe the # of NaNs for each column
merged.isna().sum()

index                                     0
utilityid                                 0
contaminant_name                          0
date                                      0
labid                               2055940
value                                     0
year                                      0
state                                     0
fips                                      0
primarysource                        230563
populationservedcount                230563
ownertype                            298504
iswholesaler                         298504
isoutstandingperformer               298504
issourcewaterprotected               298504
serviceconnectionscount              298504
median_year_structure_built             479
total_pop                                 0
housing_density                           0
deflated_median_household_income          0
nonwhite                                  0
dtype: int64

In [13]:
# Drop the labid column
merged=merged.drop(columns=['labid']).dropna()

In [14]:
# Convert contaminant name to lowercase
merged['contaminant_name'] = merged['contaminant_name'].str.lower()

In [15]:
# Convert categorical variables to dummy variables
test=pd.concat([merged,pd.get_dummies(merged['year'],drop_first=True),
           pd.get_dummies(merged['state'],drop_first=True),
           pd.get_dummies(merged['primarysource'],drop_first=True),
           pd.get_dummies(merged['ownertype'],drop_first=True),
           pd.get_dummies(merged['isoutstandingperformer'],drop_first=True),
           pd.get_dummies(merged['issourcewaterprotected'],drop_first=True),
           pd.get_dummies(merged['contaminant_name'],drop_first=True)], axis=1)

In [16]:
# Drop columns that are not in the predictors
test.drop(columns=['utilityid','contaminant_name','date','value','year','state','primarysource','ownertype','isoutstandingperformer','issourcewaterprotected','index'],inplace=True)

In [17]:
# put the dummy variable for alachlor oxanilic acid to the last column, consistent with training data
test['AOA'] = test['alachlor oxanilic acid']
test.drop(columns=['alachlor oxanilic acid'],inplace=True)

In [18]:
# Make predictions
predictions = np.exp(rf.predict(test))

## Put predictions back to the dataset

In [19]:
# Put it in a temporary column first
merged[0] = predictions

In [20]:
# Reset the index of the main data set for merge
df=df.reset_index()

In [21]:
# Merge the predictions with the main data set on index
df=df.merge(merged[['index',0]],how='left',left_on='index',right_on='index')

In [22]:
# # Observe the # of NaNs for each column
df.isna().sum()

index                     0
utilityid                 0
contaminant_name          0
date                      0
labid               2726674
value                    36
year                      0
state                     0
0                   7968809
dtype: int64

## Make the units and values consistent across detects and predictions

In [49]:
# Split into detects and predictions
ND = df[df['value']=='ND']
D = df[df['value']!='ND']

In [24]:
# Rename the prediction column to value to be consistent with detects
ND['value'] = ND[0]
ND.drop(columns=[0],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
# Add a column to indicate that the values are predictions
ND=ND.reset_index(drop=True)
ND['is_prediction'] = pd.Series([1]*ND.shape[0])

In [27]:
# Fill in NaN for those (very little) not in the testing dataset due to missing other varaibles
ND['value'] = ND['value'].fillna(np.nan)

In [28]:
# Add a column for the detects to indicate they are not predictions
D=D.reset_index(drop=True)
D.drop(columns=[0],inplace=True)
D['is_prediction'] = pd.Series([0]*D.shape[0])

In [29]:
# Remove the asterisk in the value column
D['value'] = D['value'].str.replace('*','')

  """Entry point for launching an IPython kernel.


In [30]:
# Extract the concentration and units info
D['concentration'] = D['value'].str.extract(r'(\S*) \S*')
D['concentration'] = D['concentration'].str.replace(',','').astype(float)
D['units'] = D['value'].str.extract(r'\S* (\S*)')

In [31]:
# Observe the contaminants that are in the units of PPM
D[D['units']=='ppm']['contaminant_name'].unique()

array(['Nitrate', 'Fluoride', 'Nitrite'], dtype=object)

## Convert units to PPB for all contaminants


In [33]:
# Separate the known and NaN values
gone = ND[ND['value'].isna()]
rest = ND[ND['value'].notna()]

In [34]:
# Filter for the contaminants in units of PPT
ppt=rest[rest['contaminant_name'].isin(['2,3,7,8-TCDD (Dioxin)','17-beta-Estradiol'])]
nonppt=rest[~rest['contaminant_name'].isin(['2,3,7,8-TCDD (Dioxin)','17-beta-Estradiol'])]

In [35]:
# Convert values to PPB
ppt['value'] = ppt['value'] * 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [36]:
# Append the data back to one
rest=pd.concat([ppt,nonppt])

In [37]:
# Filter for the contaminants in units of PPM
ppm=rest[rest['contaminant_name'].isin(['Nitrate', 'Fluoride', 'Nitrite'])]
nonppm=rest[~rest['contaminant_name'].isin(['Nitrate', 'Fluoride', 'Nitrite'])]

In [38]:
# Convert values to PPB
ppm['value'] = ppm['value'] / 1000

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
# Append the data back to one
rest=pd.concat([ppm,nonppm])

In [40]:
# Append again with the NaN values
ND=pd.concat([rest,gone])

In [41]:
# Get a contaminant units table
units=D[['contaminant_name','units']].drop_duplicates().dropna()

In [42]:
# Merge with the predictions so that they have the units column like the detects
ND=ND.merge(units,how='left',left_on='contaminant_name',right_on='contaminant_name')

In [43]:
# Fill in the rest with PPB
ND['units'] = ND['units'].fillna('ppb')

In [45]:
# Rename column
D['value'] = D['concentration']
D.drop(columns=['concentration'],inplace=True)

In [46]:
# Append the predictions and detects back to one dataframe in the same order
final=pd.concat([ND,D]).sort_values('index').set_index('index')

In [47]:
final

Unnamed: 0_level_0,utilityid,contaminant_name,date,labid,value,year,state,is_prediction,units
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,NH2512010,Diquat,2010-03-24,103-407-1,0.69035,2010,NH,1,ppb
1,NH2512010,Endothall,2010-03-24,103-407-1,8.778158,2010,NH,1,ppb
2,NH2512010,Endrin,2010-03-24,103-407-1,0.040157,2010,NH,1,ppb
3,NH2512010,Endrin aldehyde,2010-03-24,103-407-1,0.514252,2010,NH,1,ppb
4,NH2512010,Ethylbenzene,2010-03-24,8753301,0.486085,2010,NH,1,ppb
...,...,...,...,...,...,...,...,...,...
27752849,WA5351550,"Radium, combined (-226 & -228)",2015-06-23,13511,0.67,2015,WA,0,pCi/L
27752850,NC0136294,"Radium, combined (-226 & -228)",2013-11-07,1311102-02,1.098783,2013,NC,1,pCi/L
27752851,WA5351550,"Radium, combined (-226 & -228)",2015-04-16,,0.53,2015,WA,0,pCi/L
27752852,NC0136294,"Radium, combined (-226 & -228)",2013-11-07,1311102-01,1.098783,2013,NC,1,pCi/L


In [None]:
# Save file
final.to_csv('Data/90contaminants_prediction.csv',index=False)