Import relevant Python libraries.

In [27]:
import pandas as pd
import cx_Oracle
import numpy as np
import sys
import platform

Print version information.

In [28]:
# Report the runtime environment so the outputs below can be tied to
# specific interpreter and library versions.
print("Operating System {0} {1}".format(platform.system(), platform.release()))
print("Python Version {0}".format(sys.version))
print("Pandas Version {0}".format(pd.__version__))
print("Numpy Version {0}".format(np.__version__))

Operating System Windows 7
Python Version 2.7.10 (default, May 23 2015, 09:40:32) [MSC v.1500 32 bit (Intel)]
Pandas Version 0.19.1
Numpy Version 1.11.2


Import database connection.  This is kept in a separate file because it contains sensitive information.

In [30]:
# Folder that holds `oraconnection.py`; the module is kept outside the
# notebook because it contains sensitive connection information.
# A raw string keeps the Windows backslashes literal: "\G" and "\W" are not
# valid escape sequences and produce warnings on newer Python versions.
connection_filepath = r"E:\Google Drive\WORK\Groundwater Chemistry"
# Avoid growing sys.path each time this cell is re-run.
if connection_filepath not in sys.path:
    sys.path.append(connection_filepath)
import oraconnection

The cell below calls the imported `oraconnect` function and assigns the resulting database connection to a variable.

In [31]:
# Open the database connection through the local `oraconnection` helper.
# The returned object is later handed to pd.read_sql as its `con` argument.
connection = oraconnection.oraconnect()

`SDWISconst` is a list of the constituents of interest.  The percent sign is used as a wildcard because all of the parameter names have trailing spaces or other issues.

In [87]:
# LIKE patterns for the analytes to download, in download order.  Each ends
# with the SQL wildcard '%' so that names with trailing characters match.
# NOTE(review): 'PH%' also matches names starting with PHOSPHORUS/PHOSPHATE,
# so those rows are presumably fetched under two keys - confirm.
SDWISconst = [
    'PHOSPHORUS%', 'ARSENIC%', 'CALCIUM%', 'CHLORIDE%',
    'FLUORIDE%', 'IRON%', 'MAGNESIUM%', 'NITRATE%',
    'CARBONATE%', 'POTASSIUM%', 'BICARBONATE%', 'ALKA%',
    'PHOSPHATE%', 'SILICA%', 'SODIUM%', 'SULFATE%',
    'BORON%', 'BROMIDE%', 'PH%', 'TDS%',
    'CONDUCT%', 'URANIUM%', 'TEMPER%', 'HARDN%',
]

The following is the SQL query used to pull the chemistry data from the database.  It renames the fields (from `SELECT` to `FROM`), then joins the appropriate tables (from `FROM` to `WHERE`), then limits the output to Springs and Wells and orders it by the state-assigned ID code.  The `{:}` in the `WHERE` clause is a placeholder for one of the above parameters in the `SDWISconst` list.

In [88]:
# Query template for one analyte pattern.  The single `{:}` placeholder in the
# WHERE clause is filled with str.format() using an entry of `SDWISconst`.
# Quoted aliases after AS become the column names of the resulting DataFrame.
SQLtext = """
SELECT
UTV80.TINWSF.TYPE_CODE AS "SampType", 
UTV80.TINWSF.TINWSF_IS_NUMBER AS "site_no", 
UTV80.TINWSYS.TINWSYS_IS_NUMBER AS "OrgID", 
UTV80.TINWSYS.NAME AS "OrgName", 
UTV80.TSASAMPL.LAB_ASGND_ID_NUM AS "SampleID", 
UTV80.TSASAMPL.COLLLECTION_END_DT AS "sample_dt", 
UTV80.TSASAMPL.COLLCTN_END_TIME AS "sample_tm", 
UTV80.TSAANLYT.NAME AS "Param", 
UTV80.TSASAR.CONCENTRATION_MSR AS "result_va", 
UTV80.TSASAR.UOM_CODE AS "Unit", 
UTV80.TSASAR.ANALYSIS_START_DT AS "AnalysisDate", 
UTV80.TSASAR.DETECTN_LIMIT_NUM AS "MDL", 
UTV80.TSASAR.DETECTN_LIM_UOM_CD AS "MDLUnit", 
UTV80.TSAANLYT.CAS_REGISTRY_NUM AS "CAS_Reg", 
UTV80.TSASAR.TSASAR_IS_NUMBER AS "ID_NUM"

FROM UTV80.TINWSF 
JOIN UTV80.TINWSYS ON 
UTV80.TINWSF.TINWSYS_IS_NUMBER = UTV80.TINWSYS.TINWSYS_IS_NUMBER 
JOIN UTV80.TSASMPPT ON 
UTV80.TINWSF.TINWSF_IS_NUMBER = UTV80.TSASMPPT.TINWSF0IS_NUMBER
JOIN UTV80.TSASAMPL ON 
UTV80.TSASMPPT.TSASMPPT_IS_NUMBER = UTV80.TSASAMPL.TSASMPPT_IS_NUMBER 
JOIN UTV80.TSASAR ON 
UTV80.TSASAMPL.TSASAMPL_IS_NUMBER = UTV80.TSASAR.TSASAMPL_IS_NUMBER  
JOIN UTV80.TSAANLYT ON 
UTV80.TSASAR.TSAANLYT_IS_NUMBER = UTV80.TSAANLYT.TSAANLYT_IS_NUMBER 
WHERE (UTV80.TINWSF.TYPE_CODE = 'SP' Or UTV80.TINWSF.TYPE_CODE = 'WL') 
AND (UTV80.TSAANLYT.NAME LIKE '{:}') 
ORDER BY UTV80.TINWSF.ST_ASGN_IDENT_CD
"""

The following script loops through the constituents of interest and downloads to csvs based on the above query, inserting the constituent name each time. It also reformats the date and time fields using Pandas (see http://strftime.org) and adds an `agency_cd` field. It generates a Pandas Dataframe for each parameter and stores it in a <a href='https://www.tutorialspoint.com/python/python_dictionary.htm'>dictionary</a> with the parameter name as the key

In [90]:
# Output template for every CSV written by this notebook.  It does not change
# per constituent, so it is defined once before the loop; later cells that
# reuse `file_place` therefore work even if `SDWISconst` is empty.
file_place = 'E:/Google Drive/WORK/Groundwater Chemistry/raw_SDWIS/{:}.csv'

# One DataFrame per constituent, keyed by the pattern without its wildcard.
df_ora = {}
for j in SDWISconst:
    # Remove only the trailing '%' wildcard(s) to get a clean key / file name.
    k = j.rstrip('%')
    SQL = SQLtext.format(j)
    df_ora[k] = pd.read_sql(SQL, con=connection,
                            parse_dates=['sample_dt', 'sample_tm', 'AnalysisDate'])
    df_ora[k]['agency_cd'] = 'UDDW'
    df_ora[k].to_csv(file_place.format(k))
    # Progress indicator: the name of the constituent just saved.
    print(k)

PHOSPHORUS
ARSENIC
CALCIUM
CHLORIDE
FLUORIDE
IRON
MAGNESIUM
NITRATE
CARBONATE
POTASSIUM
BICARBONATE
ALKA
PHOSPHATE
SILICA
SODIUM
SULFATE
BORON
BROMIDE
PH
TDS
CONDUCT
URANIUM
TEMPER
HARDN


We can then use Pandas to combine all of the <a href='https://pandas.pydata.org/pandas-docs/stable/dsintro.html'>Dataframes</a> into one massive dataframe and then save it as a csv.  The reason I didn't do this in the first place (just query all of the constituents) is because the large data request tends to lag or lock up.

In [91]:
# Stack the per-constituent frames.  Concatenating a dict produces a two-level
# index (dict key, original row number); it is flattened into the columns
# 'level_0' / 'level_1', which are then discarded.
SDWISallraw = (pd.concat(df_ora)
               .reset_index()
               .drop(['level_0', 'level_1'], axis=1))
SDWISallraw.to_csv(file_place.format('all'))

I then save the resulting Dataframe in <a href='https://docs.python.org/2/library/pickle.html'>pickle</a> format as well.

In [92]:
# Reuse the CSV path template: cut off its ".csv" suffix, then fill in the
# pickle file name.
SDWISallraw.to_pickle(file_place[:-len('.csv')].format("all.pickle"))

In [93]:
# Reload the combined frame from the pickle written from `SDWISallraw`
# (same path: CSV template minus ".csv", filled with the pickle name).
SDWISallraw = pd.read_pickle(file_place[:-len('.csv')].format("all.pickle"))

We should probably preview the dataframe now to make sure it looks ok.

In [94]:
# Show the first five rows as a quick sanity check.
SDWISallraw.head(5)

Unnamed: 0,SampType,site_no,OrgID,OrgName,SampleID,sample_dt,sample_tm,Param,result_va,Unit,AnalysisDate,MDL,MDLUnit,CAS_Reg,ID_NUM,agency_cd
0,SP,10670.0,1788.0,MT STERLING RECREATION CAMP,90 02702,2009-03-19,NaT,"ALKALINITY, BICARBONATE",284.0,MG/L,NaT,0.0,,,4099358.0,UDDW
1,SP,5691.0,1098.0,ORANGEVILLE,1513215-03,2015-11-17,NaT,"ALKALINITY, TOTAL",189.0,MG/L,NaT,0.0,,,4080317.0,UDDW
2,SP,5819.0,1620.0,WASATCH RESORT,16D0366-01,2016-04-11,NaT,"ALKALINITY, TOTAL",55.0,MG/L,2016-04-14,1.0,MG/L,,4116199.0,UDDW
3,WL,9529.0,1769.0,DEEPWATER DISTRIBUTION COMPANY,1514188-01,2015-12-16,NaT,"ALKALINITY, CARBONATE",0.0,,NaT,1.0,MG/L,,4080538.0,UDDW
4,WL,8861.0,1743.0,HIDDEN HOLLOW WATER COMPANY,1513683-01,2015-12-01,NaT,"ALKALINITY, TOTAL",170.0,MG/L,2015-12-03,1.0,MG/L,,4076694.0,UDDW


Let's strip out unneeded white space to make units and parameter names more consistent.

In [95]:
def unitstrip(x):
    """
    Strip leading/trailing whitespace from a single cell value.

    PARAM
    ------
    x = cell value; normally a string, but may be None or NaN when missing

    RETURNS
    -------
    The stripped string.  Values without a `strip` method (None, NaN,
    numbers) are returned unchanged instead of raising AttributeError.
    """
    if hasattr(x, 'strip'):
        return x.strip()
    return x
    
# Strip all three text columns through the same missing-value-safe helper
# (previously `Param` called .strip() directly and would fail on a missing
# name).  Series.apply has no axis argument, so the former positional `1`
# (which landed in the `convert_dtype` slot) is dropped.
SDWISallraw['Param'] = SDWISallraw['Param'].apply(unitstrip)
SDWISallraw['Unit'] = SDWISallraw['Unit'].apply(unitstrip)
SDWISallraw['MDLUnit'] = SDWISallraw['MDLUnit'].apply(unitstrip)

Add `remark_cd` field and populate based on value in `result_va` and the reported MDL.

In [96]:
def fillmdlspot(x):
    """
    Flag zero/missing results and substitute the reported detection limit.

    PARAM
    ------
    x = sequence of exactly four values, in this order:
        result_va, Unit, MDL, MDLUnit

    RETURNS
    -------
    remark_cd, result_va, Unit

    When the result is 0, an empty string, None or NaN, returns
    ('<', MDL, MDLUnit); otherwise returns (NaN, result_va, Unit).
    """
    # Unpack by position; this works for lists, tuples and pandas rows alike
    # and avoids integer-position lookups on a labelled Series.
    result, unit, mdl, mdl_unit = x
    # pd.isnull covers both None and NaN (how pandas stores missing numbers).
    if pd.isnull(result) or result == 0 or result == '':
        return '<', mdl, mdl_unit
    return np.nan, result, unit
# Evaluate fillmdlspot once per row and keep all three of its outputs.
# Everything is computed from the untouched input columns before any of them
# is overwritten, and `Unit` is now updated as well: a result replaced by its
# MDL must carry the MDL's unit, otherwise the later unit conversion would be
# driven by the wrong unit.
_mdl_filled = [
    fillmdlspot(row)
    for row in SDWISallraw[['result_va', 'Unit', 'MDL', 'MDLUnit']].itertuples(index=False)
]
SDWISallraw['remark_cd'] = [filled[0] for filled in _mdl_filled]
SDWISallraw['result_va'] = [filled[1] for filled in _mdl_filled]
SDWISallraw['Unit'] = [filled[2] for filled in _mdl_filled]


Convert ug/L to mg/L in cases where units should be in mg/L. We have to do this for both the MDL and the results.

In [97]:
def converter(x):
    """
    Convert one (unit, value, parameter) triple to the preferred unit.

    PARAM
    ------
    x = sequence of exactly three values, in this order:
        unit, value, parameter name

    RETURNS
    -------
    value, unit

    'UG/L' values are multiplied by 0.001 and relabelled 'MG/L' unless the
    parameter is one that should stay in ug/L; 'LBS/GAL' values are multiplied
    by 119826.0 and relabelled 'MG/L'; everything else is returned unchanged.
    """
    # these are the parameters where we want the reported units to be ug/L.
    # 'BORON, TOTAL' (with a space) is the spelling that appears in the data;
    # the space-less variant is kept in case it also occurs.
    microgrammers = ['BORON, TOTAL', 'BORON,TOTAL', 'IRON', 'IRON, DISSOLVED',
                     'ARSENIC', 'BORON']
    # Unpack by position; works for lists, tuples and pandas rows alike.
    unit, value, param = x
    if unit == 'UG/L' and param not in microgrammers:
        return value * 0.001, 'MG/L'
    elif unit == 'LBS/GAL':
        return value * 119826.0, 'MG/L'
    else:
        return value, unit

In [98]:
# Convert the values first, then the unit labels.  The second pass sees the
# already-converted values, which is harmless because the unit that
# `converter` returns depends only on the old unit and the parameter name.
SDWISallraw['result_va'] = SDWISallraw[['Unit', 'result_va', 'Param']].apply(
    lambda row: converter(row)[0], axis=1)
SDWISallraw['Unit'] = SDWISallraw[['Unit', 'result_va', 'Param']].apply(
    lambda row: converter(row)[1], axis=1)

In [99]:
# Same two-pass conversion for the detection limit: values, then unit labels.
SDWISallraw['MDL'] = SDWISallraw[['MDLUnit', 'MDL', 'Param']].apply(
    lambda row: converter(row)[0], axis=1)
SDWISallraw['MDLUnit'] = SDWISallraw[['MDLUnit', 'MDL', 'Param']].apply(
    lambda row: converter(row)[1], axis=1)

Match USGS parameter codes to the named parameters.  The parameter codes can be found at the following links:<br>
https://nwis.waterdata.usgs.gov/usa/nwis/pmcodes?radio_pm_search=param_group&pm_group=All+--+include+all+parameter+groups&pm_search=&casrn_search=&srsname_search=&format=html_table&show=parameter_group_nm&show=parameter_nm&show=casrn&show=srsname&show=parameter_units<br>
https://nwis.waterdata.usgs.gov/usa/nwis/pmcodes

In [100]:
# Parameter name (as stored in the `Param` column) -> USGS parameter code.
# Conductivity uses 00095 (specific conductance at 25 C); it was previously
# given 00400, which is the pH code and made the two parameters collide.
USGSmatch = {
    'CONDUCTIVITY @ 25 C UMHOS/CM': '00095',
    'BORON': '01021',
    'ARSENIC': '01000',
    'BICARBONATE AS HCO3': '00451',
    'ALKALINITY, BICARBONATE': '00451',
    'ALKALINITY, CARBONATE': '00448',
    'CARBONATE': '00448',
    'BORON, TOTAL': '00999',
    'BROMIDE': '71870',
    'CALCIUM': '00915',
    'CALCIUM HARDNESS': '00900',
    'CHLORIDE': '00940',
    'FLUORIDE': '00950',
    'IRON': '01045',
    'IRON, DISSOLVED': '01046',
    'MAGNESIUM': '00925',
    'NITRATE-NITRITE': '00631',
    'NITRATE': '00620',
    'PH': '00400',
    'PHOSPHATE, TOTAL': '00650',
    'PHOSPHORUS, TOTAL': '00665',
    'POTASSIUM': '00935',
    'SILICA': '00955',
    'SODIUM': '00930',
    'SULFATE': '00945',
    'TDS': '70300',
    'TEMPERATURE (CENTIGRADE)': '00010',
    'ALKALINITY, TOTAL': '00421',
    'ALKALINITY, CACO3 STABILITY': '00421',
}

These have not been matched to parameter codes yet:
'URANIUM-238'

In [101]:
# List the distinct parameter names present, to compare against USGSmatch.
pd.unique(SDWISallraw['Param'])

array(['ALKALINITY, BICARBONATE', 'ALKALINITY, TOTAL',
       'ALKALINITY, CARBONATE', 'ALKALINITY, CACO3 STABILITY', 'ARSENIC',
       'BICARBONATE AS HCO3', 'BORON, TOTAL', 'BROMIDE', 'CALCIUM',
       'CALCIUM HARDNESS', 'CHLORIDE', 'CONDUCTIVITY @ 25 C UMHOS/CM',
       'FLUORIDE', 'HARDNESS, TOTAL (AS CACO3)', 'HARDNESS, CARBONATE',
       'HARDNESS, CALCIUM MAGNESIUM', 'IRON', 'IRON, DISSOLVED',
       'IRON BACTERIA ID', 'MAGNESIUM', 'NITRATE', 'NITRATE-NITRITE', 'PH',
       'PHOSPHATE, TOTAL', 'PHOSPHORUS, TOTAL', 'POTASSIUM', 'SILICA',
       'SODIUM', 'SULFATE', 'TDS', 'TEMPERATURE (CENTIGRADE)',
       'URANIUM-238', 'URANIUM-235', 'URANIUM-234'], dtype=object)

In [102]:
# Look up the USGS code for each parameter name; names missing from
# USGSmatch get an empty string.  Series.apply takes no axis argument, so the
# former stray positional `1` (which landed in `convert_dtype`) is dropped.
SDWISallraw['parm_cd'] = SDWISallraw['Param'].apply(
    lambda name: str(USGSmatch.get(name, '')))

In [103]:
# Save the cleaned table; the row index is left out of the file.
SDWISallraw.to_csv(path_or_buf=file_place.format('all_adjusted'), index=False)

In [104]:
def comb_res_rmk(x):
    """
    Build the text form of a result, prefixed by its remark when it is '<'.

    PARAM
    ------
    x[0] = remark_cd
    x[1] = result_va

    RETURNS
    -------
    '<' followed by str(result_va) when remark_cd is '<',
    otherwise just str(result_va).
    """
    remark = x[0]
    value_text = str(x[1])
    return (str(remark) + value_text) if remark == '<' else value_text


In [105]:
# Row-wise: combine the remark code and the value into one display string.
SDWISallraw['res_w_rmk'] = SDWISallraw[['remark_cd', 'result_va']].apply(
    comb_res_rmk, axis=1)

In [106]:
# Keep only the first row for each (sample, parameter) pair.
SDWIS = SDWISallraw.drop_duplicates(subset=['SampleID', 'Param'], keep='first')

In [108]:
# One row per sample, one column per parameter, cells holding the combined
# remark + result text.
SDWISpiv = SDWIS.pivot(index='SampleID', columns='Param', values='res_w_rmk')
# The pivot's index *is* the SampleID, so it must be written to the file;
# with index=False the rows could no longer be tied to a sample.
SDWISpiv.to_csv(file_place.format('by_sample'))

# USGS Format and Table

In [None]:
# Column names for the USGS pivot file: twelve descriptive fields followed by
# an 'r'-prefixed and a 'p'-prefixed column for each five-digit code.
columns = [
    'agency_cd', 'site_no', 'sample_dt', 'sample_tm',
    'sample_end_dt', 'sample_end_tm',
    'sample_start_time_datum_cd', 'tm_datum_rlbty_cd',
    'coll_ent_cd', 'medium_cd', 'tu_id', 'body_part_id',
] + list(
    prefix + code
    for code in ('00003', '00010', '00094', '00400', '00631', '00671',
                 '00900', '00915', '00925', '00930', '00935', '00940',
                 '00945', '00950', '00955', '01000', '01046', '22703',
                 '70300', '71870', '90410')
    for prefix in ('r', 'p')
)

In [None]:
# Tab-delimited USGS export, read with the column names defined in `columns`.
# NOTE(review): skiprows=12438 is hard-coded - presumably it skips the file's
# header section; confirm against the actual file before reuse.
usgs_piv_path = 'E:/Google Drive/WORK/Groundwater Chemistry/USGS_data/qwdata_pivot'
pd.read_csv(usgs_piv_path, delimiter='\t', names=columns, skiprows=12438)

In [None]:
# NOTE(review): leftover scratch cell - evaluates to a list holding a single
# empty string and is not referenced by any other visible cell.
['']

In [None]:
# Query text selecting TSAMCSMP rows with a positive field temperature or a
# positive field pH.  This only defines the string; no visible cell runs it.
# Note that it rebinds `SQL`, the name also used inside the download loop.
SQL = """
SELECT * FROM UTV80.TSAMCSMP 
WHERE 
UTV80.TSAMCSMP.FIELD_TEMP_MSR > 0 OR 
UTV80.TSAMCSMP.FIELD_PH_MEASURE > 0
"""

In [None]:
# Displays the whole dictionary of per-constituent DataFrames.
# NOTE(review): this prints every frame in full; consider showing only the keys.
df_ora