In [65]:
import pandas as pd
import numpy as np
import datetime
from numpy import dtype

In [66]:
#rootname = "E:/PROJECTS/MLSNF/Data/CHEM/"
rootname = "U:/GWP/Groundwater/UMSS_Manti/Data/"
#rootname = "/media/p/5F5B-8FCB/PROJECTS/MLSNF/Data/CHEM/" #ubuntu thumb drive

## DOGM Data

Utah Division of Oil Gas and Mining (DOGM) data derived from the <a href=http://linux1.ogm.utah.gov/WebStuff/wwwroot/wqdb.html>DOGM website</a>. Station information obtained from digitization of PDF maps provided by mining companies to DOGM. 

In [67]:
DOGM = rootname + "DOGM/UDOGM_allResults.csv"

In [68]:
DOGM_ST = rootname + "DOGM/dogm_stations.xls"

In [69]:
MAYO = rootname + "MayoChemdataV2.xlsx"

###DOGM Chemistry Data

Import and Prepare DOGM Results Data

In [70]:
parmatch = {"ACIDITY AS CACO3":"Acidity", "TOTAL ALKALINITY AS CACO3":"Alkalinity, total", 
            "DISSOLVED ALUMINUM":"Aluminum", "TOTAL ALUMINUM":"Aluminum", "AMMONIA AS N":"Ammonia-nitrogen as N", 
            "DISSOLVED ARSENIC":"Arsenic", "TOTAL ARSENIC":"Arsenic", "DISSOLVED BARIUM":"Barium", 
            "TOTAL BARIUM":"Barium", "TOTAL BERYLLIUM":"Beryllium", "BICARBONATE AS HCO3":"Bicarbonate", 
            "B.O.D. 5, MG/L":"Biochemical oxygen demand, standard conditions", "DISSOLVED BORON":"Boron", 
            "TOTAL BORON":"Boron", "BROMIDE":"Bromide", "DISSOLVED CADMIUM":"Cadmium", "TOTAL CADMIUM":"Cadmium", 
            "DISSOLVED CALCIUM":"Calcium", "TOTAL CALCIUM":"Calcium", "CARBONATE AS CO3":"Carbonate", 
            "C.O.D., MG/L":"Chemical oxygen demand", "CHLORIDE":"Chloride", "DISSOLVED CHROMIUM":"Chromium", 
            "TOTAL CHROMIUM":"Chromium", "CHROMIUM HEX, CR":"Chromium(VI)", "TOTAL COBALT":"Cobalt", 
            "SP. CONDUCTIVITY (FIELD)":"Conductivity", "SPECIFIC CONDUCTIVITY (LAB)":"Conductivity", 
            "DISSOLVED COPPER":"Copper", "TOTAL COPPER":"Copper", "CYANIDE":"Cyanide", "DEPTH":"Depth", 
            "DISSOLVED OXYGEN (FIELD)":"Dissolved oxygen (DO)", "FLOW":"Flow", "FLUORIDE":"Fluoride", 
            "TOTAL HARDNESS AS CACO3":"Hardness, Ca, Mg", "HYDROXIDE":"Hydroxide", "DISSOLVED IRON":"Iron", 
            "TOTAL IRON":"Iron", "TOTAL KIELDAHL NITROGEN, T.K.N.":"Kjeldahl nitrogen", "DISSOLVED LEAD":"Lead", 
            "TOTAL LEAD":"Lead", "DISSOLVED MAGNESIUM":"Magnesium", "TOTAL MAGNESIUM":"Magnesium", 
            "DISSOLVED MANGANESE":"Manganese", "TOTAL MANGANESE":"Manganese", "DISSOLVED MERCURY":"Mercury", 
            "TOTAL MERCURY":"Mercury", "DISSOLVED MOLYBDENUM":"Molybdenum", "TOTAL MOLYBDENUM":"Molybdenum", 
            "DISSOLVED NICKEL":"Nickel", "TOTAL NICKEL":"Nickel", "DISSOLVED NITRATE, NO3":"Nitrate", 
            "NITRATE AS N":"Nitrate as N", "DISSOLVED NITRITE, NO2":"Nitrite", "NITRITE AS N":"Nitrogen", 
            "NO2+NO3 AS N":"Nitrogen", "DISSOLVED ORTHO PHOSPH, OPO4":"Orthophosphate", 
            "ORTHO. PHOSPHATE":"Orthophosphate", "PH (FIELD)":"pH", "PH (LAB)":"pH, lab", 
            "TOTAL PHOSPHORUS":"Phosphorus", "DISSOLVED POTASSIUM":"Potassium", "DISSOLVED SELENIUM":"Selenium", 
            "TOTAL SELENIUM":"Selenium", "DISSOLVED SILICA, SIO2":"Silica", "DISSOLVED SILVER":"Silver", 
            "TOTAL SILVER":"Silver", "DISSOLVED SODIUM":"Sodium", "TOTAL SODIUM":"Sodium", 
            "SODIUM ADSORPTION RATIO":"Sodium adsorption ratio", "SULFATE":"Sulfate", "SULFIDE":"Sulfide", 
            "TOTAL ANIONS":"Sum of anions", "TOTAL CATIONS":"Sum of cations", 
            "AIR TEMPERATURE (FIELD)":"Temperature, air", "FIELD WATER TEMPERATURE":"Temperature, water", 
            "TOTAL DISSOLVED SOLIDS, @ 180 C":"Total dissolved solids", 
            "TOTAL SUSPENDED SOLIDS":"Total suspended solids", "TURBIDITY (FIELD)":"Turbidity", 
            "TURBIDITY (LAB)":"Turbidity", "TOTAL VANADIUM":"Vanadium", "DISSOLVED ZINC":"Zinc", "TOTAL ZINC":"Zinc"} 


In [71]:
dogtypes = {'Unnamed: 0':dtype('int8'),'MIN_DET': dtype('float32'), 'MINE_ID': dtype('int8'), 
            'ANAL_NAME': dtype('str_'), 'PARAM_ID': dtype('int8'), 'SAMPLE_ID': dtype('str_'), 'SITE_NAME': dtype('str_'), 
            'METHD': dtype('str_'), 'DATE_REC': dtype('str_'),'EQUALITY': dtype('str_'), 'TIME_ANAL': dtype('str_'), 
            'STATION_ID': dtype('str_'), 'PAR_DESC': dtype('str_'), 'PAR_ABB': dtype('str_'), 'ANAL_METHD': dtype('str_'), 
            'METH_DESC': dtype('str_'), 'SAMP_TYPE': dtype('str_'), 'LAB_ID': dtype('str_'), 'MINE_NAME': dtype('str_'), 
            'SITE_DESC': dtype('str_'), 'SITE_ID': dtype('int8'), 'VALUE': dtype('float32'), 'PERM_NO': dtype('str_'), 
            'LAB_NAME': dtype('str_'), 'UNITS': dtype('str_'), 'SAMP_DESC': dtype('str_'), 'DATE_ANAL': dtype('str_'), 
            'SITE_TYPE': dtype('str_'), 'LAB_CODE': dtype('str_'), 'SAMPLR_NAM': dtype('str_'), 'COMMENTS': dtype('str_')}

In [72]:
dogmr = pd.read_csv(DOGM, dtype=dogtypes, parse_dates=[11,14,15,19])

In [73]:
DResCols = {"ANAL_METHD":"AnalytMeth", "ANAL_NAME":"Param", "COMMENTS":"ResultComment", "DATE_ANAL":"AnalysisDate", 
            "DATETIME_SAMP":"SampleDate", "EQUALITY":"DetectCond", "LAB_CODE":"LabComments", "LAB_NAME":"LabName", 
            "METH_DESC":"MethodDescript", "METHD":"SampMeth", "MIN_DET":"MDL", "SAMP_TYPE":"SampMethName", 
            "SAMPLE_ID":"SampleId", "STATION_ID":"StationId", "UNITS":"Unit", "VALUE":"ResultValue"} 
dogmr.rename(columns=DResCols,inplace=True)

In [74]:
dogmr['Param'] = dogmr['PAR_DESC'].apply(lambda x: parmatch.get(x),1)

In [75]:
dogmr.dropna(how='any',subset=['StationId'],inplace=True)

In [76]:
dogmr.drop(['PARAM_ID','SAMPLR_NAM','SITE_ID','SITE_DESC','PAR_DESC','TIME_ANAL','DATE_RPT',
            'DATE_REC','Unnamed: 0', 'LAB_ID','MINE_ID','PAR_ABB','MINE_NAME', 'SITE_NAME', 'SAMP_DESC',
            'PERM_NO','SITE_TYPE'], inplace=True, axis=1)

In [296]:
mayoresults = pd.read_excel(MAYO,1)

In [78]:
dogmr.columns

Index([u'LabComments', u'DetectCond', u'ResultValue', u'Unit', u'MDL',
       u'AnalytMeth', u'AnalysisDate', u'Param', u'SampMethName',
       u'ResultComment', u'SampleDate', u'SampMeth', u'MethodDescript',
       u'LabName', u'SampleId', u'StationId'],
      dtype='object')

###DOGM Station Data

Import and Prepare DOGM Station Data

In [79]:
dogms = pd.read_excel(DOGM_ST)

In [80]:
def geoapply(dic, x):
    statlist=list(dic.keys())
    if x[0] in statlist:
        return dic.get(x[0])
    else:
        return x[1]

In [81]:
dogmgeo = {"UDOGM-02-11450":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-03-9292":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0045":"Star Point Sandstone of Mesaverde Group", "UDOGM-04-0048":"Blackhawk Formation of Mesaverde Group", "UDOGM-04-0153":"Price River Formation of Mesaverde Group", "UDOGM-04-0154":"Price River Formation of Mesaverde Group", "UDOGM-04-0155":"Price River Formation of Mesaverde Group", "UDOGM-04-0156":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0157":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0158":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0159":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0160":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0161":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0162":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0163":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0164":"North Horn Formation (Paleocene-Upper Cretaceous)", 
           "UDOGM-04-0165":"Price River Formation of Mesaverde Group", "UDOGM-04-0166":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-04-0167":"Blackhawk Formation of Mesaverde Group", "UDOGM-04-0168":"Price River Formation of Mesaverde Group", 
           "UDOGM-04-0169":"Price River Formation of Mesaverde Group", "UDOGM-04-0170":"Price River Formation of Mesaverde Group", 
           "UDOGM-04-0171":"Blackhawk Formation of Mesaverde Group", "UDOGM-04-0172":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-04-0173":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0174":"North Horn Formation (Paleocene-Upper Cretaceous)", 
           "UDOGM-04-0175":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0176":"North Horn Formation (Paleocene-Upper Cretaceous)", 
           "UDOGM-04-0177":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0178":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0179":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0180":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0181":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0182":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0183":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0184":"North Horn Formation (Paleocene-Upper Cretaceous)", 
           "UDOGM-04-0185":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0186":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0187":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0188":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0189":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0190":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0191":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0192":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0197":"Blackhawk Formation of Mesaverde Group", "UDOGM-04-0198":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-04-0282":"Valley Fill", "UDOGM-04-0283":"Valley Fill", "UDOGM-04-0284":"Valley Fill", "UDOGM-04-0285":"Valley Fill", "UDOGM-04-0286":"Valley Fill", "UDOGM-04-0287":"Valley Fill", "UDOGM-04-0288":"Valley Fill", "UDOGM-04-0289":"Valley Fill", "UDOGM-04-0290":"Valley Fill", "UDOGM-04-0291":"Valley Fill", "UDOGM-04-0292":"Valley Fill", "UDOGM-04-0293":"Valley Fill", "UDOGM-04-0294":"Valley Fill", "UDOGM-04-0295":"Valley Fill", "UDOGM-04-0296":"Valley Fill", "UDOGM-04-0297":"Valley Fill", "UDOGM-04-0298":"Valley Fill", "UDOGM-04-0299":"Valley Fill", "UDOGM-04-0300":"Valley Fill", "UDOGM-04-0301":"Valley Fill", "UDOGM-04-0302":"Valley Fill", "UDOGM-04-0303":"Valley Fill", 
           "UDOGM-04-0304":"Valley Fill", "UDOGM-04-0305":"Valley Fill", "UDOGM-04-0306":"Valley Fill", "UDOGM-04-0307":"Valley Fill", "UDOGM-04-0308":"Valley Fill", "UDOGM-04-0309":"Valley Fill", "UDOGM-04-0310":"Valley Fill", "UDOGM-04-0311":"Valley Fill", "UDOGM-04-0312":"Valley Fill", "UDOGM-04-0313":"Valley Fill", "UDOGM-04-0314":"Valley Fill", "UDOGM-04-0315":"Valley Fill", "UDOGM-04-0316":"Valley Fill", "UDOGM-05-0032":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-05-0067":"Blackhawk Formation of Mesaverde Group", "UDOGM-08-0025":"Price River Formation of Mesaverde Group", "UDOGM-10-0006":"Flagstaff Limestone (Eocene-Paleocene)", 
           "UDOGM-10-0007":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-10-0011":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-10-0012":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-15-0021":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-15-0031":"Flagstaff Limestone (Eocene-Paleocene)", "UDOGM-22-0012":"Blackhawk Formation of Mesaverde Group", "UDOGM-24-0017":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0018":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0019":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0021":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0022":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0023":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0024":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0025":"Valley Fill", "UDOGM-24-0027":"Valley Fill", "UDOGM-24-0028":"Valley Fill", "UDOGM-24-0029":"Valley Fill", "UDOGM-24-0030":"Valley Fill", "UDOGM-24-0031":"Valley Fill", "UDOGM-24-0032":"Valley Fill", "UDOGM-24-0033":"Valley Fill", "UDOGM-24-0034":"Valley Fill", "UDOGM-24-0035":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0036":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0038":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0039":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0040":"Blue Gate Shale Member of Mancos Shale", "UDOGM-24-0041":"Blue Gate Shale Member of Mancos Shale", 
           "UDOGM-24-0042":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0043":"Ferron Sandstone Member of Mancos Shale", 
           "UDOGM-24-0046":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0047":"Ferron Sandstone Member of Mancos Shale", 
           "UDOGM-24-0048":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0050":"Ferron Sandstone Member of Mancos Shale", 
           "UDOGM-24-0051":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0052":"Ferron Sandstone Member of Mancos Shale", 
           "UDOGM-24-0055":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0057":"Ferron Sandstone Member of Mancos Shale", 
           "UDOGM-24-0058":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-0068":"Ferron Sandstone Member of Mancos Shale", "UDOGM-24-10578":"Ferron Sandstone Member of Mancos Shale", "UDOGM-25-0014":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0015":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0016":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0017":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-25-0018":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0019":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-25-0020":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0021":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0022":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0023":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-25-0024":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0025":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0026":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0027":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0028":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0029":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0030":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0031":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0032":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0037":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-25-0038":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0046":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0059":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0060":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0061":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-25-0064":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0065":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0066":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-0078":"Price River Formation of Mesaverde Group", "UDOGM-25-0079":"Price River Formation of Mesaverde Group", "UDOGM-25-0080":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-25-0081":"Price River Formation of Mesaverde Group", "UDOGM-25-0082":"Price River Formation of Mesaverde Group", "UDOGM-25-0083":"North Horn Formation (Paleocene-Upper Cretaceous)", "UDOGM-25-0084":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0085":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0088":"Price River Formation of Mesaverde Group", "UDOGM-25-0089":"Price River Formation of Mesaverde Group", "UDOGM-25-0090":"Price River Formation of Mesaverde Group", "UDOGM-25-0091":"Castlegate Sandstone of Mesaverde Group", "UDOGM-25-0092":"Price River Formation of Mesaverde Group", "UDOGM-25-0093":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-25-0094":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0095":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-0096":"Price River Formation of Mesaverde Group", 
           "UDOGM-25-0097":"Price River Formation of Mesaverde Group", 
           "UDOGM-25-0098":"Price River Formation of Mesaverde Group", 
           "UDOGM-25-0099":"Price River Formation of Mesaverde Group", "UDOGM-25-0100":"Price River Formation of Mesaverde Group", 
           "UDOGM-25-9460":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-9510":"Castlegate Sandstone of Mesaverde Group", 
           "UDOGM-25-9545":"Blackhawk Formation of Mesaverde Group", "UDOGM-25-9547":"Blackhawk Formation of Mesaverde Group", 
           "UDOGM-28-0009":"Star Point Sandstone of Mesaverde Group", "UDOGM-41-0004":"Valley Fill", "UDOGM-41-0006":"Valley Fill", 
           "UDOGM-41-0007":"Valley Fill", "UDOGM-41-0008":"Valley Fill", "UDOGM-41-0009":"Valley Fill", 
           "UDOGM-41-0010":"Valley Fill", "UDOGM-41-0011":"Valley Fill", "UDOGM-41-0014":"Valley Fill", 
           "UDOGM-41-0025":"Valley Fill", "UDOGM-41-0026":"Valley Fill", "UDOGM-41-0041":"Valley Fill", 
           "UDOGM-41-0047":"Valley Fill", "UDOGM-41-0055":"Valley Fill", 
           "UDOGM-41-0083":"Valley Fill", "UDOGM-41-0084":"Valley Fill", "UDOGM-42-0020":"Valley Fill"}



In [82]:
dogms['FmType'] = dogms[['StationId','FmType']].apply(lambda x: geoapply(dogmgeo, x))

In [83]:
mayogeol = {"UDOGM-41-0083":"Blackhawk", "UDOGM-25-0033":"Star Point", "UDOGM-25-0011":"Star Point", 
            "UDOGM-25-0034":"North Horn", "UDOGM-25-0092":"Blackhawk", "UDOGM-25-0097":"North Horn", 
            "UDOGM-25-0038":"Castlegate", "UDOGM-25-0096":"North Horn", "UDOGM-25-0036":"Castlegate", 
            "UDOGM-25-0046":"Castlegate", "UDOGM-02-11452":"North Horn", "UDOGM-03-0013":"North Horn", 
            "UDOGM-03-0010":"North Horn", "UDOGM-03-0011":"Castlegate", "UDOGM-03-0033":"North Horn", 
            "UDOGM-02-11513":"North Horn", "UDOGM-03-0031":"North Horn", "UDOGM-03-0034":"North Horn", 
            "UDOGM-02-11508":"Price River", "UDOGM-03-0021":"North Horn", "UDOGM-03-0035":"North Horn", 
            "UDOGM-04-0020":"North Horn", "UDOGM-04-0030":"North Horn", "UDOGM-04-0031":"North Horn", 
            "UDOGM-03-0007":"North Horn", "UDOGM-03-0029":"North Horn", "UDOGM-03-9306":"North Horn", 
            "UDOGM-03-9306":"Castlegate", "UDOGM-04-0029":"Price River", "UDOGM-04-0027":"Price River", 
            "UDOGM-03-9305":"Price River", "UDOGM-04-0028":"North Horn", "UDOGM-02-11493":"Price River", 
            "UDOGM-04-0058":"North Horn", "UDOGM-03-0006":"North Horn", "UDOGM-04-0010":"North Horn", 
            "UDOGM-03-9319":"North Horn", "UDOGM-02-11479":"North Horn", "UDOGM-02-11476":"North Horn", 
            "UDOGM-04-0013":"North Horn", "UDOGM-04-0034":"Price River", "UDOGM-04-0025":"North Horn", 
            "UDOGM-02-11715":"Price River", "UDOGM-04-0036":"Price River", "UDOGM-04-0021":"North Horn", 
            "UDOGM-04-0024":"Flagstaff", "UDOGM-04-0015":"North Horn", "UDOGM-02-11467":"North Horn", 
            "UDOGM-02-11498":"North Horn", "UDOGM-02-11487":"North Horn", "UDOGM-04-0012":"Flagstaff", 
            "UDOGM-02-11497":"North Horn", "UDOGM-02-11496":"North Horn", "UDOGM-04-0037":"North Horn", 
            "UDOGM-04-0011":"North Horn", "UDOGM-02-11738":"Flagstaff", "UDOGM-04-0039":"North Horn", 
            "UDOGM-02-11528":"North Horn", "UDOGM-04-0038":"North Horn", "UDOGM-04-0040":"North Horn", 
            "UDOGM-04-0041":"North Horn", "UDOGM-04-0032":"North Horn", "UDOGM-02-11506":"Price River", 
            "UDOGM-02-11507":"Price River", "UDOGM-16-0051":"Star Point", "UDOGM-04-11366":"Blackhawk", 
            "UDOGM-04-11196":"North Horn", "UDOGM-04-11233":"Price River", "UDOGM-04-11234":"Price River", 
            "UDOGM-08-0016":"Blackhawk", "UDOGM-08-0013":"Blackhawk", "UDOGM-04-11236":"Price River", 
            "UDOGM-04-11258":"Star Point", "UDOGM-08-0039":"North Horn", "UDOGM-08-0015":"North Horn", 
            "UDOGM-08-0042":"North Horn", "UDOGM-08-0017":"Price River", "UDOGM-08-0012":"Blackhawk", 
            "UDOGM-08-0018":"North Horn", "UDOGM-08-0018":"North Horn", "UDOGM-08-0032":"North Horn", 
            "UDOGM-16-0056":"North Horn", "UDOGM-08-0025":"Price River", "UDOGM-16-0031":"North Horn", 
            "UDOGM-16-0032":"North Horn", "UDOGM-16-0033":"Price River", "UDOGM-16-0027":"North Horn", 
            "UDOGM-16-0054":"Price River", "UDOGM-16-0030":"North Horn", "UDOGM-16-0028":"North Horn", 
            "UDOGM-16-0041":"North Horn", "UDOGM-16-0040":"North Horn", "UDOGM-12-0014":"North Horn", 
            "UDOGM-12-0022":"North Horn", "UDOGM-16-0042":"North Horn", "UDOGM-16-0043":"North Horn", 
            "UDOGM-12-0034":"Castlegate", "UDOGM-16-0055":"Price River", "UDOGM-16-0037":"Price River", 
            "UDOGM-12-0023":"Castlegate", "UDOGM-16-0039":"Blackhawk", "UDOGM-16-0053":"North Horn", 
            "UDOGM-12-0011":"Castlegate", "UDOGM-16-9762":"Blackhawk", "UDOGM-16-9764":"Blackhawk", 
            "UDOGM-35-9165":"North Horn", "UDOGM-35-0003":"North Horn", "UDOGM-35-0002":"Blackhawk", 
            "UDOGM-35-9177":"North Horn", "UDOGM-22-0014":"Blackhawk", "UDOGM-22-0011":"Blackhawk", 
            "UDOGM-35-0005":"Castlegate", "UDOGM-35-9174":"North Horn", "UDOGM-21-0028":"Blackhawk", 
            "UDOGM-35-9175":"North Horn", "UDOGM-35-0004":"Castlegate", "UDOGM-22-0012":"Blackhawk", 
            "UDOGM-35-0001":"North Horn", "UDOGM-21-0029":"Blackhawk", "UDOGM-22-0013":"Blackhawk", 
            "UDOGM-21-0030":"Blackhawk", "UDOGM-22-0015":"Blackhawk", "UDOGM-21-0027":"Blackhawk", 
            "UDOGM-22-0010":"Blackhawk", "UDOGM-21-0025":"Blackhawk", "UDOGM-21-0024":"Blackhawk", 
            "UDOGM-21-0023":"Blackhawk", "UDOGM-21-0020":"Blackhawk", "UDOGM-21-0022":"Blackhawk", 
            "UDOGM-21-0021":"Blackhawk", "UDOGM-21-0018":"Blackhawk", "UDOGM-21-0019":"Blackhawk", 
            "UDOGM-21-0017":"Blackhawk", "UDOGM-21-0016":"Blackhawk", "UDOGM-28-10298":"North Horn", 
            "UDOGM-13-0011":"Flagstaff", "UDOGM-13-0013":"Flagstaff", "UDOGM-29-10714":"North Horn", 
            "UDOGM-13-0009":"North Horn", "UDOGM-13-0008":"Flagstaff", "UDOGM-29-10725":"Flagstaff"}

In [84]:
dogms['FmType'] = dogms[['StationId','FmType']].apply(lambda x: geoapply(mayogeol, x))

##WQP Data

Data downloaded from the <a href=http://waterqualitydata.us/portal/>Water Quality Portal (WQP)</a> on 7/22/2015. <br>
<b>Stations:</b><br>
http://waterqualitydata.us/Station/search?sampleMedia=Water&characteristicType=Information%3BInorganics%2C+Major%2C+Metals%3BInorganics%2C+Major%2C+Non-metals%3BInorganics%2C+Minor%2C+Metals%3BInorganics%2C+Minor%2C+Non-metals%3BNot+Assigned%3BNutrient%3BPhysical%3BStable+Isotopes&bBox=-111.68%2C38.8%2C-111%2C40&mimeType=csv&zip=yes&sorted=no <br>
<b>Results:</b><br>
http://waterqualitydata.us/Result/search?sampleMedia=Water&characteristicType=Information%3BInorganics%2C+Major%2C+Metals%3BInorganics%2C+Major%2C+Non-metals%3BInorganics%2C+Minor%2C+Metals%3BInorganics%2C+Minor%2C+Non-metals%3BNot+Assigned%3BNutrient%3BPhysical%3BStable+Isotopes&bBox=-111.68%2C38.8%2C-111%2C40&mimeType=csv&zip=yes&sorted=no

Define location of data downloaded from the Water Quality Portal site.

In [85]:
result = rootname +"WQP/Result.csv"

###Results Data

Map data types for data to be imported and designate columns to convert to datetimes.

In [86]:
Rdtypes = {"OrganizationIdentifier":np.str_, "OrganizationFormalName":np.str_, "ActivityIdentifier":np.str_, 
           "ActivityTypeCode":np.str_, "ActivityMediaName":np.str_, "ActivityMediaSubdivisionName":np.str_, 
           "ActivityStartDate":np.str_, "ActivityStartTime/Time":np.str_, "ActivityStartTime/TimeZoneCode":np.str_, 
           "ActivityEndDate":np.str_, "ActivityEndTime/Time":np.str_, "ActivityEndTime/TimeZoneCode":np.str_, 
           "ActivityDepthHeightMeasure/MeasureValue":np.float16, "ActivityDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ActivityDepthAltitudeReferencePointText":np.str_, "ActivityTopDepthHeightMeasure/MeasureValue":np.float16, 
           "ActivityTopDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ActivityBottomDepthHeightMeasure/MeasureValue":np.float16, 
           "ActivityBottomDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ProjectIdentifier":np.str_, "ActivityConductingOrganizationText":np.str_, 
           "MonitoringLocationIdentifier":np.str_, "ActivityCommentText":np.str_, 
           "SampleAquifer":np.str_, "HydrologicCondition":np.str_, "HydrologicEvent":np.str_, 
           "SampleCollectionMethod/MethodIdentifier":np.str_, "SampleCollectionMethod/MethodIdentifierContext":np.str_, 
           "SampleCollectionMethod/MethodName":np.str_, "SampleCollectionEquipmentName":np.str_, 
           "ResultDetectionConditionText":np.str_, "CharacteristicName":np.str_, "ResultSampleFractionText":np.str_, 
           "ResultMeasureValue":np.str_, "ResultMeasure/MeasureUnitCode":np.str_, "MeasureQualifierCode":np.str_, 
           "ResultStatusIdentifier":np.str_, "StatisticalBaseCode":np.str_, "ResultValueTypeName":np.str_, 
           "ResultWeightBasisText":np.str_, "ResultTimeBasisText":np.str_, "ResultTemperatureBasisText":np.str_, 
           "ResultParticleSizeBasisText":np.str_, "PrecisionValue":np.str_, "ResultCommentText":np.str_, 
           "USGSPCode":np.str_, "ResultDepthHeightMeasure/MeasureValue":np.float16, 
           "ResultDepthHeightMeasure/MeasureUnitCode":np.str_, "ResultDepthAltitudeReferencePointText":np.str_, 
           "SubjectTaxonomicName":np.str_, "SampleTissueAnatomyName":np.str_, 
           "ResultAnalyticalMethod/MethodIdentifier":np.str_, "ResultAnalyticalMethod/MethodIdentifierContext":np.str_, 
           "ResultAnalyticalMethod/MethodName":np.str_, "MethodDescriptionText":np.str_, "LaboratoryName":np.str_, 
           "AnalysisStartDate":np.str_, "ResultLaboratoryCommentText":np.str_, 
           "DetectionQuantitationLimitTypeName":np.str_, "DetectionQuantitationLimitMeasure/MeasureValue":np.str_, 
           "DetectionQuantitationLimitMeasure/MeasureUnitCode":np.str_, "PreparationStartDate":np.str_, 
           "ProviderName":np.str_} 

dt = [[6,7],56,61]

Import result data.

In [87]:
res = pd.read_csv(result, dtype=Rdtypes, parse_dates=dt)

In [88]:
#pd.DataFrame(list(res['CharacteristicName'].unique())).to_clipboard()

Remap column names to standards and drop unneeded columns

In [89]:
ResFieldDict = {"AnalysisStartDate":"AnalysisDate", "ResultAnalyticalMethod/MethodIdentifier":"AnalytMeth", 
                "ResultAnalyticalMethod/MethodName":"AnalytMethId", "ResultDetectionConditionText":"DetectCond", 
                "ResultLaboratoryCommentText":"LabComments", "LaboratoryName":"LabName", 
                "DetectionQuantitationLimitTypeName":"LimitType", "DetectionQuantitationLimitMeasure/MeasureValue":"MDL", 
                "DetectionQuantitationLimitMeasure/MeasureUnitCode":"MDLUnit", "MethodDescriptionText":"MethodDescript", 
                "OrganizationIdentifier":"OrgId", "OrganizationFormalName":"OrgName", "CharacteristicName":"Param", 
                "ProjectIdentifier":"ProjectId", "MeasureQualifierCode":"QualCode", "ResultCommentText":"ResultComment", 
                "ResultStatusIdentifier":"ResultStatus", "ResultMeasureValue":"ResultValue", 
                "ActivityCommentText":"SampComment", "ActivityDepthHeightMeasure/MeasureValue":"SampDepth", 
                "ActivityDepthAltitudeReferencePointText":"SampDepthRef", 
                "ActivityDepthHeightMeasure/MeasureUnitCode":"SampDepthU", "SampleCollectionEquipmentName":"SampEquip", 
                "ResultSampleFractionText":"SampFrac", "ActivityStartDate":"SampleDate", "ActivityIdentifier":"SampleId", 
                "ActivityStartTime/Time":"SampleTime", "ActivityMediaSubdivisionName":"SampMedia", 
                "SampleCollectionMethod/MethodIdentifier":"SampMeth", "SampleCollectionMethod/MethodName":"SampMethName", 
                "ActivityTypeCode":"SampType", "MonitoringLocationIdentifier":"StationId", 
                "ResultMeasure/MeasureUnitCode":"Unit", "USGSPCode":"USGSPCode",
                "ActivityStartDate_ActivityStartTime/Time":"SampleDate"} 

In [90]:
res.rename(columns=ResFieldDict,inplace=True)

In [91]:
resdroplist = ["ActivityBottomDepthHeightMeasure/MeasureUnitCode", "ActivityBottomDepthHeightMeasure/MeasureValue", 
               "ActivityConductingOrganizationText", "ActivityEndDate", "ActivityEndTime/Time", 
               "ActivityEndTime/TimeZoneCode", "ActivityMediaName", "ActivityStartTime/TimeZoneCode", 
               "ActivityTopDepthHeightMeasure/MeasureUnitCode", "ActivityTopDepthHeightMeasure/MeasureValue", 
               "HydrologicCondition", "HydrologicEvent", "PrecisionValue", "PreparationStartDate", "ProviderName", 
               "ResultAnalyticalMethod/MethodIdentifierContext", "ResultDepthAltitudeReferencePointText", 
               "ResultDepthHeightMeasure/MeasureUnitCode", "ResultDepthHeightMeasure/MeasureValue", 
               "ResultParticleSizeBasisText", "ResultTemperatureBasisText", 
               "ResultTimeBasisText", "ResultValueTypeName", "ResultWeightBasisText", "SampleAquifer", 
               "SampleCollectionMethod/MethodIdentifierContext", "SampleTissueAnatomyName", "StatisticalBaseCode", 
               "SubjectTaxonomicName"] 

In [92]:
res.drop(resdroplist,inplace=True,axis=1)

Convert string fields to values.

In [93]:
res['ResultValue'] = res['ResultValue'].convert_objects(convert_numeric=True)
res['MDL'] = res['MDL'].convert_objects(convert_numeric=True)

Remove `_WQX` suffix from `StationID`.

In [94]:
res['StationId'] = res['StationId'].str.replace('_WQX-','-')

###Station Data

In [95]:
station = rootname + "WQP\\Station.csv"

In [96]:
stat = pd.read_csv(station)

Map Fields to Standard UGS database format.

In [97]:
StatFieldDict = {"MonitoringLocationIdentifier":"StationId", "AquiferName":"Aquifer", "AquiferTypeName":"AquiferType", 
             "ConstructionDateText":"ConstDate", "CountyCode":"CountyCode", "WellDepthMeasure/MeasureValue":"Depth", 
             "WellDepthMeasure/MeasureUnitCode":"DepthUnit", "VerticalMeasure/MeasureValue":"Elev", 
             "VerticalAccuracyMeasure/MeasureValue":"ElevAcc", "VerticalAccuracyMeasure/MeasureUnitCode":"ElevAccUnit", 
             "VerticalCollectionMethodName":"ElevMeth", "VerticalCoordinateReferenceSystemDatumName":"ElevRef", 
             "VerticalMeasure/MeasureUnitCode":"ElevUnit", "FormationTypeText":"FmType", 
             "WellHoleDepthMeasure/MeasureValue":"HoleDepth", "WellHoleDepthMeasure/MeasureUnitCode":"HoleDUnit", 
             "HorizontalAccuracyMeasure/MeasureValue":"HorAcc", "HorizontalAccuracyMeasure/MeasureUnitCode":"HorAccUnit", 
             "HorizontalCollectionMethodName":"HorCollMeth", "HorizontalCoordinateReferenceSystemDatumName":"HorRef", 
             "HUCEightDigitCode":"HUC8", "LatitudeMeasure":"Lat_Y", "LongitudeMeasure":"Lon_X", 
             "OrganizationIdentifier":"OrgId", "OrganizationFormalName":"OrgName", "StateCode":"StateCode", 
             "MonitoringLocationDescriptionText":"StationComment", "MonitoringLocationName":"StationName", 
             "MonitoringLocationTypeName":"StationType"} 

In [98]:
stat.rename(columns=StatFieldDict,inplace=True)

Drop leftover fields.

In [99]:
statdroplist = ["ContributingDrainageAreaMeasure/MeasureUnitCode", "ContributingDrainageAreaMeasure/MeasureValue", 
                "DrainageAreaMeasure/MeasureUnitCode", "DrainageAreaMeasure/MeasureValue", "CountryCode", "ProviderName", 
                "SourceMapScaleNumeric"]

In [100]:
stat.drop(statdroplist,inplace=True,axis=1)

Remove `_WQX` suffix from `StationID` and take out duplicate stations.

In [101]:
stat['StationId'] = stat['StationId'].str.replace('_WQX-','-')
stat.drop_duplicates(subset=['StationId'],inplace=True)

##Combine Data

Append Results Data Together

In [102]:
results = pd.concat([res,dogmr])

In [103]:
del(res,dogmr)

In [104]:
ParAbb = {"Alkalinity":"Alk", "Alkalinity, Carbonate as CaCO3":"Alk", "Alkalinity, total":"Alk", 
          "Arsenic":"As", "Calcium":"Ca", "Chloride":"Cl", "Carbon dioxide":"CO2", "Carbonate":"CO3", 
          "Carbonate (CO3)":"CO3", "Specific conductance":"Cond", "Conductivity":"Cond", "Copper":"Cu", 
          "Depth":"Depth", "Dissolved oxygen (DO)":"DO", "Iron":"Fe", 
          "Hardness, Ca, Mg":"Hard", "Total hardness -- SDWA NPDWR":"Hard", 
          "Bicarbonate":"HCO3", "Potassium":"K", "Magnesium":"Mg", "Kjeldahl nitrogen":"N", 
          "Nitrogen, mixed forms (NH3), (NH4), organic, (NO2) and (NO3)":"N", "Nitrogen":"N", "Sodium":"Na", 
          "Sodium plus potassium":"NaK", "Ammonia-nitrogen":"NH3_N", "Ammonia-nitrogen as N":"N", "Nitrite":"NO2", 
          "Nitrate":"NO3", "Nitrate as N":"N", "pH, lab":"pH", "pH":"pH",  "Phosphate-phosphorus":"PO4", 
          "Orthophosphate":"PO4", "Phosphate":"PO4", "Stream flow, instantaneous":"Q", "Flow":"Q", 
          "Flow rate, instantaneous":"Q", "Silica":"Si", "Sulfate":"SO4", "Sulfate as SO4":"SO4", 
          "Total dissolved solids":"TDS", "Temperature, water":"Temp", 
          "Total suspended solids":"TSS", "Turbidity":"Turb"}

In [105]:
ParUnAbb = {'Alk':u'mg/L', 'As':u'\u03BCg/L', 'Ca':u'mg/L', 'Cl':u'mg/L', 'CO2':u'mg/L', 'CO3':u'mg/L', 'Cond':u'\u03BChoms/cm',
           'Cu':u'\u03BCg/L', 'DO':u'mg/L', 'Hard':u'mg/L', 'HCO3':u'mg/L', 'K':u'mg/L', 'Mg':u'mg/L', 'N':u'mg/L', 'pH':u'', 
            'Q':u'gpm', 'Fe':u'\u03BCg/L', 'TDS':u'mg/L', 'Si':u'mg/L', 'SO4':u'mg/L', 'Temp':u'\N{DEGREE SIGN}C', 'PO4':u'mg/L', 'TSS':u'mg/L',
           'Turb':u'Turb. Units', 'Na':u'mg/L', 'NaK':u'mg/L', 'NO3':u'mg/L', 'NH3':u'mg/L', 'NO2':u'mg/L', 'NH3_N':u'mg/L'}

In [106]:
results['ParAbb'] = results['Param'].apply(lambda x: ParAbb.get(x),1)

In [107]:
del results['USGSPCode']

In [108]:
results.columns

Index([u'AnalysisDate', u'AnalytMeth', u'AnalytMethId', u'DetectCond',
       u'LabComments', u'LabName', u'LimitType', u'MDL', u'MDLUnit',
       u'MethodDescript', u'OrgId', u'OrgName', u'Param', u'ProjectId',
       u'QualCode', u'ResultComment', u'ResultStatus', u'ResultValue',
       u'SampComment', u'SampDepth', u'SampDepthRef', u'SampDepthU',
       u'SampEquip', u'SampFrac', u'SampMedia', u'SampMeth', u'SampMethName',
       u'SampType', u'SampleDate', u'SampleId', u'StationId', u'Unit',
       u'ParAbb'],
      dtype='object')

In [181]:
def dateparseclean(x):
    try:
        if 'nan' in x:
            x = str(x).replace(' nan','')
            return datetime.datetime.strptime(x, '%Y-%m-%d').date()
        else:
            try:
                return datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').date()
            except(ValueError):
                print('damn')
                return np.nan
    except(TypeError):
        try:
            return x.date()
        except(AttributeError):
            return x

In [182]:
results['SampleDate'] = results['SampleDate'].apply(lambda x: dateparseclean(x))

In [231]:
results.columns

Index([u'index', u'AnalysisDate', u'AnalytMeth', u'AnalytMethId',
       u'DetectCond', u'LabComments', u'LabName', u'LimitType', u'MDL',
       u'MDLUnit', u'MethodDescript', u'OrgId', u'OrgName', u'Param',
       u'ProjectId', u'QualCode', u'ResultComment', u'ResultStatus',
       u'ResultValue', u'SampComment', u'SampDepth', u'SampDepthRef',
       u'SampDepthU', u'SampEquip', u'SampFrac', u'SampMedia', u'SampMeth',
       u'SampMethName', u'SampType', u'SampleDate', u'SampleId', u'StationId',
       u'Unit', u'ParAbb'],
      dtype='object')

In [230]:
def flowconv(x):
    x[1] = str(x[1]).rstrip().lower()
    if x[1] in ['mgd']:
        return x[0]*1.8581441079
    elif x[1] in ['gallons per minute','gpm','g/min','gal/min']:
        return x[0]*0.00222800926
    elif x[1] in ['gpd']:
        return x[0]*1.54722865E-6
    elif x[1] in ['ft3/sec', 'cfs','ft3/s']:
        return x[0]
    elif x[1] in ['cfm']:
        return x[0]*0.0166667
    else:
        return x[0]
    
def unitconv(x):
    z = str(x).rstrip().lower()
    if z in ['mgd','gpm','gpd','cfs','ft3/sec','gallons per min','g/min','gal/min','cfm','ft3/s']:
        return 'cfs'
    else:
        return x

In [215]:
results['ResultValue'] = results[['ResultValue','Unit']].apply(lambda x: flowconv(x))

In [216]:
results['Unit'] = results['Unit'].apply(lambda x: unitconv(x))

In [134]:
#results.to_csv(rootname+"AllResults.csv", chunksize=10000)

In [169]:
def condfix(x):
    if x[1]=='Cond':
        if str(x[2]).rstrip().lower() == 'ms/cm':
            return x[0]/1000
        else:
            return x[0]
    else:
        return x[0]
    
def condunitfix(x):
    if x[1]=='Cond':
        if str(x[0]).rstrip().lower() == 'ms/cm':
            return 'uS/cm'
        else:
            return 'uS/cm'
    else:
        return x[0]

def pHfix(x):
    if x[1] == 'pH':
        if x[0] == 0.0:
            return np.nan
        elif x[0] >= 14.0:
            return np.nan
        else:
            return x[0]
    else:
        return x[0]

In [170]:
results['ResultValue'] = results[['ResultValue','ParAbb','Unit']].apply(lambda x: condfix(x),1)

In [171]:
results['Unit'] = results[['Unit','ParAbb']].apply(lambda x: condunitfix(x),1)

In [172]:
results['ResultValue'] = results[['ResultValue','ParAbb']].apply(lambda x: pHfix(x),1)

##Stations

In [271]:
#dogms.set_index(['StationId'],inplace=True)
station = pd.concat([stat,dogms])
station.set_index(['StationId'],inplace=True)

In [272]:
station["index"] = station.index
station.drop_duplicates(subset='index', take_last=True, inplace=True)
del station["index"]

In [273]:
stattype = {"Stream: Ditch":"Stream", "Stream":"Stream", "Well":"Well", "Spring":"Spring", 
            "Stream: Canal":"Stream", "Subsurface: Tunnel, shaft, or mine":"Mine Drain", 
            "Well: Test hole not completed as a well":"Well", "Subsurface: Groundwater drain":"Mine Drain", 
            "Lake, Reservoir, Impoundment":"Lake", "River/Stream":"Stream", "Lake":"Lake", "Facility Other":"Other", 
            "Canal Drainage":"Stream", "Canal Irrigation":"Stream", "MD":"Mine Drain", "SW":"Stream", "GW":"Well", 
            "SP":"Spring", "UPDES Permit discharge point":"Mine Drain", "Lake; Sediment Pond; Stagnant water":"Lake", 
            "Other":"Other", "CG-2":"Other"}


In [274]:
station['StationType'] = station['StationType'].apply(lambda x: stattype.get(x),1)

In [275]:
fmdict = {"Blackhawk Formation of Mesaverde Group":4, "Castlegate Sandstone of Mesaverde Group":5, 
          "Ferron Sandstone Member of Mancos Shale":2, "Flagstaff Limestone (Eocene-Paleocene)":8, 
          "Green River Formation":10, "Mancos Shale":1, "North Horn Formation (Paleocene-Upper Cretaceous)":7,
          "Paleozoic Erathem":0, "Price River Formation of Mesaverde Group":6,
          "Star Point Sandstone of Mesaverde Group":3, "Valley Fill":9}

In [276]:
station['FmNum'] = station['FmType'].apply(lambda x: fmdict.get(x),1)

#Merge Station Duplicates

In [277]:
switchStat =   {'UDOGM-02-11435':'UDOGM-03-9248', 'UDOGM-19-11903':'UDOGM-10-0037', 
                'UDOGM-19-11904':'UDOGM-10-0038', 'UDOGM-41-11816':'UDOGM-41-11817', 'UDOGM-24-10581':'UDOGM-24-10579', 
                'UDOGM-24-10497':'UDOGM-24-10501', 'UDOGM-24-10502':'UDOGM-24-10503',
                'UDOGM-02-11451':'UDOGM-03-9300', 'UDOGM-03-0016':'UDOGM-03-9300', 'UDOGM-02-11435':'UDOGM-03-9248', 
                'UDOGM-02-11787':'UDOGM-03-9405', 'UDOGM-03-9373':'UDOGM-02-11774', 'UDOGM-02-11777':'UDOGM-03-9384', 
                'UDOGM-02-11782':'UDOGM-03-9397', 'UDOGM-02-11786':'UDOGM-03-9404', 'UDOGM-02-11777':'UDOGM-03-9384', 
                'UDOGM-02-11775':'UDOGM-03-9374', 'UDOGM-02-11778':'UDOGM-03-9385', 'UDOGM-02-11772':'UDOGM-03-9371',
                'UDOGM-02-11691':'UDOGM-03-9348', 'UDOGM-02-11780':'UDOGM-03-9389', 'UDOGM-02-11785':'UDOGM-03-9400', 
                'UDOGM-02-11596':'UDOGM-03-9325', 'UDOGM-02-11788':'UDOGM-03-9409', 'UDOGM-24-10546':'USGS-385049111153101',
                'UDOGM-24-10545':'USGS-385045111171801', 'UDOGM-24-10578':'UDOGM-24-0055',
                'UDOGM-24-10579':'UDOGM-24-10581', 'UDOGM-24-10547':'USGS-385155111141801',
                'UDOGM-24-10492':'USGS-385209111171801', 'UDOGM-24-0019':'USGS-385224111142602',
                'UDOGM-24-10526':'USGS-385233111130301', 'UDOGM-24-10466':'USGS-385232111172701',
                'UDOGM-24-10528':'USGS-385249111130901', 'UDOGM-24-10474':'USGS-385249111132301',  
                'UDOGM-24-10543':'USGS-385300111153701', 'UDOGM-24-10529':'USGS-385303111131301',
                'UDOGM-24-10502':'UDOGM-24-10503', 'UDOGM-24-10473':'USGS-385400111122901',
                'UDOGM-24-10485':'USGS-385427111161201', 'UDOGM-24-10544':'USGS 385426111160801',
                'UDOGM-24-10484':'USGS 385554111152102', 'UDOGM-02-11766':'UDOGM-03-9361',
                'UDOGM-02-11450':'USGS-391751111130201', 'UDOGM-03-9292':'USGS-391751111130201',
                'UDOGM-02-11690':'UDOGM-03-9347', 'UDOGM-02-11597':'UDOGM-03-9326',
                'UDOGM-02-11449':'UDOGM-03-9291', 'UDOGM-02-11598':'UDOGM-03-9327',
                'UDOGM-02-11689':'UDOGM-03-9346', 'UDOGM-02-11599':'UDOGM-03-9328',
                'UDOGM-02-11457':'UDOGM-03-9312', 'UDOGM-02-11784':'UDOGM-03-9399',
                'UDOGM-02-11444':'UDOGM-03-9272', 'UDOGM-02-11456':'UDOGM-03-9310',
                'UDOGM-02-11443':'UDOGM-03-0060', 'UDOGM-02-11783':'UDOGM-03-9398',
                'UDOGM-02-11770':'UDOGM-03-9369', 'UDOGM-02-11453':'UDOGM-03-9307',
                'UDOGM-02-0004':'UTAHDWQ-4931020', 'UDOGM-02-11768':'UDOGM-03-9363',
                'UDOGM-02-11454':'UDOGM-03-9308', 'UDOGM-02-0006':'UTAHDWQ-4930730',
                'UDOGM-02-11630':'UDOGM-03-9330', 'UDOGM-02-11682':'UDOGM-03-9344',
                'UDOGM-02-11634':'UDOGM-03-9333', 'UDOGM-02-11439':'UDOGM-03-0058',
                'UDOGM-02-11440':'UDOGM-03-0059', 'UDOGM-02-11571':'UDOGM-03-9324',
                'UDOGM-03-9356':'UDOGM-03-9355', 'UDOGM-02-11769':'UDOGM-9366',
                'UDOGM-02-11633':'UDOGM-03-9332', 'UDOGM-02-11644':'UDOGM-03-9336',
                'UDOGM-02-11479':'UDOGM-04-0019', 'UDOGM-02-11536':'UDOGM-03-9322',
                'UDOGM-02-11535':'UDOGM-03-9321', 'UDOGM-02-11662':'UDOGM-03-9340',
                'UDOGM-02-11466':'UDOGM-03-9314', 'UDOGM-02-11674':'UDOGM-03-9343',
                'UDOGM-02-11632':'UDOGM-03-9331', 'UDOGM-02-11437':'UDOGM-03-9255',
                'UDOGM-02-11669':'UDOGM-03-9341', 'UDOGM-02-11478':'UDOGM-03-9315',
                'UDOGM-02-11483':'UDOGM-03-9317', 'UDOGM-02-11438':'UDOGM-03-9256',
                'UDOGM-02-11484':'UDOGM-03-9318', 'UDOGM-02-11648':'UDOGM-03-9339',
                'UDOGM-02-11767':'UDOGM-03-9362', 'UDOGM-02-11673':'UDOGM-03-9342',
                'UDOGM-02-11537':'UDOGM-03-9323', 'UDOGM-02-11569':'USGS-392210111094401',
                'UDOGM-04-0011':'USGS-392210111094401', 'UDOGM-02-11518':'UDOGM-04-0037',
                'UDOGM-02-11609':'UDOGM-04-10812', 'UDOGM-02-11659':'UDOGM-04-0067',
                'UDOGM-02-11610':'UDOGM-04-10813', 'UDOGM-04-11192':'UDOGM-02-11757',
                'UDOGM-04-11158':'UDOGM-04-0203', 'UDOGM-04-11171':'UDOGM-04-0205',
                'UDOGM-04-11170':'UDOGM-04-0204', 'UDOGM-04-10817':'UDOGM-02-11616',
                'UDOGM-02-11702':'UDOGM-04-10870', 'UDOGM-04-11174':'UDOGM-04-0234',
                'UDOGM-04-10869':'UDOGM-02-11667', 'UDOGM-04-10818':'UDOGM-02-11618',
                'UDOGM-04-10820':'UDOGM-02-11620', 'UDOGM-04-10866':'UDOGM-02-11663',
                'UDOGM-04-10874':'UDOGM-04-10873', 'UDOGM-04-11194':'UDOGM-04-0164',
                'UDOGM-04-11172':'UDOGM-04-0206', 'UDOGM-04-11176':'UDOGM-04-0208',
                'UDOGM-02-11661':'UDOGM-04-10865', 'UDOGM-02-11665':'UDOGM-04-10868',
                'UDOGM-06-0004':'UDOGM-05-0080', 'UDOGM-02-11660':'UDOGM-04-10864',
                'UDOGM-04-11178':'UDOGM-04-0210', 'UDOGM-04-11180':'UDOGM-04-0212',
                'UDOGM-04-11148':'UDOGM-04-11114', 'UDOGM-04-11181':'UDOGM-04-11147',
                'UDOGM-04-11191':'UDOGM-02-11756', 'UDOGM-04-11149':'UDOGM-04-0214',
                'UDOGM-04-11163':'UDOGM-04-0227', 'UDOGM-04-11162':'UDOGM-04-0226',
                'UDOGM-04-11165':'UDOGM-04-11131', 'UDOGM-04-11150':'UDOGM-04-0215',
                'UDOGM-04-11164':'UDOGM-04-0228', 'UDOGM-02-11605':'UDOGM-04-10810',
                'UDOGM-04-11159':'UDOGM-04-0223', 'UDOGM-04-11166':'UDOGM-04-0229',
                'UDOGM-04-11161':'UDOGM-04-0225', 'UDOGM-04-11160':'UDOGM-04-0224',
                'UDOGM-04-11157':'UDOGM-04-0222', 'UDOGM-04-11155':'UDOGM-04-0220',
                'UDOGM-04-11154':'UDOGM-04-0219', 'UDOGM-04-11152':'UDOGM-04-0217',
                'UDOGM-04-11167':'UDOGM-04-0230', 'UDOGM-04-11363':'UDOGM-04-11333',
                'UDOGM-04-11153':'UDOGM-04-0218', 'UDOGM-04-11046':'UDOGM-04-11002',
                'UDOGM-04-11151':'UDOGM-04-11151', 'UDOGM-04-11156':'UDOGM-04-0221',
                'UDOGM-04-11044':'UDOGM-04-11000', 'UDOGM-04-11045':'UDOGM-04-11001',
                'UDOGM-02-11542':'UDOGM-04-10776', 'UDOGM-04-11348':'UDOGM-04-11318',
                'UDOGM-02-11554':'UDOGM-04-10783', 'UDOGM-02-11563':'UDOGM-04-10793',
                'UDOGM-04-11342':'UDOGM-04-11312', 'UDOGM-04-11177':'UDOGM-04-0209',
                'UDOGM-02-11561':'UDOGM-04-10791', 'UDOGM-02-11558':'UDOGM-04-10788',
                'UDOGM-04-11168':'UDOGM-04-0231', 'UDOGM-04-0266':'UDOGM-04-0265',
                'UDOGM-04-11169':'UDOGM-04-0232', 'UDOGM-04-10784':'UDOGM-02-11555',
                'UDOGM-04-11005':'UDOGM-04-0201', 'UDOGM-04-11179':'UDOGM-04-0211',
                'UDOGM-04-11173':'UDOGM-04-0233', 'UDOGM-04-10823':'UDOGM-02-11626',
                'UDOGM-04-11058':'UDOGM-04-11012', 'UDOGM-04-10789':'UDOGM-02-11559',
                'UDOGM-04-11057':'UDOGM-04-11011', 'UDOGM-04-11055':'UDOGM-04-11009',
                'UDOGM-04-11054':'UDOGM-04-11008', 'UDOGM-04-11006':'UDOGM-04-0202',
                'UDOGM-04-11056':'UDOGM-04-11010', 'UDOGM-02-11541':'UDOGM-04-10775',
                'UDOGM-04-10785':'UDOGM-02-11556', 'UDOGM-04-11053':'UDOGM-04-11007',
                'UDOGM-04-11004':'UDOGM-04-0193', 'UDOGM-04-10822':'UDOGM-02-11625',
                'UDOGM-02-11714':'UDOGM-04-10882', 'UDOGM-02-11722':'UDOGM-04-10974',
                'UDOGM-04-10877':'UDOGM-02-11710', 'UDOGM-04-10973':'UDOGM-02-11720',
                'UDOGM-04-10821':'UDOGM-02-11621', 'UDOGM-04-10972':'UDOGM-02-11719',
                'UDOGM-04-11349':'UDOGM-04-11319', 'UDOGM-02-11713':'UDOGM-04-10881',
                'UDOGM-04-10971':'UDOGM-02-11718', 'UDOGM-04-10787':'UDOGM-02-11557',
                'UDOGM-08-11384':'UDOGM-16-9768', 'UDOGM-08-11378':'UDOGM-08-11372',
                'UDOGM-05-0032':'USGS-392644111061701', 'UDOGM-08-11379':'UDOGM-08-11373',
                'UDOGM-08-11380':'UDOGM-08-11374', 'UDOGM-04-10953':'UDOGM-08-11374',
                'UDOGM-08-11381':'UDOGM-08-11375', 'UDOGM-08-11382':'UDOGM-08-11376',
                'UDOGM-04-11299':'UDOGM-08-11416', 'UDOGM-23-10374':'USGS-392750111050401',
                'UDOGM-08-0039':'UDOGM-04-11197', 'UDOGM-23-10371':'USGS-392818111064801',
                'UDOGM-23-10370':'USGS-392834111063001', 'UDOGM-16-9752':'UDOGM-16-0056', 
                'UDOGM-08-0025':'USGS-392917111131601', 'UDOGM-16-0035':'USGS-393151111041301', 
                'UDOGM-16-0036':'USGS-393154111041401', 'UDOGM-16-0025':'USGS-393148111044001', 
                'UDOGM-16-9766':'USGS-393247111065501', 'UDOGM-16-9767':'USGS-393247111065501', 
                'UDOGM-16-9760':'UDOGM-16-9761', 'UDOGM-35-9133':'UDOGM-35-9131', 
                'UDOGM-21-0026':'UDOGM-22-0009', 'UDOGM-21-0037':'UTAHDWQ-4933070', 'UDOGM-22-0005':'UTAHDWQ-4933070',
                'UDOGM-22-0023':'UTAHDWQ-5931590', 'UDOGM-21-0039':'UTAHDWQ-5931590',
                'UDOGM-21-0015':'UDOGM-22-0022', 'UDOGM-21-0005':'USGS-394103111091602',
                'UDOGM-29-10699':'UDOGM-29-0016', 'UDOGM-15-0025':'UDOGM-10-0025',
                'UDOGM-42-0020':'USGS-394424111090501', 'UTAHDWQ-5931500':'USGS-394424111090501',
                'UDOGM-10-0016':'UDOGM-15-0023', 'UDOGM-15-0031':'UDOGM-10-0006',
                'UDOGM-29-10583':'UDOGM-13-0008','UDOGM-29-10727':'UDOGM-13-0009',
                'UDOGM-02-11472':'UDOGM-04-0014','UDOGM-02-11486':'UDOGM-03-9320',
                'UDOGM-02-11485':'UDOGM-03-9319',
                'UTAHDWQ-4931140':'USGS-391634111170001', 'UTAHDWQ-4955460':'USGS-384845111195101',
                'UTAHDWQ-4955300':'USGS-09332100', 'UTAHDWQ-4955320':'USGS-384846111121701',
                'UTAHDWQ-4955423':'USGS-385236111172901', 'UTAHDWQ-4955570':'USGS-385715111121601',
                'UTAHDWQ-4930590':'USGS-392455111070000', 'USGS-391745111403101':'USGS-391750111402501'}

def statget(x):
    statMatch = list(switchStat.keys())
    if x in statMatch:
        return switchStat.get(x)
    else:
        return x

In [142]:
results['StationId'] = results['StationId'].apply(lambda x: statget(x))

In [278]:
station.reset_index(inplace=True)
station['StationId'] = station['StationId'].apply(lambda x: statget(x))

In [279]:
station.reset_index(inplace=True)
station.set_index(['StationId'],inplace=True)

In [144]:
results.reset_index(inplace=True)

#Summarize and Plot Data

In [145]:
import matplotlib.pyplot as plt
from pylab import rcParams, rc
from matplotlib.backends.backend_pdf import PdfPages
#from __future__ import unicode_literals
%matplotlib inline
rcParams['figure.figsize'] = 15, 10

In [253]:
durationsummary = results.groupby(['StationId','ParAbb'])['SampleDate'].agg([np.min,np.max,np.size]).reset_index()

In [239]:
durationsummary.convert_objects(convert_numeric=True)
print list(durationsummary.columns)
durationsummary['amax'] =  durationsummary['amax'].apply(lambda x: str(x).replace('nan','00:00:00'),1)
durationsummary['amin'] =  durationsummary['amin'].apply(lambda x: str(x).replace('nan','00:00:00'),1)
durationsummary['duration'] = durationsummary['amax'].astype(np.datetime64) - durationsummary['amin'].astype(np.datetime64)

['StationId', 'ParAbb', 'amin', 'amax', 'size']


In [263]:
durationsummary.drop_duplicates(subset=['StationId'],inplace=True)
LongFlow = durationsummary[(durationsummary['ParAbb']=='Q') & (durationsummary['size']>10)]

In [245]:
durationsummary.to_csv(rootname + "fieldsummaries.csv")

In [280]:
LongFlowList = list(LongFlow['StationId'])
FlowRes = results[(results['StationId'].isin(LongFlowList)) & (results['ResultValue'] != np.nan)]

In [281]:
FlowRes['SampleDate'] = FlowRes.SampleDate.astype(np.datetime64)
FlowRes.reset_index(inplace=True)
FlowRes.set_index('SampleDate',inplace=True)
FlowRes.sort_index(inplace=True)
FlowRes['pyDate'] = FlowRes.index.to_pydatetime()
FlowRes['JulDate'] = FlowRes.index.to_julian_date()


Flow = pd.merge(FlowRes, station, left_on='StationId', right_index=True, how='left')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [282]:
pdf = PdfPages(rootname+'flowres.pdf')

xmax = np.max(Flow['pyDate'].values)
xmin = np.min(Flow['pyDate'].values)

from pylab import rcParams
rcParams.update({'font.size': 10})

for key, grp in Flow.groupby(['StationId']):
    if np.max(grp['ResultValue'])>0:
        # determine cubicft flowing for each time period
        grp['CubicFt'] = grp['ResultValue'] * grp['JulDate'].diff()*60*60*24*2.29568411E-5 
        y2 = grp['CubicFt'].cumsum().values
        # filter out graphs smaller than 30 non-zero points
        #if np.count_nonzero(grp['ResultValue']) > 30:
        fig = plt.figure(figsize=(10,8))
        title = str(str(grp['StationName'].values[0]) + ' ' + str(grp['StationType'].values[0]))
        if title == "nan nan":
            plt.title(key)
        else:
            plt.title(title)

        ax = fig.add_subplot(111)
        ax.plot(grp['pyDate'].values,grp['ResultValue'].values, label='Discharge', linestyle='-', marker='.', color='b')
        ax.set_ylabel('Discharge (cfs)')
        ax.set_xlabel('Date')
        ax.set_xlim([xmin,xmax])

        ax2 = ax.twinx()
        ax2.plot(grp['pyDate'].values, y2, label='Cumulative Discharge', linestyle='--', color='g')
        ax2.set_ylabel('Cumulative Discharge (ac-ft)')


        # ask matplotlib for the plotted objects and their labels
        lines, labels = ax.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax2.legend(lines + lines2, labels + labels2, loc='upper left', bbox_to_anchor=(-0.10, 1.15))
        ax2.set_xlim([xmin,xmax])
        #plt.tight_layout()
        pdf.savefig(fig)
        plt.close()
    


d = pdf.infodict()
d['Title'] = 'Discharge Data'
d['Author'] = u'Paul Inkenbrandt'
d['Subject'] = 'Discharge Data from Manti LaSal'
d['Keywords'] = 'PdfPages multipage keywords author title subject'
d['CreationDate'] = datetime.datetime.today()
d['ModDate'] = datetime.datetime.today()
pdf.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


#Pivot Data

Drop rows from the `results` table that have null `SampleId` values or parameter abbreviations.

In [286]:
results.dropna(subset=['SampleId','ParAbb'],how='any', inplace=True)

Drop rows from the `results` table that have a detection condition (ex. "not detected").  This eliminates nondetects, which will inhereantly bias the data, as it is not considering values below the detection levels.

In [287]:
results = results[pd.isnull(results['DetectCond'])]

Drop rows from the `results` table that have duplicate `SampleId` values and parameters (chemical concentrations, field measurements). The `SampleId` field will be the index for the pivoted table.  Each row in the pivoted table will represent an individual water sample.  The `SampleId` is applied to each parameter that comes from the same water sample.  For example, if I go out to a stream and fill a water bottle and have that analyzed for 4 different parameters (i.e. calcium, magnesium, sodium, and chloride), then each result from the analysis of that water will have the same `SampleId`.  Sometimes we sample a station multiple times, so one `StationId` can have many `SampleId` values. 

In [288]:
results.drop_duplicates(subset=['SampleId','ParAbb'],inplace=True)

Select results that have more than 50 flow values to plot flow change over time.
http://stackoverflow.com/questions/17926273/how-to-count-distinct-values-in-a-column-of-a-pandas-group-by-object

Summarize All Fields by date and count

Pivot the data so that parameters are now columns.  

In [289]:
datap = results.pivot(index='SampleId', columns='ParAbb', values='ResultValue')

Drop columns from the pivot table that are pretty much empty.

In [290]:
datap.dropna(subset=['SO4','Cond','Temp','TDS','pH'],how='all',inplace=True)

The table lost the `StationId` field when it was pivoted, so now we need to add the `StationId` field back on to the table by joining it to the previous results table using the `SampleId` field.  First we parse down the results table to only the information we want to retain, then we join the tables.

In [291]:
resdrop = ['AnalysisDate', 'AnalytMeth', 'AnalytMethId',
             'DetectCond', 'LabComments', 'LabName', 'LimitType',
             'MDL', 'MDLUnit', 'MethodDescript',
             'OrgId', 'OrgName', 'Param', 'ProjectId',
             'QualCode', 'ResultComment', 'ResultStatus', 'ResultValue',
             'SampComment', 'SampDepth',
             'SampDepthRef', 'SampDepthU', 'SampEquip', 'SampFrac',
             'SampMedia', 'SampMeth', 'SampMethName', 'SampType',
             'Unit', 'ParAbb']
resPivot = results.drop(resdrop, axis=1)

In [292]:
datapiv = pd.merge(datap, resPivot, left_index=True, right_on='SampleId',how='left')

Now that we have a `StationId` field, we can add our station data, but only the data that will be useful for plotting our data.

In [293]:
pivStats = station.drop(['Aquifer', 'ConstDate', 'Depth', 'DepthUnit',
                         'HoleDUnit', 'HoleDepth', 
                         'OrgId', 'StationComment', 'StationName', 'matchid'], axis=1)

In [294]:
datapiv.columns

Index([u'Alk', u'As', u'CO2', u'CO3', u'Ca', u'Cl', u'Cond', u'Cu', u'DO',
       u'Depth', u'Fe', u'HCO3', u'Hard', u'K', u'Mg', u'N', u'NH3_N', u'NO2',
       u'NO3', u'Na', u'NaK', u'PO4', u'Q', u'SO4', u'Si', u'TDS', u'TSS',
       u'Temp', u'Turb', u'pH', u'index', u'SampleDate', u'SampleId',
       u'StationId'],
      dtype='object', name=u'ParAbb')

In [298]:
mayoresults['SampleDate'] = mayoresults['Date']
mayoresults['Q'] = mayoresults['Flow']
mayoresults.drop(['Flow','Date','Site'],inplace=True,axis=1)

In [299]:
datapiv = pd.concat([mayoresults,datapiv])

In [300]:
datapiv = pd.merge(datapiv, pivStats, left_on='StationId', right_index=True, how='left')

In [301]:
datapiv.drop_duplicates(inplace=True)

MemoryError: 

In [None]:
datapiv.to_csv(rootname+"AllResultsPivot.csv", chunksize=10000)

#Create Table For Plotting

In [None]:
piperdata = datapiv.dropna(subset = ['Ca','Na','Cl','K','Mg','SO4'],how='any')

Find Relationship between Bicarbonate and Alkalinity.  Fill in missing bicarbonate values.

In [None]:
from scipy.stats import linregress

In [None]:
piv = piperdata.ix[:,['Alk','HCO3']]
piv = piv[(piv.Alk < 5000)&(piv.HCO3 < 5000)]
piv = piv[(piv.Alk > 0)&(piv.HCO3 > 0)]
piv.dropna(inplace=True)
lin = linregress(piv.Alk.values,piv.HCO3.values)
print lin
piperdata.ix[:,"HCO3"] = piperdata.apply(lambda x: x['Alk']*lin[0]+lin[1] if np.isnan(x['HCO3']) else x['HCO3'],1)

In [None]:
piperdata = piperdata.drop(['Alk','As','CO2','Cu','DO','NaK','PO4'], axis=1)

In [None]:
piperdata.dropna(subset=['Lat_Y','HCO3'], how='any', inplace=True)

In [None]:
piperdata = pd.merge(piperdata, StatFreq, on='StationId', how='left')

In [None]:
d = {'Ca':0.04990269, 'Mg':0.082287595, 'Na':0.043497608, 'K':0.02557656, 'Cl':0.028206596, 
     'HCO3':0.016388838, 'CO3':0.033328223, 'SO4':0.020833333, 'NO2':0.021736513, 'NO3':0.016129032}
parList = ['Ca','Mg','Na','K','Cl','HCO3','SO4']


def CO3zero(x):
    if (np.isnan(x)):
        return 0.0
    else:
        return x

def chrgbal(x):
    cation = (x[0]*d['Na'] + x[1]*d['K'] + x[2]*d['Mg'] + x[3]*d['Ca'])
    anion = (x[4]*d['Cl'] + x[5]*d['HCO3'] + x[6]*d['CO3'] + x[7]*d['SO4']) 
    return  (cation - anion)/(cation+anion)*100

In [None]:
piperdata['CO3'] = piperdata['CO3'].apply(lambda x: CO3zero(x),1)
piperdata['chrgbal'] = piperdata[['Na','K','Mg','Ca','Cl','HCO3','CO3','SO4']].apply(lambda x: chrgbal(x),1)

In [None]:
for par in parList:
    piperdata[par+"_mql"] = piperdata[par].apply(lambda x: x*d[par],1)

In [None]:
piperdata = piperdata[abs(piperdata['chrgbal']) < 5.0]

In [None]:
piperdata.to_csv(rootname+"PiperData.csv")

In [None]:
piperdata.groupby('FmType')['Mg_mql'].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()

In [None]:
print piperdata.groupby('FmType')['StationId'].nunique()
piperdata.groupby('StationType')['StationId'].nunique()

In [None]:
names = list(set(list(piperdata.columns)) - set(['SampleDate', 'SampleId', 'StationId', 'FmType', 'Lat_Y', 
                                          'Lon_X', 'OrgName', 'StationType','DEM','HorCollMet','PLSS','USGSCAD',
                                              'UNITNAME', 'Elev', 'MAP_UNIT_SYMBOL', u'HUC_12', u'HUC_10',
                                              'Depth', u'OBJECTID', u'ElevUnit',u'HU_12_NAME',u'HU_10_NAME', u'FmNum']))

In [None]:
Forms = pd.DataFrame(fmdict.items(),columns=['FmType','FmNum'] )

In [None]:
?pd.merge

In [None]:
names = list(set(list(piperdata.columns)) - set(['SampleDate', 'SampleId', 'StationId', 'FmType', 'Lat_Y', 
                                          'Lon_X', 'OrgName', 'StationType','DEM','HorCollMet','PLSS','USGSCAD',
                                              'UNITNAME', 'Elev', 'MAP_UNIT_SYMBOL', u'HUC_12', u'HUC_10',
                                              'Depth', u'OBJECTID', u'ElevUnit',u'HU_12_NAME',u'HU_10_NAME', u'FmNum',
                                                'count_nonzero','chrgbal']))
print names

rc('font', family='Arial')
pdf = PdfPages(rootname+'pipergeoboxes.pdf')
j = {}

for i in range(len(names)):
    fms = piperdata[[names[i],'FmNum','FmType']]
    fms.dropna(inplace=True)
    fms = fms[~fms['FmType'].isin([' ','', np.nan, 'Green River Formation', 'Rock Springs Formation of Mesaverde Group', 
                                   'Holocene Alluvium', 'Masuk Member of Mancos Shale', 
                                   'Tununk Shale Member of Mancos Shale', 'Blue Gate Shale Member of Mancos Shale', 
                                   'Emery Sandstone Member of Mancos Shale', 'Pleistocene Series', 
                             'Paleozoic Erathem', 'Mancos Shale'])]
    if len(fms)>30:
        j[names[i]] = fms.groupby('FmNum')[names[i]].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()
        j[names[i]] = pd.merge(j[names[i]],Forms, left_on='FmNum', right_on='FmNum',how='left')
        labs = [str(j[names[i]]['FmType'][b]) + " (n=" + str(int(j[names[i]]['size'][b])) + ")" for b in range(len(j[names[i]][va]))]
        tickloc = [b+1 for b in range(len(j[names[i]][va]))]
        ax = fms.boxplot(column=names[i], by='FmNum', vert=False)
        try:
            plt.title(ParAbb.keys()[ParAbb.values().index(names[i])])
        except(ValueError):
            plt.title(names[i])
        plt.suptitle('')
        plt.yticks(tickloc, labs)
        fig = ax.get_figure()
        if ParUnAbb.get(names[i],'') == '':
            units = names[i]
        else:
            units = u'%s (%s)'%(names[i],ParUnAbb.get(names[i]))
        plt.xlabel(units)     
        if np.max(fms[names[i]].values) > 100:
            plt.xscale('log')  
        plt.tight_layout()      
        pdf.savefig(fig)
        plt.close()
pdf.close()
parsum = pd.concat(j)
parsum.to_csv(rootname + "geology_stats.csv")

In [None]:
names = list(set(list(piperdata.columns)) - set(['SampleDate', 'SampleId', 'StationId', 'FmType', 'Lat_Y', 
                                          'Lon_X', 'OrgName', 'StationType','DEM','HorCollMet','PLSS','USGSCAD',
                                              'UNITNAME', 'Elev', 'MAP_UNIT_SYMBOL', u'HUC_12', u'HUC_10',
                                              'Depth', u'OBJECTID', u'ElevUnit',u'HU_12_NAME',u'HU_10_NAME', 
                                                 u'FmNum', u'FmNum',
                                                'count_nonzero','chrgbal']))
print names

pipersprings = piperdata[piperdata['StationType']=='Spring']

rc('font', family='Arial')
pdf = PdfPages(rootname+'piperspringsgeoboxes.pdf')
j = {}

for i in range(len(names)):
    fms = pipersprings[[names[i],'FmNum']]
    fms.dropna(inplace=True)
    fms = fms[~fms[va].isin([' ','', np.nan, 'Green River Formation', 'Rock Springs Formation of Mesaverde Group', 
                                   'Holocene Alluvium', 'Masuk Member of Mancos Shale', 
                                   'Tununk Shale Member of Mancos Shale', 'Blue Gate Shale Member of Mancos Shale', 
                                   'Emery Sandstone Member of Mancos Shale', 'Pleistocene Series', 'Paleozoic Erathem',
                             'Mancos Shale', 'Ferron Sandstone Member of Mancos Shale'])]
    if len(fms)>30:
        j[names[i]] = fms.groupby('FmNum')[names[i]].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()
        j[names[i]] = pd.merge(j[names[i]],Forms, left_on='FmNum', right_on='FmNum',how='left')
        labs = [str(j[names[i]]['FmType'][b]) + " (n=" + str(int(j[names[i]]['size'][b])) + ")" for b in range(len(j[names[i]][va]))]
        tickloc = [b+1 for b in range(len(j[names[i]][va]))]
        ax = fms.boxplot(column=names[i], by='FmNum', vert=False)
        try:
            plt.title(ParAbb.keys()[ParAbb.values().index(names[i])])
        except(ValueError):
            plt.title(names[i])
        plt.suptitle('')
        plt.yticks(tickloc, labs)
        fig = ax.get_figure()
        if ParUnAbb.get(names[i],'') == '':
            units = names[i]
        else:
            units = u'%s (%s)'%(names[i],ParUnAbb.get(names[i]))
        plt.xlabel(units)     
        if np.max(fms[names[i]].values) > 100:
            plt.xscale('log')  
        plt.tight_layout()      
        pdf.savefig(fig)
        plt.close()
pdf.close()
parsum = pd.concat(j)
parsum.to_csv(rootname + "springs_geology_stats.csv")

In [None]:
pdf = PdfPages(rootname+'hucboxes.pdf')
j = {}
for i in range(len(names)):
    hucs = datapiv[[names[i],'HU_10_NAME']]
    hucs.dropna(inplace=True)
    hucs = hucs[~hucs['HU_10_NAME'].isin([' ','', np.nan, 'Spanish Fork Creek',
                                          'Twelvemile Creek','Chicken Creek','North Salt Wash', 'West Creek'])]
    if len(hucs)>30:
        j[names[i]] = hucs.groupby('HU_10_NAME')[names[i]].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()
        labs = [str(j[names[i]]['HU_10_NAME'][b]) + " (n=" + str(int(j[names[i]]['size'][b])) + ")" for b in range(len(j[names[i]]['HU_10_NAME']))]
        tickloc = [b+1 for b in range(len(j[names[i]]['HU_10_NAME']))]              
        ax = hucs.boxplot(column=names[i],by='HU_10_NAME',vert=False)
        plt.suptitle('')
        plt.title(ParAbb.keys()[ParAbb.values().index(names[i])])
        plt.yticks(tickloc, labs)
        fig = ax.get_figure() 
        if ParUnAbb.get(names[i],'') == '':
            units = names[i]
        else:
            units = names[i] +' (' + u'%s'% (ParUnAbb.get(names[i])) +')'
        plt.xlabel(units) 
        if np.max(hucs[names[i]].values) > 100:
            plt.xscale('log')
        
        plt.tight_layout()
        
        pdf.savefig(fig)
        plt.close()
pdf.close()
parsum = pd.concat(j)
parsum.to_csv(rootname + "stream_stats.csv")

In [None]:
datapivcor = datapiv.dropna(subset=['TDS','Q'],how='any')
datapivcor = datapivcor[datapivcor['Q']>1]
datapivcor = datapivcor[datapivcor['TDS']<10000]
datapivcor = datapivcor[datapivcor['StationType'].isin(['Stream','Spring'])]
pdf = PdfPages(rootname+'Q_TDS.pdf')

for key, grp in datapivcor.groupby(['StationId']):
    lin = linregress(grp['Q'],grp['TDS'])
    if len(grp) > 3:
        if lin[2]>0.5:
            fig = plt.figure()
            plt.scatter(x=grp['Q'],y= grp['TDS'], label=key)
            plt.title(str(key) + ' ' +str(grp['StationType'].values[0]))
            plt.xlabel('Discharge (gpm)')
            plt.ylabel('TDS (mg/L)')
            plt.legend(loc='best')    
            pdf.savefig(fig)
            plt.close()
pdf.close()
    
            