In [1]:
import pandas as pd
import numpy as np
from numpy import dtype

## DOGM Data

Utah Division of Oil Gas and Mining (DOGM) data derived from the <a href=http://linux1.ogm.utah.gov/WebStuff/wwwroot/wqdb.html>DOGM website</a>. Station information obtained from digitization of PDF maps provided by mining companies to DOGM. 

Ubuntu rootname = "/media/p/5F5B-8FCB/PROJECTS/MLSNF/Data/CHEM/"

In [2]:
#rootname = "E:/PROJECTS/MLSNF/Data/CHEM/"
rootname = "U:/GWP/Groundwater/UMSS_Manti/Data/"
#rootname = "/media/p/5F5B-8FCB/PROJECTS/MLSNF/Data/CHEM/"

In [3]:
DOGM = rootname + "DOGM/UDOGM_allResults.csv"

In [4]:
DOGM_ST = rootname + "DOGM/UDOGM_Station.txt"

Import and Prepare DOGM Results Data

In [5]:
parmatch = {"ACIDITY AS CACO3":"Acidity", "TOTAL ALKALINITY AS CACO3":"Alkalinity, total", 
            "DISSOLVED ALUMINUM":"Aluminum", "TOTAL ALUMINUM":"Aluminum", "AMMONIA AS N":"Ammonia-nitrogen as N", 
            "DISSOLVED ARSENIC":"Arsenic", "TOTAL ARSENIC":"Arsenic", "DISSOLVED BARIUM":"Barium", 
            "TOTAL BARIUM":"Barium", "TOTAL BERYLLIUM":"Beryllium", "BICARBONATE AS HCO3":"Bicarbonate", 
            "B.O.D. 5, MG/L":"Biochemical oxygen demand, standard conditions", "DISSOLVED BORON":"Boron", 
            "TOTAL BORON":"Boron", "BROMIDE":"Bromide", "DISSOLVED CADMIUM":"Cadmium", "TOTAL CADMIUM":"Cadmium", 
            "DISSOLVED CALCIUM":"Calcium", "TOTAL CALCIUM":"Calcium", "CARBONATE AS CO3":"Carbonate", 
            "C.O.D., MG/L":"Chemical oxygen demand", "CHLORIDE":"Chloride", "DISSOLVED CHROMIUM":"Chromium", 
            "TOTAL CHROMIUM":"Chromium", "CHROMIUM HEX, CR":"Chromium(VI)", "TOTAL COBALT":"Cobalt", 
            "SP. CONDUCTIVITY (FIELD)":"Conductivity", "SPECIFIC CONDUCTIVITY (LAB)":"Conductivity", 
            "DISSOLVED COPPER":"Copper", "TOTAL COPPER":"Copper", "CYANIDE":"Cyanide", "DEPTH":"Depth", 
            "DISSOLVED OXYGEN (FIELD)":"Dissolved oxygen (DO)", "FLOW":"Flow", "FLUORIDE":"Fluoride", 
            "TOTAL HARDNESS AS CACO3":"Hardness, Ca, Mg", "HYDROXIDE":"Hydroxide", "DISSOLVED IRON":"Iron", 
            "TOTAL IRON":"Iron", "TOTAL KIELDAHL NITROGEN, T.K.N.":"Kjeldahl nitrogen", "DISSOLVED LEAD":"Lead", 
            "TOTAL LEAD":"Lead", "DISSOLVED MAGNESIUM":"Magnesium", "TOTAL MAGNESIUM":"Magnesium", 
            "DISSOLVED MANGANESE":"Manganese", "TOTAL MANGANESE":"Manganese", "DISSOLVED MERCURY":"Mercury", 
            "TOTAL MERCURY":"Mercury", "DISSOLVED MOLYBDENUM":"Molybdenum", "TOTAL MOLYBDENUM":"Molybdenum", 
            "DISSOLVED NICKEL":"Nickel", "TOTAL NICKEL":"Nickel", "DISSOLVED NITRATE, NO3":"Nitrate", 
            "NITRATE AS N":"Nitrate as N", "DISSOLVED NITRITE, NO2":"Nitrite", "NITRITE AS N":"Nitrogen", 
            "NO2+NO3 AS N":"Nitrogen", "DISSOLVED ORTHO PHOSPH, OPO4":"Orthophosphate", 
            "ORTHO. PHOSPHATE":"Orthophosphate", "PH (FIELD)":"pH", "PH (LAB)":"pH, lab", 
            "TOTAL PHOSPHORUS":"Phosphorus", "DISSOLVED POTASSIUM":"Potassium", "DISSOLVED SELENIUM":"Selenium", 
            "TOTAL SELENIUM":"Selenium", "DISSOLVED SILICA, SIO2":"Silica", "DISSOLVED SILVER":"Silver", 
            "TOTAL SILVER":"Silver", "DISSOLVED SODIUM":"Sodium", "TOTAL SODIUM":"Sodium", 
            "SODIUM ADSORPTION RATIO":"Sodium adsorption ratio", "SULFATE":"Sulfate", "SULFIDE":"Sulfide", 
            "TOTAL ANIONS":"Sum of anions", "TOTAL CATIONS":"Sum of cations", 
            "AIR TEMPERATURE (FIELD)":"Temperature, air", "FIELD WATER TEMPERATURE":"Temperature, water", 
            "TOTAL DISSOLVED SOLIDS, @ 180 C":"Total dissolved solids", 
            "TOTAL SUSPENDED SOLIDS":"Total suspended solids", "TURBIDITY (FIELD)":"Turbidity", 
            "TURBIDITY (LAB)":"Turbidity", "TOTAL VANADIUM":"Vanadium", "DISSOLVED ZINC":"Zinc", "TOTAL ZINC":"Zinc"} 


In [6]:
dogtypes = {'Unnamed: 0':dtype('int8'),'MIN_DET': dtype('float32'), 'MINE_ID': dtype('int8'), 
            'ANAL_NAME': dtype('str_'), 'PARAM_ID': dtype('int8'), 'SAMPLE_ID': dtype('str_'), 'SITE_NAME': dtype('str_'), 
            'METHD': dtype('str_'), 'DATE_REC': dtype('str_'),'EQUALITY': dtype('str_'), 'TIME_ANAL': dtype('str_'), 
            'STATION_ID': dtype('str_'), 'PAR_DESC': dtype('str_'), 'PAR_ABB': dtype('str_'), 'ANAL_METHD': dtype('str_'), 
            'METH_DESC': dtype('str_'), 'SAMP_TYPE': dtype('str_'), 'LAB_ID': dtype('str_'), 'MINE_NAME': dtype('str_'), 
            'SITE_DESC': dtype('str_'), 'SITE_ID': dtype('int8'), 'VALUE': dtype('float32'), 'PERM_NO': dtype('str_'), 
            'LAB_NAME': dtype('str_'), 'UNITS': dtype('str_'), 'SAMP_DESC': dtype('str_'), 'DATE_ANAL': dtype('str_'), 
            'SITE_TYPE': dtype('str_'), 'LAB_CODE': dtype('str_'), 'SAMPLR_NAM': dtype('str_'), 'COMMENTS': dtype('str_')}

In [7]:
dogmr = pd.read_csv(DOGM, dtype=dogtypes, parse_dates=[11,14,15,19])

In [8]:
DResCols = {"ANAL_METHD":"AnalytMeth", "ANAL_NAME":"Param", "COMMENTS":"ResultComment", "DATE_ANAL":"AnalysisDate", 
            "DATETIME_SAMP":"SampleDate", "EQUALITY":"DetectCond", "LAB_CODE":"LabComments", "LAB_NAME":"LabName", 
            "METH_DESC":"MethodDescript", "METHD":"SampMeth", "MIN_DET":"MDL", "SAMP_TYPE":"SampMethName", 
            "SAMPLE_ID":"SampleId", "STATION_ID":"StationId", "UNITS":"Unit", "VALUE":"ResultValue"} 
dogmr.rename(columns=DResCols,inplace=True)

In [9]:
dogmr['Param'] = dogmr['PAR_DESC'].apply(lambda x: parmatch.get(x),1)

In [10]:
dogmr.dropna(how='any',subset=['StationId'],inplace=True)

In [11]:
dogmr.drop(['PARAM_ID','SAMPLR_NAM','SITE_ID','SITE_DESC','PAR_DESC','TIME_ANAL','DATE_RPT',
            'DATE_REC','Unnamed: 0', 'LAB_ID','MINE_ID','PAR_ABB','MINE_NAME', 'SITE_NAME', 'SAMP_DESC',
            'PERM_NO','SITE_TYPE'], inplace=True, axis=1)

In [12]:
dogmr.columns

Index([u'LabComments', u'DetectCond', u'ResultValue', u'Unit', u'MDL',
       u'AnalytMeth', u'AnalysisDate', u'Param', u'SampMethName',
       u'ResultComment', u'SampleDate', u'SampMeth', u'MethodDescript',
       u'LabName', u'SampleId', u'StationId'],
      dtype='object')

Import and Prepare DOGM Station Data

In [13]:
dogms = pd.read_csv(DOGM_ST)

In [14]:
dogms.columns
dogms['StationComment'] = dogms[['StationComment','DESCRIPT']].apply(lambda x: x[0] if 
                                                                     (x[1])==np.nan else 
                                                                     str(x[0])+';'+str(x[1]),1)
dogms.drop(['DESCRIPT'],inplace=True,axis=1)

##WQP Data

Data downloaded from the <a href=http://waterqualitydata.us/portal/>Water Quality Portal (WQP)</a> on 7/22/2015. <br>
<b>Stations:</b><br>
http://waterqualitydata.us/Station/search?sampleMedia=Water&characteristicType=Information%3BInorganics%2C+Major%2C+Metals%3BInorganics%2C+Major%2C+Non-metals%3BInorganics%2C+Minor%2C+Metals%3BInorganics%2C+Minor%2C+Non-metals%3BNot+Assigned%3BNutrient%3BPhysical%3BStable+Isotopes&bBox=-111.68%2C38.8%2C-111%2C40&mimeType=csv&zip=yes&sorted=no <br>
<b>Results:</b><br>
http://waterqualitydata.us/Result/search?sampleMedia=Water&characteristicType=Information%3BInorganics%2C+Major%2C+Metals%3BInorganics%2C+Major%2C+Non-metals%3BInorganics%2C+Minor%2C+Metals%3BInorganics%2C+Minor%2C+Non-metals%3BNot+Assigned%3BNutrient%3BPhysical%3BStable+Isotopes&bBox=-111.68%2C38.8%2C-111%2C40&mimeType=csv&zip=yes&sorted=no

##Import data

Define data location.

In [15]:
station = rootname + "WQP/Station.csv"
result = rootname +"WQP/Result.csv"

###Results Data

Map data types for data to be imported and designate columns to convert to datetimes.

In [16]:
Rdtypes = {"OrganizationIdentifier":np.str_, "OrganizationFormalName":np.str_, "ActivityIdentifier":np.str_, 
           "ActivityTypeCode":np.str_, "ActivityMediaName":np.str_, "ActivityMediaSubdivisionName":np.str_, 
           "ActivityStartDate":np.str_, "ActivityStartTime/Time":np.str_, "ActivityStartTime/TimeZoneCode":np.str_, 
           "ActivityEndDate":np.str_, "ActivityEndTime/Time":np.str_, "ActivityEndTime/TimeZoneCode":np.str_, 
           "ActivityDepthHeightMeasure/MeasureValue":np.float16, "ActivityDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ActivityDepthAltitudeReferencePointText":np.str_, "ActivityTopDepthHeightMeasure/MeasureValue":np.float16, 
           "ActivityTopDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ActivityBottomDepthHeightMeasure/MeasureValue":np.float16, 
           "ActivityBottomDepthHeightMeasure/MeasureUnitCode":np.str_, 
           "ProjectIdentifier":np.str_, "ActivityConductingOrganizationText":np.str_, 
           "MonitoringLocationIdentifier":np.str_, "ActivityCommentText":np.str_, 
           "SampleAquifer":np.str_, "HydrologicCondition":np.str_, "HydrologicEvent":np.str_, 
           "SampleCollectionMethod/MethodIdentifier":np.str_, "SampleCollectionMethod/MethodIdentifierContext":np.str_, 
           "SampleCollectionMethod/MethodName":np.str_, "SampleCollectionEquipmentName":np.str_, 
           "ResultDetectionConditionText":np.str_, "CharacteristicName":np.str_, "ResultSampleFractionText":np.str_, 
           "ResultMeasureValue":np.str_, "ResultMeasure/MeasureUnitCode":np.str_, "MeasureQualifierCode":np.str_, 
           "ResultStatusIdentifier":np.str_, "StatisticalBaseCode":np.str_, "ResultValueTypeName":np.str_, 
           "ResultWeightBasisText":np.str_, "ResultTimeBasisText":np.str_, "ResultTemperatureBasisText":np.str_, 
           "ResultParticleSizeBasisText":np.str_, "PrecisionValue":np.str_, "ResultCommentText":np.str_, 
           "USGSPCode":np.str_, "ResultDepthHeightMeasure/MeasureValue":np.float16, 
           "ResultDepthHeightMeasure/MeasureUnitCode":np.str_, "ResultDepthAltitudeReferencePointText":np.str_, 
           "SubjectTaxonomicName":np.str_, "SampleTissueAnatomyName":np.str_, 
           "ResultAnalyticalMethod/MethodIdentifier":np.str_, "ResultAnalyticalMethod/MethodIdentifierContext":np.str_, 
           "ResultAnalyticalMethod/MethodName":np.str_, "MethodDescriptionText":np.str_, "LaboratoryName":np.str_, 
           "AnalysisStartDate":np.str_, "ResultLaboratoryCommentText":np.str_, 
           "DetectionQuantitationLimitTypeName":np.str_, "DetectionQuantitationLimitMeasure/MeasureValue":np.str_, 
           "DetectionQuantitationLimitMeasure/MeasureUnitCode":np.str_, "PreparationStartDate":np.str_, 
           "ProviderName":np.str_} 

dt = [[6,7],56,61]

Import result data.

In [17]:
res = pd.read_csv(result, dtype=Rdtypes, parse_dates=dt)

In [18]:
#pd.DataFrame(list(res['CharacteristicName'].unique())).to_clipboard()

Remap column names to standards and drop unneeded columns

In [19]:
ResFieldDict = {"AnalysisStartDate":"AnalysisDate", "ResultAnalyticalMethod/MethodIdentifier":"AnalytMeth", 
                "ResultAnalyticalMethod/MethodName":"AnalytMethId", "ResultDetectionConditionText":"DetectCond", 
                "ResultLaboratoryCommentText":"LabComments", "LaboratoryName":"LabName", 
                "DetectionQuantitationLimitTypeName":"LimitType", "DetectionQuantitationLimitMeasure/MeasureValue":"MDL", 
                "DetectionQuantitationLimitMeasure/MeasureUnitCode":"MDLUnit", "MethodDescriptionText":"MethodDescript", 
                "OrganizationIdentifier":"OrgId", "OrganizationFormalName":"OrgName", "CharacteristicName":"Param", 
                "ProjectIdentifier":"ProjectId", "MeasureQualifierCode":"QualCode", "ResultCommentText":"ResultComment", 
                "ResultStatusIdentifier":"ResultStatus", "ResultMeasureValue":"ResultValue", 
                "ActivityCommentText":"SampComment", "ActivityDepthHeightMeasure/MeasureValue":"SampDepth", 
                "ActivityDepthAltitudeReferencePointText":"SampDepthRef", 
                "ActivityDepthHeightMeasure/MeasureUnitCode":"SampDepthU", "SampleCollectionEquipmentName":"SampEquip", 
                "ResultSampleFractionText":"SampFrac", "ActivityStartDate":"SampleDate", "ActivityIdentifier":"SampleId", 
                "ActivityStartTime/Time":"SampleTime", "ActivityMediaSubdivisionName":"SampMedia", 
                "SampleCollectionMethod/MethodIdentifier":"SampMeth", "SampleCollectionMethod/MethodName":"SampMethName", 
                "ActivityTypeCode":"SampType", "MonitoringLocationIdentifier":"StationId", 
                "ResultMeasure/MeasureUnitCode":"Unit", "USGSPCode":"USGSPCode",
                "ActivityStartDate_ActivityStartTime/Time":"SampleDate"} 

In [20]:
res.rename(columns=ResFieldDict,inplace=True)

In [21]:
resdroplist = ["ActivityBottomDepthHeightMeasure/MeasureUnitCode", "ActivityBottomDepthHeightMeasure/MeasureValue", 
               "ActivityConductingOrganizationText", "ActivityEndDate", "ActivityEndTime/Time", 
               "ActivityEndTime/TimeZoneCode", "ActivityMediaName", "ActivityStartTime/TimeZoneCode", 
               "ActivityTopDepthHeightMeasure/MeasureUnitCode", "ActivityTopDepthHeightMeasure/MeasureValue", 
               "HydrologicCondition", "HydrologicEvent", "PrecisionValue", "PreparationStartDate", "ProviderName", 
               "ResultAnalyticalMethod/MethodIdentifierContext", "ResultDepthAltitudeReferencePointText", 
               "ResultDepthHeightMeasure/MeasureUnitCode", "ResultDepthHeightMeasure/MeasureValue", 
               "ResultParticleSizeBasisText", "ResultTemperatureBasisText", 
               "ResultTimeBasisText", "ResultValueTypeName", "ResultWeightBasisText", "SampleAquifer", 
               "SampleCollectionMethod/MethodIdentifierContext", "SampleTissueAnatomyName", "StatisticalBaseCode", 
               "SubjectTaxonomicName"] 

In [22]:
res.drop(resdroplist,inplace=True,axis=1)

Convert string fields to values.

In [23]:
res['ResultValue'] = res['ResultValue'].convert_objects(convert_numeric=True)
res['MDL'] = res['MDL'].convert_objects(convert_numeric=True)

Remove `_WQX` suffix from `StationID`.

In [24]:
res['StationId'] = res['StationId'].str.replace('_WQX-','-')

In [25]:
#pd.DataFrame(list(res['Unit'].unique())).to_clipboard()

###Station Data

In [26]:
stat = pd.read_csv(station)

Map Fields to Standard UGS database format.

In [27]:
StatFieldDict = {"MonitoringLocationIdentifier":"StationId", "AquiferName":"Aquifer", "AquiferTypeName":"AquiferType", 
             "ConstructionDateText":"ConstDate", "CountyCode":"CountyCode", "WellDepthMeasure/MeasureValue":"Depth", 
             "WellDepthMeasure/MeasureUnitCode":"DepthUnit", "VerticalMeasure/MeasureValue":"Elev", 
             "VerticalAccuracyMeasure/MeasureValue":"ElevAcc", "VerticalAccuracyMeasure/MeasureUnitCode":"ElevAccUnit", 
             "VerticalCollectionMethodName":"ElevMeth", "VerticalCoordinateReferenceSystemDatumName":"ElevRef", 
             "VerticalMeasure/MeasureUnitCode":"ElevUnit", "FormationTypeText":"FmType", 
             "WellHoleDepthMeasure/MeasureValue":"HoleDepth", "WellHoleDepthMeasure/MeasureUnitCode":"HoleDUnit", 
             "HorizontalAccuracyMeasure/MeasureValue":"HorAcc", "HorizontalAccuracyMeasure/MeasureUnitCode":"HorAccUnit", 
             "HorizontalCollectionMethodName":"HorCollMeth", "HorizontalCoordinateReferenceSystemDatumName":"HorRef", 
             "HUCEightDigitCode":"HUC8", "LatitudeMeasure":"Lat_Y", "LongitudeMeasure":"Lon_X", 
             "OrganizationIdentifier":"OrgId", "OrganizationFormalName":"OrgName", "StateCode":"StateCode", 
             "MonitoringLocationDescriptionText":"StationComment", "MonitoringLocationName":"StationName", 
             "MonitoringLocationTypeName":"StationType"} 

In [28]:
stat.rename(columns=StatFieldDict,inplace=True)

Drop leftover fields.

In [29]:
statdroplist = ["ContributingDrainageAreaMeasure/MeasureUnitCode", "ContributingDrainageAreaMeasure/MeasureValue", 
                "DrainageAreaMeasure/MeasureUnitCode", "DrainageAreaMeasure/MeasureValue", "CountryCode", "ProviderName", 
                "SourceMapScaleNumeric"]

In [30]:
stat.drop(statdroplist,inplace=True,axis=1)

Remove `_WQX` suffix from `StationID` and take out duplicate stations.

In [31]:
stat['StationId'] = stat['StationId'].str.replace('_WQX-','-')
stat.drop_duplicates(subset=['StationId'],inplace=True)

##Combine Data

Append Results Data Together

In [32]:
results = pd.concat([res,dogmr])

In [33]:
del(res,dogmr)

In [34]:
ParAbb = {"Alkalinity":"Alk", "Alkalinity, Carbonate as CaCO3":"Alk", "Alkalinity, total":"Alk", 
          "Arsenic":"As", "Calcium":"Ca", "Chloride":"Cl", "Carbon dioxide":"CO2", "Carbonate":"CO3", 
          "Carbonate (CO3)":"CO3", "Specific conductance":"Cond", "Conductivity":"Cond", "Copper":"Cu", 
          "Depth":"Depth", "Dissolved oxygen (DO)":"DO", "Iron":"Fe", 
          "Hardness, Ca, Mg":"Hard", "Total hardness -- SDWA NPDWR":"Hard", 
          "Bicarbonate":"HCO3", "Potassium":"K", "Magnesium":"Mg", "Kjeldahl nitrogen":"N", 
          "Nitrogen, mixed forms (NH3), (NH4), organic, (NO2) and (NO3)":"N", "Nitrogen":"N", "Sodium":"Na", 
          "Sodium plus potassium":"NaK", "Ammonia-nitrogen":"NH3_N", "Ammonia-nitrogen as N":"NH3_N", "Nitrite":"NO2", 
          "Nitrate":"NO3", "Nitrate as N":"NO3_N", "pH":"pH", "pH, lab":"pH", "Phosphate-phosphorus":"PO4", 
          "Orthophosphate":"PO4", "Phosphate":"PO4", "Stream flow, instantaneous":"Q", "Flow":"Q", 
          "Flow rate, instantaneous":"Q", "Silica":"Si", "Sulfate":"SO4", "Sulfate as SO4":"SO4", 
          "Total dissolved solids":"TDS", "Temperature, water":"Temp", 
          "Total suspended solids":"TSS", "Turbidity":"Turb"}


In [35]:
results['ParAbb'] = results['Param'].apply(lambda x: ParAbb.get(x),1)

In [36]:
del results['USGSPCode']

In [37]:
#results.to_csv(rootname+"AllResults.csv", chunksize=10000)

##Stations

In [38]:
stat.set_index(['StationId'],inplace=True)
dogms.set_index(['StationId'],inplace=True)

In [39]:
station = pd.concat([stat,dogms])

In [40]:
station["index"] = station.index
station.drop_duplicates(subset='index', take_last=False, inplace=True)
del station["index"]

In [41]:
del(stat,dogms)

In [42]:
station.drop(['AquiferType','UTM_X','UTM_Y','SITE_ID','OBJECTID','HUC8','MINE_ID','ElevMeth','Elev','ElevAcc',
              'ElevRef','ElevUnit','HorAcc', 'HorAccUnit','StateCode',
              'ElevAccUnit','HorRef','CountyCode'], inplace=True,axis=1)

In [43]:
station.columns

Index([u'Aquifer', u'ConstDate', u'Depth', u'DepthUnit', u'FmType',
       u'HoleDUnit', u'HoleDepth', u'HorCollMeth', u'Lat_Y', u'Lon_X',
       u'OrgId', u'OrgName', u'StationComment', u'StationName', u'StationType',
       u'matchid'],
      dtype='object')

In [44]:
stattype = {"Stream: Ditch":"Stream", "Stream":"Stream", "Well":"Well", "Spring":"Spring", 
            "Stream: Canal":"Stream", "Subsurface: Tunnel, shaft, or mine":"Mine Drain", 
            "Well: Test hole not completed as a well":"Well", "Subsurface: Groundwater drain":"Mine Drain", 
            "Lake, Reservoir, Impoundment":"Lake", "River/Stream":"Stream", "Lake":"Lake", "Facility Other":"Other", 
            "Canal Drainage":"Stream", "Canal Irrigation":"Stream", "MD":"Mine Drain", "SW":"Stream", "GW":"Well", 
            "SP":"Spring", "UPDES Permit discharge point":"Mine Drain", "Lake; Sediment Pond; Stagnant water":"Lake", 
            "Other":"Other", "CG-2":"Other"}


In [45]:
station['StationType'] = station['StationType'].apply(lambda x: stattype.get(x),1)

In [46]:
#station.to_csv(rootname + "AllStations.csv")

#Merge Station and Results Data

In [47]:
#complete = pd.merge(results, station, left_on = "StationId", right_index=True, how="left" )

In [48]:
#complete.to_csv(rootname + "AllChemData.csv", chunksize=10000)

#Pivot Data

Drop rows from the `results` table that have null `SampleId` values or parameter abbreviations.

In [49]:
results.dropna(subset=['SampleId','ParAbb'],how='any', inplace=True)

Drop rows from the `results` table that have a detection condition (ex. "not detected").  This eliminates nondetects, which will inhereantly bias the data, as it is not considering values below the detection levels.

In [50]:
results = results[pd.isnull(results['DetectCond'])]

Drop rows from the `results` table that have duplicate `SampleId` values and parameters (chemical concentrations, field measurements). The `SampleId` field will be the index for the pivoted table.  Each row in the pivoted table will represent an individual water sample.  The `SampleId` is applied to each parameter that comes from the same water sample.  For example, if I go out to a stream and fill a water bottle and have that analyzed for 4 different parameters (i.e. calcium, magnesium, sodium, and chloride), then each result from the analysis of that water will have the same `SampleId`.  Sometimes we sample a station multiple times, so one `StationId` can have many `SampleId` values. 

In [51]:
results.drop_duplicates(subset=['SampleId','ParAbb'],inplace=True)

Select results that have more than 50 flow values to plot flow change over time.
http://stackoverflow.com/questions/17926273/how-to-count-distinct-values-in-a-column-of-a-pandas-group-by-object

In [88]:
fieldsummary = results.groupby(['StationId','Param'])['ResultValue'].nunique().reset_index()
fieldsummary = fieldsummary[fieldsummary['ResultValue']>50]
manyQs = fieldsummary[fieldsummary['Param']=='Flow']
manyDs = fieldsummary[fieldsummary['Param']=='Depth']
manyQsStats = list(manyQs['StationId'].values)
manyDsStats = list(manyDs['StationId'].values)

Summarize All Fields by date and count

In [93]:
durationsummary = results.groupby(['StationId','Param'])['SampleDate'].agg([np.min,np.max,np.size]).reset_index()
durationsummary.to_csv(rootname + "fieldsummaries.csv")

In [90]:
manyQresults = results[(results['StationId'].isin(manyQsStats)) & (results['Param'] == 'Flow')]
manyDresults = results[(results['StationId'].isin(manyDsStats)) & (results['Param'] == 'Depth')]
print("Depth Values= "+ str(len(manyDresults)))
print("Flow Values= "+ str(len(manyQresults)))
manyQresults.to_csv(rootname + "ManyQonly.csv")
manyDresults.to_csv(rootname + "ManyDonly.csv")
manyDresults

Depth Values= 7734
Flow Values= 31370


Unnamed: 0,AnalysisDate,AnalytMeth,AnalytMethId,DetectCond,LabComments,LabName,LimitType,MDL,MDLUnit,MethodDescript,...,SampFrac,SampMedia,SampMeth,SampMethName,SampType,SampleDate,SampleId,StationId,Unit,ParAbb
16634,03/17/2015,,,,SGS,,,0,,No Method,...,,,,0,,2015-03-17 12:35:00,02-0011-2015-03-17 12:35:00,UDOGM-02-0011,feet,Depth
16656,12/08/2014,,,,SGS,,,0,,No Method,...,,,,0,,2014-12-08 13:57:00,02-0011-2014-12-08 13:57:00,UDOGM-02-0011,feet,Depth
16678,09/04/2014,,,,SGS,,,0,,No Method,...,,,,0,,2014-09-04 13:40:00,02-0011-2014-09-04 13:40:00,UDOGM-02-0011,feet,Depth
16701,06/09/2014,,,,SGS,,,0,,No Method,...,,,,0,,2014-06-09 13:50:00,02-0011-2014-06-09 13:50:00,UDOGM-02-0011,feet,Depth
16724,03/18/2014,,,,SGS,,,0,,No Method,...,,,,0,,2014-03-18 13:09:00,02-0011-2014-03-18 13:09:00,UDOGM-02-0011,feet,Depth
16747,12/09/2013,,,,SGS,,,0,,No Method,...,,,,0,,2013-12-09 13:00:00,02-0011-2013-12-09 13:00:00,UDOGM-02-0011,feet,Depth
16770,09/04/2013,,,,SGS,,,0,,No Method,...,,,,0,,2013-09-04 13:30:00,02-0011-2013-09-04 13:30:00,UDOGM-02-0011,feet,Depth
16793,06/18/2013,,,,SGS,,,0,,No Method,...,,,,0,,2013-06-18 13:37:00,02-0011-2013-06-18 13:37:00,UDOGM-02-0011,feet,Depth
16816,03/13/2013,,,,SGS,,,0,,No Method,...,,,,0,,2013-03-13 08:35:00,02-0011-2013-03-13 08:35:00,UDOGM-02-0011,feet,Depth
16839,12/05/2012,,,,SGS,,,0,,No Method,...,,,,0,,2012-12-05 12:55:00,02-0011-2012-12-05 12:55:00,UDOGM-02-0011,feet,Depth


Pivot the data so that parameters are now columns.  

In [52]:
datap = results.pivot(index='SampleId', columns='ParAbb', values='ResultValue')

Drop columns from the pivot table that are pretty much empty.

In [53]:
datap.dropna(subset=['SO4','Cond','Temp','TDS','pH'],how='all',inplace=True)

The table lost the `StationId` field when it was pivoted, so now we need to add the `StationId` field back on to the table by joining it to the previous results table using the `SampleId` field.  First we parse down the results table to only the information we want to retain, then we join the tables.

In [55]:
resdrop = ['AnalysisDate', 'AnalytMeth', 'AnalytMethId',
             'DetectCond', 'LabComments', 'LabName', 'LimitType',
             'MDL', 'MDLUnit', 'MethodDescript',
             'OrgId', 'OrgName', 'Param', 'ProjectId',
             'QualCode', 'ResultComment', 'ResultStatus', 'ResultValue',
             'SampComment', 'SampDepth',
             'SampDepthRef', 'SampDepthU', 'SampEquip', 'SampFrac',
             'SampMedia', 'SampMeth', 'SampMethName', 'SampType',
             'Unit', 'ParAbb']
resPivot = results.drop(resdrop, axis=1)

In [56]:
datapiv = pd.merge(datap, resPivot, left_index=True, right_on='SampleId',how='left')

Now that we have a `StationId` field, we can add our station data, but only the data that will be useful for plotting our data.

In [57]:
pivStats = station.drop(['Aquifer', 'ConstDate', 'Depth', 'DepthUnit',
                         'HoleDUnit', 'HoleDepth', 'HorCollMeth', 
                         'OrgId', 'StationComment', 'StationName', 'matchid'], axis=1)

In [58]:
datapiv = pd.merge(datapiv, pivStats, left_on='StationId', right_index=True,how='left')

In [59]:
datapiv.drop_duplicates(inplace=True)

In [60]:
#datapiv.to_csv(rootname+"AllResultsPivot.csv", chunksize=10000)

#Create Table For Plotting

In [61]:
piperdata = datapiv.dropna(subset = ['Ca','Na','Cl','K','Mg','SO4'],how='any')

Find Relationship between Bicarbonate and Alkalinity.  Fill in missing bicarbonate values.

In [62]:
import matplotlib.pyplot as plt
from pylab import rcParams
from scipy.stats import linregress
%matplotlib inline
rcParams['figure.figsize'] = 15, 10

In [63]:
piv = piperdata.ix[:,['Alk','HCO3']]
piv = piv[(piv.Alk < 5000)&(piv.HCO3 < 5000)]
piv = piv[(piv.Alk > 0)&(piv.HCO3 > 0)]
piv.dropna(inplace=True)
lin = linregress(piv.Alk.values,piv.HCO3.values)
print lin
piperdata.ix[:,"HCO3"] = piperdata.apply(lambda x: x['Alk']*lin[0]+lin[1] if np.isnan(x['HCO3']) else x['HCO3'],1)

(1.0941152486856753, 20.027751188998934, 0.97224865250018067, 0.0, 0.0025819900883626017)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [64]:
piperdata = piperdata.drop(['Alk','As','CO2','Cu','DO','NaK','PO4'], axis=1)

In [65]:
piperdata.dropna(subset=['Lat_Y','HCO3'], how='any', inplace=True)

In [66]:
StatFreq = piperdata.groupby('StationId')['StationId'].agg([np.count_nonzero]).reset_index()
piperdata = pd.merge(piperdata, StatFreq, on='StationId', how='left')

In [None]:
piperdata.to_csv(rootname+"PiperData.csv")

In [67]:
#statlump = df.groupby('MonitoringLocationIdentifier').agg([np.mean, np.min, np.max, np.std, np.size, np.median]).reset_index()
statlump = datapiv.groupby('StationId').agg([np.median]).reset_index()
statlump.dropna(how='any',subset=[('HCO3','median'),('Ca','median')],inplace=True)

In [68]:
datapiv.groupby('FmType')['Temp'].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()

Unnamed: 0,FmType,amin,mean,median,amax,std,size
0,Blackhawk Formation of Mesaverde Group,0.1,7.824107,7.15,17.5,2.953048,231
1,Blue Gate Shale Member of Mancos Shale,12.5,13.75,13.25,16.0,1.554563,4
2,Castlegate Sandstone of Mesaverde Group,1.7,7.778824,7.05,20.5,3.41395,173
3,Emery Sandstone Member of Mancos Shale,13.5,13.5,13.5,13.5,,1
4,Ferron Sandstone Member of Mancos Shale,8.5,13.891892,13.0,26.5,4.265715,44
5,Flagstaff Limestone (Eocene-Paleocene),1.0,6.796552,6.0,33.0,4.303021,119
6,Green River Formation,,,,,,1
7,Holocene Alluvium,0.1,7.773333,6.55,16.5,3.502209,30
8,Mancos Shale,8.5,11.5375,10.5,21.0,3.956527,8
9,Masuk Member of Mancos Shale,9.0,13.5,13.5,20.0,3.817254,8


In [69]:
datapiv.groupby('FmType')['Cond'].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()

Unnamed: 0,FmType,amin,mean,median,amax,std,size
0,Blackhawk Formation of Mesaverde Group,51.0,585.581081,528.0,1700,255.185835,231
1,Blue Gate Shale Member of Mancos Shale,8000.0,16000.0,9000.0,31000,13000.0,4
2,Castlegate Sandstone of Mesaverde Group,118.0,414.888235,326.5,1290,274.811338,173
3,Emery Sandstone Member of Mancos Shale,1700.0,1700.0,1700.0,1700,,1
4,Ferron Sandstone Member of Mancos Shale,900.0,2492.307692,1700.0,9500,1913.968338,44
5,Flagstaff Limestone (Eocene-Paleocene),220.0,511.247863,400.0,4400,556.724982,119
6,Green River Formation,820.0,820.0,820.0,820,,1
7,Holocene Alluvium,130.0,655.633333,580.0,1850,336.01298,30
8,Mancos Shale,465.0,803.75,540.0,2710,771.217729,8
9,Masuk Member of Mancos Shale,290.0,488.125,457.5,725,181.086749,8


In [None]:
datapiv.groupby('FmType')['Cond'].agg([np.min, np.mean, np.median, np.max, np.std, np.size]).reset_index()

In [None]:

datapiv.columns

In [None]:
import seaborn as sns
sns.set_style("whitegrid",{'ytick.major.size': '1.0','xtick.major.size': '1.0','axes.grid': False})
fms = datapiv[['Cond','FmType']]
fms.dropna(inplace=True)
fms = fms[fms['Cond'] >= 100.0]
fms = fms[fms['Cond'] < 15000.0]
fms = fms[~fms['FmType'].isin(['Green River Formation', 'Rock Springs Formation of Mesaverde Group', 'Holocene Alluvium', 
 'Masuk Member of Mancos Shale', 'Mancos Shale', 'Tununk Shale Member of Mancos Shale', 
 'Blue Gate Shale Member of Mancos Shale', 'Emery Sandstone Member of Mancos Shale', 
 'Pleistocene Series', 'Paleozoic Erathem','Ferron Sandstone Member of Mancos Shale'])]

#fms.reset_index(inplace=True)
#fms.set_index('FmType',inplace=True)
#fms.drop(['index'],inplace=True,axis=1)
fig = plt.figure()
sns.violinplot(vals=fms['Cond'], groupby=fms['FmType'],vert=False, inner='box', bw=.5)
fig.savefig(rootname+"violin.svg")

In [None]:
from matplotlib.backends.backend_pdf import PdfPages
datapivcor = datapiv.dropna(subset=['TDS','Q'],how='any')
datapivcor = datapivcor[datapivcor['Q']>1]
for key, grp in datapivcor.groupby(['StationId']):
    with PdfPages(rootname+'multipage_pdf.pdf') as pdf:
        if len(grp) > 3:
            plt.figure()
            plt.scatter(x=grp['Q'],y= grp['TDS'], label=key)
            plt.title(key, grp['StationType'])
            plt.xlabel('Discharge (cfs)')
            plt.ylabel('Total Dissolved Solids (mg/L)')
            plt.legend(loc='best')    
            plt.show()
            pdf.savefig()
            plt.close
            

In [None]:
?sns.violinplot