In [1]:
import sqlite3
import pandas as pd
import numpy as np
from toolz import frequencies, valfilter

In [2]:
def create_connection(db_file):
    """Open a connection to the SQLite database stored in ``db_file``.

    :param db_file: database file (any value accepted by ``sqlite3.connect``)
    :return: Connection object, or None when the connection could not be opened
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as e:
        # The handler must name ``sqlite3.Error``: a bare ``Error`` is not
        # defined in this notebook, so a failed connect would surface as a
        # NameError instead of the documented ``None`` return.
        print(e)
        return None

In [3]:
def select_geolocation(conn):
    """
    Load every row of the geolocation table into a DataFrame.

    Rows are fetched ordered by ``epa_srcid``. The first 45 values of each row
    are mapped by position onto the column names listed below. Line breaks
    inside the ``Src_Locati`` value are replaced by ", " so the text stays on
    a single line.

    :param conn: the Connection object
    :return: DataFrame with one row per geolocation record (empty, but with
             all 45 columns, when the table has no rows)
    """
    columns = ['EPA_srcID','PWS_ID','lon','lat','Own_Loc_Co','Src_Type','Src_Name',
               'Src_Avl','Src_Locati','Locn_Dtrm','WELL_I_RAT','WSWCLASS','RWQ_RATE',
               'INTK_LOC','Yield_Gpm','DEG_CONF','Depth','RECHRATE','AQRATE','Pws_Type',
               'Rtl_Nbr_Cn','Rtl_Pop','System_Nam','Descript_1','Loc_Addr1','Loc_City',
               'Loc_State','Loc_Zip1','County','Owner','Ownr_Addr1','Ownr_Addr2','Ownr_City',
               'Ownr_Zip1','Ownr_Zip2','Prdtn_Capt','Prdtn_Ca_1','Prdtn_Ca_2','Prdtn_Ca_3',
               'PWSTypeDes','PWSTypeNam','Begin_Date','Sys_County','DEQ_Reg_Na','Ownr_State']
    src_locati_pos = columns.index('Src_Locati')

    cur = conn.cursor()
    cur.execute("SELECT * FROM geolocation ORDER BY epa_srcid")

    records = []
    for row in cur.fetchall():
        # SELECT * may return more columns than are named here; only the
        # leading ones are used, exactly as the positional lookup did before.
        record = list(row[:len(columns)])
        record[src_locati_pos] = ", ".join(str(record[src_locati_pos]).splitlines())
        records.append(record)

    # Build the frame in a single call. DataFrame.append copied the whole
    # frame for every row (quadratic) and no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=columns)

In [4]:
# Open the Round One database and load the geolocation table into `df`.
# The path is relative, so it resolves against the notebook's working directory.
db_file = '../RoundOneDB/RoundOneDB_final.db'
conn = create_connection(db_file)
df = select_geolocation(conn)
# Preview only the first rows rather than the whole frame.
df.head()

Unnamed: 0,EPA_srcID,PWS_ID,lon,lat,Own_Loc_Co,Src_Type,Src_Name,Src_Avl,Src_Locati,Locn_Dtrm,...,Prdtn_Capt,Prdtn_Ca_1,Prdtn_Ca_2,Prdtn_Ca_3,PWSTypeDes,PWSTypeNam,Begin_Date,Sys_County,DEQ_Reg_Na,Ownr_State
0,11245,100103,-82.207111,35.763762,W01,G,WELL #1,P,SWEETWATER RD- ON RIGHT JUST PAST BRIDGE,E,...,57572.0,162000.0,0.0,0.0,Serves 15+ connections or regularly serves 25+...,Community,1977-01-01,YANCEY,ASHEVILLE REGIONAL OFFICE,NC
1,11246,100103,-82.207991,35.764855,W03,G,WELL #3,P,SWEETWATER RD ON LEFT,E,...,57572.0,162000.0,0.0,0.0,Serves 15+ connections or regularly serves 25+...,Community,1986-07-01,YANCEY,ASHEVILLE REGIONAL OFFICE,NC
2,11247,100103,-82.204079,35.767117,W3A,G,WELL #3A,P,RIGHT SIDE OF S TOE RIVER RD GOING TO S\D,E,...,57572.0,162000.0,0.0,0.0,Serves 15+ connections or regularly serves 25+...,Community,1994-10-01,YANCEY,ASHEVILLE REGIONAL OFFICE,NC
3,11248,100103,-82.194494,35.767939,W04,G,WELL #4,P,END OF FOREST SERVICE RD ON RT,E,...,57572.0,162000.0,0.0,0.0,Serves 15+ connections or regularly serves 25+...,Community,1994-10-01,YANCEY,ASHEVILLE REGIONAL OFFICE,NC
4,11251,100105,-82.349286,35.867846,W01,G,WELL #1,P,MTN AIR DRIVE,E,...,30000.0,25180.0,0.0,45000.0,Serves 15+ connections or regularly serves 25+...,Community,1995-04-01,YANCEY,ASHEVILLE REGIONAL OFFICE,NC


In [5]:
# Write the current `df` to CSV with a header row and without the index column.
outfile = '../RoundOneDB/RoundOneDB-Geolocation.csv'
df.to_csv(outfile, header=True, index=False)

In [6]:
def select_quant_summary(conn):
    """
    Load quant_summary rows joined with their geolocation record.

    Each quant_summary row is inner-joined to geolocation on EPA_srcID and the
    result is ordered by ``q.epa_srcid``. Query columns are mapped by position
    onto the 20 column names listed below.

    :param conn: the Connection object
    :return: DataFrame with one row per joined quant_summary record (empty,
             but with all 20 columns, when nothing matches)
    """
    # NOTE(review): the query returns g.lat before g.lon while the labels put
    # 'lon' before 'lat', so the 'lon' column is filled from g.lat. The sample
    # outputs in this notebook suggest the database columns themselves hold
    # swapped values and that this ordering compensates for it -- confirm
    # against the schema before changing either side.
    columns = ['epa_srcid','station_id','lon','lat','analyte_name','abbreviation','cas',
               'conc_ppt','class','rl_ppt','source_file','trip_no','station_name',
               'sample_date','lab','src_type', 'src_name', 'well_i_rat', 'intk_loc', 'depth']

    cur = conn.cursor()
    cur.execute("""SELECT q.epa_srcid,q.station_id,g.lat,g.lon,q.analyte_name,q.abbreviation,
                   q.cas,q.conc_ppt,q.class,q.rl_ppt,q.source_file,q.trip_no,q.station_name,
                   q.sample_date,q.lab, g.src_type, g.src_name, g.well_i_rat, g.intk_loc, g.depth
                   FROM quant_summary q 
                   INNER JOIN geolocation g ON g.EPA_srcID = q.EPA_srcID
                   ORDER BY q.epa_srcid""")

    # Build the frame in a single call. DataFrame.append copied the whole
    # frame for every row (quadratic) and no longer exists in pandas >= 2.0.
    records = [tuple(row) for row in cur.fetchall()]
    return pd.DataFrame(records, columns=columns)

In [7]:
# Rebind `df` to the joined quant_summary data, reusing the `conn` opened in an earlier cell.
df = select_quant_summary(conn)
df.head()

Unnamed: 0,epa_srcid,station_id,lon,lat,analyte_name,abbreviation,cas,conc_ppt,class,rl_ppt,source_file,trip_no,station_name,sample_date,lab,src_type,src_name,well_i_rat,intk_loc,depth
0,11606,NC0103010,-81.134111,36.512317,Perfluoro-2-ethoxypropanoic acid,PEPA,267239-61-2,,Perfluoroalkylether Acids (PFEAs),1,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"SPARTA, TOWN OF",2019-07-02,Knappe,G,WELL #9,H,,329
1,11606,NC0103010,-81.134111,36.512317,"1,1,2,2-tetrafluoro-2-(1,2,2,2-tetrafluoro-eth...",NVHOS,,,Perfluoroalkylether Acids (PFEAs),5,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"SPARTA, TOWN OF",2019-07-02,Knappe,G,WELL #9,H,,329
2,11606,NC0103010,-81.134111,36.512317,N-(3-dimethylaminopropan-1-yl)perfluoro-1-hexa...,N-AP-FHxSA,50598-28-2,,Zwitterions,10,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"SPARTA, TOWN OF",2019-07-02,Knappe,G,WELL #9,H,,329
3,11606,NC0103010,-81.134111,36.512317,Perfluorononanesulfonic acid,PFNS,68259-12-1,,Perfluoroalkylsulfonic Acids (PFSAs),2,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"SPARTA, TOWN OF",2019-07-02,Knappe,G,WELL #9,H,,329
4,11606,NC0103010,-81.134111,36.512317,Perfluorooctanoic acid,PFOA,335-67-1,,Perfluoroalkylcarboxylic acids (PFCAs),2,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"SPARTA, TOWN OF",2019-07-02,Knappe,G,WELL #9,H,,329


In [8]:
# Write the current `df` to CSV with a header row and without the index column.
outfile = '../RoundOneDB/RoundOneDB-QuantSummary.csv'
df.to_csv(outfile, header=True, index=False)

In [9]:
def count_station_ids(conn):
    """
    Print the sources that have exactly one (station_id, sample_date) pair.

    Every distinct epa_srcid in quant_summary is visited in ascending order;
    for each one the distinct (station_id, sample_date) combinations are
    looked up, and when there is exactly one, a line
    "epa_srcid station_id sample_date" is printed.

    :param conn: the Connection object
    :return: None (results are printed)
    """
    cursor = conn.cursor()
    cursor.execute("""SELECT DISTINCT epa_srcid FROM quant_summary
                   ORDER BY epa_srcid""")
    source_ids = [record[0] for record in cursor.fetchall()]

    for source_id in source_ids:
        cursor.execute("""SELECT DISTINCT station_id, sample_date FROM quant_summary
                       WHERE epa_srcid = ? ORDER BY station_id""",
                       (source_id,))
        samples = cursor.fetchall()

        # Sources with zero or several distinct pairs are not reported.
        if len(samples) != 1:
            continue

        only_sample = samples[0]
        print(source_id, only_sample[0], only_sample[1])

In [10]:
# Print one line per epa_srcid that has a single distinct (station_id, sample_date) pair.
count_station_ids(conn)

11606 NC0103010 2019-07-02
11736 NC0105010 2019-07-02
11747 NC0105015-2 2019-07-02
11748 NC0105020 2019-10-09
11886 NC0106015 2019-07-03
11894 NC0106025 2019-07-03
12111 NC0111020 2019-06-24
12369 NC0111484 2019-07-30
14068 NC0122010 2019-07-26
15144 NC0138105 2019-07-25
17170 NC0156025 2019-06-24
17612 NC0158015-1 2019-07-10
17615 NC0158015-2 2019-10-09
17616 NC0158020 2019-07-11
18600 NC0161010 2019-10-09
18601 NC0161015 2019-10-09
18648 NC0175015 2019-08-12
18765 NC0180050 2019-06-03
19326 NC0181020 2019-06-24
19637 NC0187010 2019-07-25
19829 NC0188010 2019-08-12
20088 NC0190010 2019-08-24
20609 NC0195118 2019-07-03
20844 NC0197050 2019-04-17
21433 NC0217015 2019-06-11
22844 NC0276025 2019-07-17
23726 NC0285015 2019-07-18
23730 NC0285020 2019-07-11
24697 NC0299020 2019-04-16
24989 NC0309015 2019-08-29
24990 NC0309020 2019-10-16
24993 NC0309025 2019-08-29
24996 NC0309030 2019-08-29
25001 NC0309050 2019-05-29
25006 NC0309055 2019-08-29
25008 NC0309060 2019-08-29
25354 NC0326040 2019-0

39325 NC0433010 2019-06-25
39326 NC0442010-2 2019-09-17
39327 NC0442010-1 2019-09-17
39328 NC0442020 2019-09-17
39329 NC0442025 2019-06-25
39330 NC0464010 2019-06-25
39333 NC0465010-1 2019-05-29
39334 NC0465010 2019-07-16
39338 NC0496010 2019-08-20
39343 NC0498010-2 2019-08-06
39344 NC0498010-1 2019-08-06
65263 NC0286020-2 2019-04-17
65281 NC0407025 2019-04-24
66223 NC0472025 2019-08-20
66799 NC0180055 2019-06-03
67146 NC0446010 2019-08-20
68214 NC0149010-1 2019-04-16
69783 NC0230015-3 2019-07-11
70125 NC0113025 2019-08-12
70211 NC6054001 2019-08-06
70482 NC0347010 2019-08-22
71288 NC0465232 2019-07-16
71348 NC0410130 2019-05-29
71661 NC0425010 2019-08-06
71958 NC7024013 2019-11-05
71962 NC0431025 2019-07-16
71963 NC6059015-1 2019-08-06
71995 NC3076010 2019-08-24
72318 NC0106010 2019-07-03
72434 NC0106020 2019-07-03
72708 NC0448020 2019-08-20
72833 NC0467010 2019-05-07
72946 NC0378030 2019-08-22
72995 NC0138010-4 2019-07-25
73099 NC1081024 2019-07-10
73132 NC5009012 2019-08-22
73153 NC

In [11]:
def getStations(conn):
    """
    Load the distinct sampled stations together with their coordinates.

    Distinct (station_id, station_name, epa_srcid, sample_date, lon, lat)
    combinations are read from quant_summary inner-joined to geolocation on
    EPA_srcID, ordered by station_id.

    :param conn: the Connection object
    :return: DataFrame with columns station_id, station_name, epa_srcid,
             sample_date, lon, lat (empty, but with those columns, when
             nothing matches)
    """
    # NOTE(review): here 'lon'/'lat' are filled from g.lon/g.lat as named,
    # whereas select_quant_summary fills 'lon' from g.lat and 'lat' from g.lon.
    # The sample output of this function in the notebook shows 'lon' values
    # near 35-36 and 'lat' values near -82, so the labels look swapped
    # relative to the data -- confirm against the schema.
    columns = ['station_id', 'station_name', 'epa_srcid', 'sample_date', 'lon', 'lat']

    cur = conn.cursor()
    cur.execute("""select distinct q.station_id, q.station_name, q.epa_srcid, q.sample_date, g.lon, g.lat 
                   from quant_summary q INNER JOIN geolocation g ON g.EPA_srcID = q.EPA_srcID 
                   order by station_id""")

    # Build the frame in a single call. DataFrame.append copied the whole
    # frame for every row (quadratic) and no longer exists in pandas >= 2.0.
    records = [tuple(row) for row in cur.fetchall()]
    return pd.DataFrame(records, columns=columns)

In [12]:
# Open the database again and load the distinct stations into `df`.
# `conn` is rebound here; none of the cells shown closes the connection it held before.
db_file = '../RoundOneDB/RoundOneDB_final.db'
conn = create_connection(db_file)
df = getStations(conn)
df.head()

Unnamed: 0,station_id,station_name,epa_srcid,sample_date,lon,lat
0,NC0100010-1,"BURNSVILLE, TOWN OF",39013,2019-07-31,35.900561,-82.315046
1,NC0100010-2,"BURNSVILLE, TOWN OF",39012,2019-10-09,35.872044,-82.280285
2,NC0103010,"SPARTA, TOWN OF",11606,2019-07-02,36.512317,-81.134111
3,NC0105010,"WEST JEFFERSON, TOWN OF",11736,2019-07-02,36.408568,-81.496376
4,NC0105015-1,"JEFFERSON, TOWN OF - 1",39024,2019-07-02,36.411667,-81.404243


In [13]:
# List the epa_srcid values that occur more than once in `df`:
# `frequencies` counts each value and `valfilter` keeps the entries whose count exceeds 1.
a = list(df['epa_srcid'])
list(valfilter(lambda count: count > 1, frequencies(a)).keys())

[39113, 39114, 39256, 39336]

In [14]:
# Rebind `df` to the pipe-delimited station names file (replaces the frame built from the database).
df = pd.read_csv('../RoundOneDB/stationNames.csv', sep='|')
df.head()

Unnamed: 0,epa_srcid,station_id,station_name,src_name,src_locati,src_type,src_avl,depth,lat,lon
0,39013,NC0100010-1,"BURNSVILLE, TOWN OF",CANE RIVER,2 MILES UP CANE RIVER RD,S,P,,-82.315046,35.900561
1,39012,NC0100010-2,"BURNSVILLE, TOWN OF",BOLENS CREEK,HWY 19 TO 197N SR 1198,S,P,,-82.280285,35.872044
2,11606,NC0103010,"SPARTA, TOWN OF",WELL #9,MITCHELL MOUNTAIN RD,G,P,329.0,-81.134111,36.512317
3,11736,NC0105010,"WEST JEFFERSON, TOWN OF",WELL #3A-WEST SEVENTH ST,WEST 7TH ST OUTSIDE OF FENCE,G,P,500.0,-81.496376,36.408568
4,39024,NC0105015-1,"JEFFERSON, TOWN OF - 1",SO FORK NEW RIVER,DON WALTERS RD PAST JEFFERSON,S,P,,-81.404243,36.411667


In [15]:
# Collect the src_type column as a plain list; the bare `src` expression displays every element.
src = list(df['src_type'])
src

['S',
 'S',
 'G',
 'G',
 'S',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'G',
 'G',
 'S',
 'S',
 'G',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'G',
 'G',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'G',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'G',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'S',
 'G',
 'S',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'G',
 'S',
 'S',
 'S',
 'S'

In [17]:
# Largest depth value, ignoring NaN entries (np.nanmax skips them).
depth = list(df['depth'])
np.nanmax(depth)

1000.0

In [18]:
# Smallest depth value, ignoring NaN entries; relies on `depth` assigned in an earlier cell.
np.nanmin(depth)

40.0

In [3]:
def select_quant_summary2(conn):
    """
    Load quant_summary rows that have a concentration, joined with geolocation.

    Same query as select_quant_summary, restricted to rows whose
    ``q.conc_ppt`` is not NULL. Each row is inner-joined to geolocation on
    EPA_srcID, ordered by ``q.epa_srcid``, and mapped by position onto the
    20 column names listed below.

    :param conn: the Connection object
    :return: DataFrame with one row per joined quant_summary record that has
             a non-NULL conc_ppt (empty, but with all 20 columns, when
             nothing matches)
    """
    # NOTE(review): the query returns g.lat before g.lon while the labels put
    # 'lon' before 'lat', so the 'lon' column is filled from g.lat. The sample
    # outputs in this notebook suggest the database columns themselves hold
    # swapped values and that this ordering compensates for it -- confirm
    # against the schema before changing either side.
    columns = ['epa_srcid','station_id','lon','lat','analyte_name','abbreviation','cas',
               'conc_ppt','class','rl_ppt','source_file','trip_no','station_name',
               'sample_date','lab','src_type', 'src_name', 'well_i_rat', 'intk_loc', 'depth']

    cur = conn.cursor()
    cur.execute("""SELECT q.epa_srcid,q.station_id,g.lat,g.lon,q.analyte_name,q.abbreviation,
                   q.cas,q.conc_ppt,q.class,q.rl_ppt,q.source_file,q.trip_no,q.station_name,
                   q.sample_date,q.lab, g.src_type, g.src_name, g.well_i_rat, g.intk_loc, g.depth
                   FROM quant_summary q 
                   INNER JOIN geolocation g ON g.EPA_srcID = q.EPA_srcID
                   WHERE q.conc_ppt NOT NULL
                   ORDER BY q.epa_srcid""")

    # Build the frame in a single call. DataFrame.append copied the whole
    # frame for every row (quadratic) and no longer exists in pandas >= 2.0.
    records = [tuple(row) for row in cur.fetchall()]
    return pd.DataFrame(records, columns=columns)

In [4]:
# Open the database and load only the quant_summary rows that have a non-NULL conc_ppt.
db_file = '../RoundOneDB/RoundOneDB_final.db'
conn = create_connection(db_file)
df = select_quant_summary2(conn)
df.head()

Unnamed: 0,epa_srcid,station_id,lon,lat,analyte_name,abbreviation,cas,conc_ppt,class,rl_ppt,source_file,trip_no,station_name,sample_date,lab,src_type,src_name,well_i_rat,intk_loc,depth
0,11736,NC0105010,-81.496376,36.408568,Perfluorobutanesulfonic acid,PFBS,375-73-5,3.399,Perfluoroalkylsulfonic Acids (PFSAs),1.0,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"WEST JEFFERSON, TOWN OF",2019-07-02,Knappe,G,WELL #3A-WEST SEVENTH ST,H,,500
1,11736,NC0105010,-81.496376,36.408568,Perfluorohexanesulfonic acid,PFHxS,355-46-4,4.438,Perfluoroalkylsulfonic Acids (PFSAs),2.0,E:/TraceFinderData/Projects/PFAST/Reports/Trip...,17,"WEST JEFFERSON, TOWN OF",2019-07-02,Knappe,G,WELL #3A-WEST SEVENTH ST,H,,500
2,12111,NC0111020,-82.334676,35.62829,Perfluorooctanoic acid,PFOA,335-67-1,2.2,Perfluoroalkylcarboxylic acids (PFCAs),1.0,C:/Users/asj31/Desktop/PFaster/Trip24.db,24,"BLACK MOUNTAIN, TOWN OF",2019-06-24,Ferguson,G,WELL #1,M,,505
3,12111,NC0111020,-82.334676,35.62829,Perfluoroundecanoic acid,PFUnDA,2058-94-8,1.3,Perfluoroalkylcarboxylic acids (PFCAs),1.0,C:/Users/asj31/Desktop/PFaster/Trip24.db,24,"BLACK MOUNTAIN, TOWN OF",2019-06-24,Ferguson,G,WELL #1,M,,505
4,17170,NC0156025,-82.157259,35.641169,Perfluorooctanoic acid,PFOA,335-67-1,1.8,Perfluoroalkylcarboxylic acids (PFCAs),1.0,C:/Users/asj31/Desktop/PFaster/Trip24.db,24,"OLD FORT, TOWN OF",2019-06-24,Ferguson,G,WELL #10,M,,305


In [5]:
# Write the current `df` to CSV with a header row and without the index column.
outfile = '../RoundOneDB/RoundOneDB-QuantSummary-NN.csv'
df.to_csv(outfile, header=True, index=False)

In [8]:
# Distinct src_type values present in the current `df`, returned sorted by np.unique.
np.unique(np.array(df['src_type']))

array(['G', 'S'], dtype=object)