<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#scan_zip()" data-toc-modified-id="scan_zip()-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>scan_zip()</a></span></li><li><span><a href="#parse_kml(kml)" data-toc-modified-id="parse_kml(kml)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>parse_kml(kml)</a></span></li><li><span><a href="#habitat()" data-toc-modified-id="habitat()-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>habitat()</a></span></li><li><span><a href="#geolocators(lat,-long,-df)" data-toc-modified-id="geolocators(lat,-long,-df)-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>geolocators(lat, long, df)</a></span></li><li><span><a href="#reverse-geocoding" data-toc-modified-id="reverse-geocoding-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>reverse geocoding</a></span></li><li><span><a href="#parse_file()" data-toc-modified-id="parse_file()-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>parse_file()</a></span></li><li><span><a href="#Database-insertion" data-toc-modified-id="Database-insertion-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>Database insertion</a></span></li></ul></div>

In [1]:
from zipfile import ZipFile  
import zipfile
import sys  #sys.exit to stop program when conditions not met
import os  
import pandas as pd  
import numpy as np
import requests  #Used for sunrise sunset api
import json  
from bs4 import BeautifulSoup as Soup  
import geopandas as gpd  
from shapely.geometry import Point  
from shapely import wkt  
from geopy.geocoders import Nominatim 
import shutil  
import pymongo
#cd C:\Program Files\MongoDB\Server\5.0\bin
#mongod
#mongo

# scan_zip()

In [2]:
#Scan the working directory for the single survey zip file
#Check that the zip holds only .wav recordings and exactly one .kml file
#If anything else is found, processing is stopped with sys.exit
def scan_zip(destination='C:\\Users\\matta\\OneDrive\\Documents\\Python\\Bat Database\\LT_storage'):
    """Find and validate the survey zip in the working directory, then extract its kml.

    Parameters
    ----------
    destination : str, optional
        Long-term storage folder that is checked for an earlier upload with the
        same zip file name. Defaults to the folder this notebook has always used.
        If the folder (or the file in it) does not exist, the zip is treated as
        not yet uploaded.

    Returns
    -------
    str
        Archive-relative path of the extracted .kml file, or the sentinel
        string 'xxx' when the zip was uploaded before and the user chooses
        not to replace it.

    Raises
    ------
    SystemExit
        When there is not exactly one valid zip in the working directory, or
        the zip does not contain at least one .wav, exactly one .kml and
        nothing else.
    """
    #Collect every zip (a list, so that more than one can actually be detected)
    zips = [name for name in os.listdir() if name.lower().endswith('.zip')]
    if len(zips) < 1:
        sys.exit('No zip file found, not able to process.')
    if len(zips) > 1:
        sys.exit('More than one zip file found, not able to process.')
    zip_name = zips[0]
    if not zipfile.is_zipfile(zip_name):
        sys.exit('zip file corrupted.')
    print('One zip file found, will continue to process.')

    #Classify archive members; pure directory entries (names ending in '/') are not files
    with zipfile.ZipFile(zip_name, 'r') as archive:
        members = [member for member in archive.namelist() if not member.endswith('/')]
    wav_files = []
    kml_files = []
    other_files = []
    for member in members:
        lowered = member.lower()
        if lowered.endswith('.wav'):
            wav_files.append(member)
        elif lowered.endswith('.kml'):
            kml_files.append(member)
        else:
            other_files.append(member)
    wav_count = len(wav_files)
    kml_count = len(kml_files)

    if wav_count < 1:
        sys.exit('Error, no .wav files found')
    if kml_count > 1:
        sys.exit('Error, too many .kml files found, should only be 1.')
    if kml_count < 1:
        sys.exit('Error, no .kml file found, should be 1.')
    if other_files:
        sys.exit('Error, unexpected files found in zip: ' + ', '.join(other_files))
    print('1 .kml file found,', 'and', wav_count, '.wav files found. Able to process files.')
    fn = kml_files[0]  #kml path inside the archive

    #Ask before replacing a zip that is already in long term storage.
    #This happens before extraction so that declining leaves nothing behind.
    stored_copy = os.path.join(destination, zip_name)
    if os.path.isfile(stored_copy):
        print('File already uploaded to database.')
        while True:
            try:
                opt = int(input('Do you want to continue processing to replace existing file? 1 for yes, 2 for no and exit'))
            except ValueError:
                print('Invalid input, select either 1 for yes to replace or 2 for no and exit.')
                continue
            if opt == 1:
                print('Original file in database will be replaced.')
                os.remove(stored_copy)
                break
            if opt == 2:
                print('Upload will not continue.')
                return('xxx')  #sentinel checked by process_file()
            print('Invalid input, select either 1 for yes to replace or 2 for no and exit.')

    #Unzip only the kml; the recordings stay inside the archive
    with zipfile.ZipFile(zip_name, 'r') as archive:
        archive.extract(fn)
    return(fn)

# parse_kml(kml)

In [3]:
#Parsing data in kml to place in dataframe or json for insertion into database
def parse_kml(kml, destination='C:\\Users\\matta\\OneDrive\\Documents\\Python\\Bat Database\\LT_storage'):
    """Summarise one survey's kml into a one-row dataframe and archive the zip.

    Parameters
    ----------
    kml : str
        Path of the extracted kml file (as returned by scan_zip()).
    destination : str, optional
        Folder the raw zip is moved to for long term storage.

    Returns
    -------
    pandas.DataFrame
        One row (index label 'id') with a detection count per bat id, mean
        latitude/longitude/elevation, total detections, first/last detection
        time and duration (HH:MM:SS strings), survey date, sunrise/sunset
        times from the ipgeolocation astronomy API, detector name, and the
        path of the archived zip.

    Raises
    ------
    ValueError
        When the kml has no detection placemarks, none of them match a known
        icon id, or a file name carries an unparsable date/time.
    requests.HTTPError
        When the sunrise/sunset API answers with an error status.
    FileNotFoundError
        When no zip file is left in the working directory to archive.

    Side effects: deletes the extracted kml (and its folder), performs an HTTP
    request, and moves the zip out of the working directory.
    """
    with open(kml, 'r') as f:  #closed again even if parsing fails
        s = Soup(f, 'xml')  #Creating beautiful soup object
    placemark = s.find_all("Placemark")  #parsing different sections of kml
    #last placemark in KML is a list of all locations logged in session so it is left out
    detections = placemark[:-1]
    if not detections:
        raise ValueError('No detection placemarks found in ' + str(kml))
    names = [p.find('name').text for p in detections]  #file name of each recording
    coordinatesbats = [p.find("coordinates").text for p in detections]  #coordinates for each wav file
    #extracting device used: second field of the first placemark's ExtendedData text
    detector = placemark[0].find('ExtendedData').text.replace('\n', ' ').strip().split('  ')[1].strip()

    #one row per detection
    df = pd.DataFrame({'names': names, 'coordinates': coordinatesbats})
    df[['id', 'date', 'time']] = df['names'].str.split('_', expand=True)  #bat id, date and time of detection
    df[['long', 'lat', 'elevation']] = df['coordinates'].str.split(',', expand=True)  #kml order is long, lat, elevation
    df = df.drop(columns = 'coordinates')  #drop coordinates column now that it has been parsed out

    #Ids that appear in the kml icon file names ('<prefix>_<ID>.<ext>')
    bats = s.find_all('Icon')
    species = []
    for icon in bats[:-1]:  #NOTE(review): the last Icon is skipped, presumably it belongs to the path placemark - confirm
        href = icon.find('href').text
        species.append(href.split('_')[1].split('.')[0])
    df = df[df['id'].isin(species)]  #making sure all id wav files found in possible bat id file
    if df.empty:
        raise ValueError('No detections with a recognised bat id found in ' + str(kml))

    #Aggregated one-row table. The row label is set explicitly because the
    #name pandas gives value_counts() differs between versions ('id' vs 'count').
    counts = df['id'].value_counts()
    table = pd.DataFrame([counts.to_numpy()], columns=list(counts.index), index=['id'])
    #averaging out all lat, long, elev as a generalized description of location
    table['latitude'] = df['lat'].astype(float).mean()
    table['longitude'] = df['long'].astype(float).mean()
    table['elevation'] = df['elevation'].astype(float).mean()
    table['total_detections'] = int(counts.sum())  #total detections of bats, sum of all detections

    #Real timestamps; malformed names raise here instead of failing later on strings
    stamps = pd.to_datetime(df['date'] + df['time'], format='%Y%m%d%H%M%S')
    first_seen = stamps.min()
    last_seen = stamps.max()
    table['1st_detection_time'] = first_seen.strftime('%H:%M:%S')  #time of first recording
    table['last_detection_time'] = last_seen.strftime('%H:%M:%S')  #time of last recording
    #difference in time between first and last recordings, formatted as HH:MM:SS
    total_seconds = int((last_seen - first_seen).total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    table['recording_duration'] = f'{hours:02d}:{minutes:02d}:{seconds:02d}'
    survey_date = df['date'].iloc[0]  #positional: label 0 may have been filtered out above
    table['date'] = survey_date

    #Sunrise/sunset for the survey location and date
    lat = float(table['latitude'].iloc[0])
    long = float(table['longitude'].iloc[0])
    date = pd.to_datetime(survey_date, format='%Y%m%d').strftime('%Y-%m-%d')
    #SECURITY: the fallback key below is committed to the notebook; rotate it and
    #supply the new one through the IPGEOLOCATION_API_KEY environment variable.
    key = os.environ.get('IPGEOLOCATION_API_KEY', 'b1f36a90a5b0448888404882991efde3')
    response = requests.get('https://api.ipgeolocation.io/astronomy',
                            params={'apiKey': key, 'lat': lat, 'long': long, 'date': date},
                            timeout=30)
    response.raise_for_status()
    a = response.json()
    table['sunrise_time'] = a['sunrise']
    table['sunset_time'] = a['sunset']
    table['detector'] = detector

    os.remove(kml)  #Deleting unzipped kml
    kml_dir = os.path.dirname(kml)
    if kml_dir:  #a kml stored at the zip root has no folder to delete
        os.rmdir(kml_dir)  #deleting unzipped kml directory

    #Path of raw data zipfile (scan_zip() has already ensured there is only one)
    zip_paths = [os.path.abspath(name) for name in os.listdir() if name.lower().endswith('.zip')]
    if not zip_paths:
        raise FileNotFoundError('No zip file found in the working directory to archive.')
    source = zip_paths[-1]
    dest = shutil.move(source, destination)  #moving file to long term storage
    table['path'] = dest  #saving path of raw data in long term storage
    return(table)  #returns summary 1 row dataframe of data collected in survey

# habitat()

In [4]:
#Shared prompt loop for the habitat questions
def _ask_choice(options, prompt, invalid_message, print_prompt=False):
    """Ask until the user types one of the integer keys of `options`; return its label.

    options         -- dict mapping the accepted integer answers to their labels
    prompt          -- question shown to the user
    invalid_message -- printed after any answer that is not an accepted integer
    print_prompt    -- when True the question is printed on its own line and the
                       answer is read with an empty input() prompt
    """
    while True:
        if print_prompt:
            print(prompt)
            answer = input()
        else:
            answer = input(prompt)
        try:
            choice = int(answer)
        except ValueError:  #only non-numeric input; Ctrl-C / kernel interrupt still stops the loop
            print(invalid_message)
            continue
        if choice in options:
            print(options[choice])
            return options[choice]
        print(invalid_message)


#Asks user basic habitat questions that will be entered into the dataframe
def habitat():
    """Interactively collect wetland status, development level and vegetation type.

    Returns
    -------
    pandas.DataFrame
        One row (index label 'id') with the columns 'Wetland_status',
        'Development' and 'Vegetation' holding the chosen labels.
    """
    #Wetland
    wetland_opt = {0 : 'Upland', 1 : 'Wetland'}
    wetland = _ask_choice(
        wetland_opt,
        'Did the area surveyed contain wetland?: 1 = yes, 0 = no',
        'Invalid input, select either 1 for wetland present or 0 for no wetlands')

    #Development
    dvlp_opt = {3 : 'Urban', 2 : 'Suburban', 1 : 'Rural', 0 : 'Undeveloped'}
    development = _ask_choice(
        dvlp_opt,
        'How developed was the area surveyed?: 3 = Urban, 2 = Suburban, 1 = Rural, 0 = Undeveloped',
        'Invalid input: select 3 = Urban, 2 = Suburban, 1 = Rural, 0 = Undeveloped')

    #Vegetation
    veg_opt = {1 : 'desert scrub', 2 : 'grassland', 3 : 'shrubland', 4 : 'open woodland', 5 : 'forest',
               6 : 'urban/suburban park or greenspace', 7 : 'predominantly buildings/cement', 8 : 'neighborhood'}
    #One menu text for both the question and the error message so they cannot drift apart
    veg_menu = ('1 : desert scrub, 2 : grassland, 3 : shrubland, 4 : open woodland, 5 : forest, '
                '6 : urban/suburban park or greenspace, 7 : predominantly buildings/cement, 8 : neighborhood')
    vegetation = _ask_choice(
        veg_opt,
        'What type of vegetation/habitat is present?: ' + veg_menu,
        'Invalid input: select ' + veg_menu,
        print_prompt=True)

    #Dataframe of output
    hdf = pd.DataFrame([[wetland, development, vegetation]],
                       index=['id'],
                       columns=['Wetland_status', 'Development', 'Vegetation'])
    return(hdf)

# geolocators(lat, long, df)

In [5]:
#EPA North American Ecoregions Level 3
#Read the CSV, turn the WKT text in 'geometry' into shapely objects, wrap as a GeoDataFrame
NA3 = gpd.GeoDataFrame(
    pd.read_csv('ecoregions3.csv').assign(
        geometry=lambda frame: frame['geometry'].map(wkt.loads)
    )
)

#Look up the EPA North American ecoregion keys (levels 1, 2 and 3) for one coordinate
#Takes a single lat/long pair
def get_eco(lat, long):
  """Return [NA_L1KEY, NA_L2KEY, NA_L3KEY] of the first NA3 polygon containing the point.

  The point is built as Point(long, lat). Returns None when no polygon in NA3
  contains it.
  """
  here = Point(long, lat)
  shapes = NA3["geometry"]
  row = 0
  while row < len(NA3):
    if here.within(shapes[row]):
      return [NA3[key][row] for key in ('NA_L1KEY', 'NA_L2KEY', 'NA_L3KEY')]
    row += 1
  return None

#Apply get_eco to every row of a dataframe.
def get_ecoregions(lat, long, df):  #df is dataframe function will be used on
  """Add 'Level_1', 'Level_2' and 'Level_3' (EPA North American keys) to df in place.

  lat, long -- sequences or Series of coordinates, paired with the rows of df by position
  df        -- dataframe that receives the three columns

  Raises whatever the column assignment raises when a coordinate falls outside
  every polygon (get_eco returns None for it); geolocators() relies on that.
  """
  #Positional values: lat[i] on a Series is a label lookup and fails (or only
  #works through a deprecated fallback) unless the index is exactly 0..n-1.
  lat_values = list(lat)
  long_values = list(long)
  ecos = [get_eco(lat_values[i], long_values[i]) for i in range(len(df))]
  df[['Level_1', 'Level_2', 'Level_3']] = ecos

########## 
#EPA United States Ecoregions Level 4
#Read the CSV, turn the WKT text in 'geometry' into shapely objects, wrap as a GeoDataFrame
US4 = gpd.GeoDataFrame(
    pd.read_csv('ecoregions4.csv').assign(
        geometry=lambda frame: frame['geometry'].map(wkt.loads)
    )
)

#Look up the EPA United States ecoregion keys (levels 1 to 4) for one coordinate
#Takes a single lat/long pair
def get_eco4(lat, long):
  """Return [L1_KEY, L2_KEY, L3_KEY, L4_KEY] of the first US4 polygon containing the point.

  The point is built as Point(long, lat). Returns None when no polygon in US4
  contains it.
  """
  spot = Point(long, lat)
  for row in range(len(US4)):
    if not spot.within(US4["geometry"][row]):
      continue
    return [US4[column][row] for column in ('L1_KEY', 'L2_KEY', 'L3_KEY', 'L4_KEY')]
  return None

#Apply get_eco4 to every row of a dataframe.
def get_ecoregions4(lat, long, df):  #df is dataframe function will be used on
  """Add 'Level_1' .. 'Level_4' (EPA United States keys) to df in place.

  lat, long -- sequences or Series of coordinates, paired with the rows of df by position
  df        -- dataframe that receives the four columns

  Raises whatever the column assignment raises when a coordinate falls outside
  every polygon (get_eco4 returns None for it); geolocators() relies on that.
  """
  #Positional values: lat[i] on a Series is a label lookup and fails (or only
  #works through a deprecated fallback) unless the index is exactly 0..n-1.
  lat_values = list(lat)
  long_values = list(long)
  ecos = [get_eco4(lat_values[i], long_values[i]) for i in range(len(df))]
  #Same order as get_eco4 returns: [L1_KEY, L2_KEY, L3_KEY, L4_KEY].
  #(The labels used to be listed in reverse, which stored L1_KEY under 'Level_4'.)
  df[['Level_1', 'Level_2', 'Level_3', 'Level_4']] = ecos

###########
#WWF ecoregions
#Read the CSV, turn the WKT text in 'geometry' into shapely objects, wrap as a GeoDataFrame
wwf = gpd.GeoDataFrame(
    pd.read_csv('wwfecos.csv').assign(
        geometry=lambda frame: frame['geometry'].map(wkt.loads)
    )
)

#Look up the WWF ecoregion name for one coordinate
#Takes a single lat/long pair
def get_wwfeco(lat, long):
  """Return [ECO_NAME] of the first wwf polygon containing the point.

  The point is built as Point(long, lat). Returns None when no polygon in wwf
  contains it.
  """
  spot = Point(long, lat)
  #Lazy scan: stops at the first containing polygon, like an early return in a loop
  matches = (row for row in range(len(wwf)) if spot.within(wwf["geometry"][row]))
  first = next(matches, None)
  if first is None:
    return None
  return [wwf['ECO_NAME'][first]]

#Apply get_wwfeco to every row of a dataframe.
def get_wwfecoregions(lat, long, df):  #df is dataframe function will be used on
  """Add the 'WWF_ecoregion' column to df in place.

  lat, long -- sequences or Series of coordinates, paired with the rows of df by position
  df        -- dataframe that receives the column
  """
  #Positional values: lat[i] on a Series is a label lookup and fails (or only
  #works through a deprecated fallback) unless the index is exactly 0..n-1.
  lat_values = list(lat)
  long_values = list(long)
  ecos = [get_wwfeco(lat_values[i], long_values[i]) for i in range(len(df))]
  df[['WWF_ecoregion']] = ecos

###################
#koppen_geiger regions
#Read the CSV, turn the WKT text in 'geometry' into shapely objects, wrap as a GeoDataFrame
kg = gpd.GeoDataFrame(
    pd.read_csv('koppen_geiger.csv').assign(
        geometry=lambda frame: frame['geometry'].map(wkt.loads)
    )
)

#Look up the koppen_geiger climate value for one coordinate
#Takes a single lat/long pair
def get_clim(lat, long):
  """Return [climates_f] of the first kg polygon containing the point.

  The point is built as Point(long, lat). Returns None when no polygon in kg
  contains it.
  """
  spot = Point(long, lat)
  total = len(kg)
  row = 0
  while row < total:
    region = kg["geometry"][row]
    if spot.within(region):
      return [kg['climates_f'][row]]
    row += 1
  return None

#Apply get_clim to every row of a dataframe.
def get_koppen_greiger(lat, long, df):  #df is dataframe function will be used on
  """Add the 'koppen_geiger' column to df in place.

  lat, long -- sequences or Series of coordinates, paired with the rows of df by position
  df        -- dataframe that receives the column
  """
  #Positional values: lat[i] on a Series is a label lookup and fails (or only
  #works through a deprecated fallback) unless the index is exactly 0..n-1.
  lat_values = list(lat)
  long_values = list(long)
  ecos = [get_clim(lat_values[i], long_values[i]) for i in range(len(df))]
  df[['koppen_geiger']] = ecos

In [6]:
#combining all geolocators into one function
def geolocators(lat, long, df):
    """Add climate and ecoregion columns to df in place.

    Koppen-Geiger and WWF lookups must succeed. The two EPA lookups are
    best-effort because their polygons only cover the USA (level 4) and North
    America (levels 1-3); a failure there is reported and skipped.

    lat, long -- coordinates, paired with the rows of df
    df        -- dataframe that receives the columns
    """
    get_koppen_greiger(lat, long, df)
    get_wwfecoregions(lat, long, df)
    #Exception (not a bare except) so that Ctrl-C / kernel interrupt and sys.exit still get through
    try:
        get_ecoregions4(lat, long, df)
    except Exception as err:
        #lat, long outside of USA will produce errors
        print('EPA US level 4 ecoregions not added:', err)
    try:
        get_ecoregions(lat, long, df)
    except Exception as err:
        #lat, long outside of North America will produce errors
        print('EPA North American ecoregions not added:', err)

Need username and password sign-in shell <br>
Need drag and drop file uploader for zip data <br>
Need to insert user name from sign-in into dataframe <br>
Need to insert survey site name into dataframe <br>
Need function to insert metadata into database with path to raw data zip file

# reverse geocoding

In [7]:
def location(df):
    """Reverse-geocode the first row of df and add the address parts as columns.

    Reads df.latitude and df.longitude of the first row, queries Nominatim,
    and writes every key of the returned address as a column of df (in place).
    NOTE(review): an address key that matches an existing column name would
    overwrite that column - confirm this cannot happen with the survey columns.

    Returns None. Nothing is added when the service finds no address.
    """
    geolocator = Nominatim(user_agent="geoapi")
    #.iloc: the summary row is labelled 'id', so [0] would be a label lookup
    coords = str(df.latitude.iloc[0]) + ', ' + str(df.longitude.iloc[0])
    place = geolocator.reverse(coords)
    if place is None:
        print('Reverse geocoding returned no result; no address columns added.')
        return
    for k, v in place.raw.get('address', {}).items():
        df[k] = v

# parse_file()

In [8]:
#Run the whole pipeline: validate zip, parse kml, geolocate, ask habitat questions
def process_file():
    """Process the survey zip in the working directory into one summary row.

    Exits via sys.exit when scan_zip() reports (with 'xxx') that the file was
    already uploaded and should not be replaced. Otherwise returns the parsed
    survey row joined column-wise with the habitat answers.
    """
    kml_path = scan_zip()
    if kml_path == 'xxx':
        sys.exit('File already uploaded')
    survey = parse_kml(kml_path)
    geolocators(survey.latitude, survey.longitude, survey)  #adds columns in place
    location(survey)  #adds columns in place
    habitat_answers = habitat()
    return pd.concat([survey, habitat_answers], axis = 1, join = 'outer')

In [9]:
#Run the full pipeline on the zip in the working directory (asks the habitat questions interactively)
df = process_file()

One zip file found, will continue to process.
1 .kml file found, and 139 .wav files found. Able to process files.
Did the area surveyed contain wetland?: 1 = yes, 0 = no0
Upland
How developed was the area surveyed?: 3 = Urban, 2 = Suburban, 1 = Rural, 0 = Undeveloped1
Rural
What type of vegetation/habitat is present?: 1 : desert scrub, 2 : grassland, 3 : shrubland, 4 : open woodland, 5 : forest, 6 : urban/suburban park or greenspace,7 : predominantly buildings/cement, 8 : neighborhood
1
desert scrub


In [10]:
#Display the survey summary returned by process_file()
df

Unnamed: 0,NoID,DICALB,MOLRUF,CYNMEX,MOLMOL,MOLSIN,MYOCAL,SACBIL,EUMPER,PARHES,...,village,municipality,state,ISO3166-2-lvl4,postcode,country,country_code,Wetland_status,Development,Vegetation
id,92,14,13,4,4,3,3,2,1,1,...,Tulum,Solidaridad,Quintana Roo,MX-ROO,77774,México,mx,Upland,Rural,desert scrub


# Database insertion

In [18]:
#Client for the MongoDB server at localhost:27017
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

#Database handle used for the survey records
mydb = myclient["batdatabase"]

In [19]:
#Collection that holds one document per inserted survey
mycol = mydb["batsurveys"]

In [20]:
#Convert the summary dataframe to a list with one dict per row (column name -> value)
data = df.to_dict("records")

In [21]:
#Display the records that will be inserted
data

[{'NoID': 92,
  'DICALB': 14,
  'MOLRUF': 13,
  'CYNMEX': 4,
  'MOLMOL': 4,
  'MOLSIN': 3,
  'MYOCAL': 3,
  'SACBIL': 2,
  'EUMPER': 1,
  'PARHES': 1,
  'TADBRA': 1,
  'latitude': 20.254680195652174,
  'longitude': -87.40469659782609,
  'elevation': -4.688760717742896,
  'total_detections': 138,
  '1st_detection_time': '21:10:55',
  'last_detection_time': '21:42:56',
  'recording_duration': '00:32:01',
  'date': '20220629',
  'sunrise_time': '06:12',
  'sunset_time': '19:33',
  'detector': 'Echo Meter Touch 2 Standard Android',
  'path': 'C:\\Users\\matta\\OneDrive\\Documents\\Python\\Bat Database\\LT_storage\\Session_20220629_211054.zip',
  'koppen_geiger': 'AW',
  'WWF_ecoregion': 'Yucatán moist forests',
  'Level_1': '15  TROPICAL WET FORESTS',
  'Level_2': '15.2  PLAIN AND HILLS OF THE YUCATAN PENINSULA',
  'Level_3': '15.2.2  Plain with Medium and High Semi-Evergreen Tropical Forest',
  'leisure': 'Dreams Tulum Resort & spa',
  'road': 'Avenida Tulúm',
  'village': 'Tulum',
  'mun

In [22]:
#Insert the survey as a single document with the records list under "data"
#NOTE(review): no _id is supplied, so re-running this cell presumably inserts a duplicate document - confirm
mycol.insert_one({"index" : "bat_survey","data" : data})

<pymongo.results.InsertOneResult at 0x22c41f8ace0>

In [23]:
#Display the collection handle (shows client, database and collection name)
mycol

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'batdatabase'), 'batsurveys')

In [24]:
#Fetch one document from the collection to check the insert
mycol.find_one()

{'_id': ObjectId('636449ced762aae0a80f61e7'),
 'index': 'bat_survey',
 'data': [{'NoID': 92,
   'DICALB': 14,
   'MOLRUF': 13,
   'CYNMEX': 4,
   'MOLMOL': 4,
   'MOLSIN': 3,
   'MYOCAL': 3,
   'SACBIL': 2,
   'EUMPER': 1,
   'PARHES': 1,
   'TADBRA': 1,
   'latitude': 20.254680195652174,
   'longitude': -87.40469659782609,
   'elevation': -4.688760717742896,
   'total_detections': 138,
   '1st_detection_time': '21:10:55',
   'last_detection_time': '21:42:56',
   'recording_duration': '00:32:01',
   'date': '20220629',
   'sunrise_time': '06:12',
   'sunset_time': '19:33',
   'detector': 'Echo Meter Touch 2 Standard Android',
   'path': 'C:\\Users\\matta\\OneDrive\\Documents\\Python\\Bat Database\\LT_storage\\Session_20220629_211054.zip',
   'koppen_geiger': 'AW',
   'WWF_ecoregion': 'Yucatán moist forests',
   'Level_1': '15  TROPICAL WET FORESTS',
   'Level_2': '15.2  PLAIN AND HILLS OF THE YUCATAN PENINSULA',
   'Level_3': '15.2.2  Plain with Medium and High Semi-Evergreen Tropical 

In [25]:
#Print every document currently stored in the collection
for i in mycol.find():
    print(i)

{'_id': ObjectId('636449ced762aae0a80f61e7'), 'index': 'bat_survey', 'data': [{'NoID': 92, 'DICALB': 14, 'MOLRUF': 13, 'CYNMEX': 4, 'MOLMOL': 4, 'MOLSIN': 3, 'MYOCAL': 3, 'SACBIL': 2, 'EUMPER': 1, 'PARHES': 1, 'TADBRA': 1, 'latitude': 20.254680195652174, 'longitude': -87.40469659782609, 'elevation': -4.688760717742896, 'total_detections': 138, '1st_detection_time': '21:10:55', 'last_detection_time': '21:42:56', 'recording_duration': '00:32:01', 'date': '20220629', 'sunrise_time': '06:12', 'sunset_time': '19:33', 'detector': 'Echo Meter Touch 2 Standard Android', 'path': 'C:\\Users\\matta\\OneDrive\\Documents\\Python\\Bat Database\\LT_storage\\Session_20220629_211054.zip', 'koppen_geiger': 'AW', 'WWF_ecoregion': 'Yucatán moist forests', 'Level_1': '15  TROPICAL WET FORESTS', 'Level_2': '15.2  PLAIN AND HILLS OF THE YUCATAN PENINSULA', 'Level_3': '15.2.2  Plain with Medium and High Semi-Evergreen Tropical Forest', 'leisure': 'Dreams Tulum Resort & spa', 'road': 'Avenida Tulúm', 'village'