In [1]:
# to import the appropriate modules
import pandas as pd
import numpy as np

# read the tab-separated constituents database into a DataFrame
df = pd.read_csv(
    'Arctic_Tidal_Constituents_Database_v2.txt',
    sep="\t",
    low_memory=False,
)

In [2]:
def arctic_subset_variable(dataset,variable_name,options,num_of_options,minimum=False,maximum=False):
    '''
    Subset the arctic dataset on a single variable and return the matching rows.

    dataset:        the full arctic dataset, can also be an already subset dataset.
    variable_name:  the name of the variable (column) to select on.
    options:        the selected option(s). For 'lon'/'lat' this is a pair of bounds;
                    for 'start'/'end' it is a single date formatted as 1990-01-01
                    (year-month-day); for any other variable it is a single value,
                    a list of values to match, or a [min, max] pair (see below).
    num_of_options: the number of values given in options. Only used when matching
                    exact values (minimum and maximum both False): 1 for a single
                    value, more than 1 for a list of values.
    minimum:        if True, options is a minimum and all rows at or above this
                    threshold are returned. Default is False.
    maximum:        if True, options is a maximum and all rows at or below this
                    threshold are returned. Default is False.
                    If both minimum and maximum are True (not supported for
                    'start'/'end'), options is a [min, max] pair and rows inside
                    that inclusive range are returned.

    Returns a new DataFrame with its index reset to 0..n-1. When the request cannot
    be honoured (unknown variable, a single lon/lat value, both flags set for a time
    variable, missing range bound, num_of_options < 1) a message is printed and the
    dataset that was passed in is returned unchanged.

    minimum and maximum are evaluated by truthiness.
    '''
    variables = ['source_id','lon','lat','cons','amp','pha','start',
                 'end','number_of_obs','missing_obs','source','instrument',
                 'site','rec_length','sampling_rate','inference','notes',
                 'data_flag','expert_flag','site_record','site_total','amp_units']

    if variable_name not in variables:
        print(variable_name,'is either spelt incorrectly or not in the dataset',
             'see below the list of variables within the dataset. Returning full dataset')
        print(variables)
        return dataset

    # --- coordinates: rows strictly between the smallest and largest bound ---
    if variable_name in ('lon', 'lat'):
        try:
            bounds = np.asarray(options, dtype='float64').ravel()
        except (TypeError, ValueError):
            # options that cannot be read as numbers are treated like a missing range
            bounds = np.empty(0)

        if bounds.size < 2:
            print('cannot select only one lon or lat. Please provide a range of longitude or latitude')
            print('Returning full dataset')
            return dataset

        coord = dataset[variable_name]
        return _rows_where(dataset, (coord > bounds.min()) & (coord < bounds.max()))

    # --- dates: a single threshold, inclusive ---
    if variable_name in ('start', 'end'):
        if minimum and maximum:
            print('cannot provide both maximum and minimum for the time variables')
            print('Returning full dataset')
            return dataset

        # parse the column of the frame that was passed in, so that the mask
        # lines up with `dataset` even when it is already a subset
        times = pd.to_datetime(dataset[variable_name]).to_numpy()
        threshold = np.datetime64(options)

        if maximum:
            return _rows_where(dataset, times <= threshold)
        if not minimum:
            print('no minimum or maximum specified, so assumed to be minimum')
        return _rows_where(dataset, times >= threshold)

    # --- every other variable ---
    column = dataset[variable_name]

    if minimum or maximum:
        # numeric thresholds are compared against float values in all three
        # threshold modes, so the modes treat the column the same way
        if np.issubdtype(np.asarray(options).dtype, np.number):
            column = np.asarray(column, dtype='float64')

        if minimum and maximum:
            bounds = np.sort(np.atleast_1d(options))
            if bounds.size < 2:
                print('please provide both a minimum and a maximum value in options')
                print('Returning full dataset')
                return dataset
            return _rows_where(dataset, (column >= bounds[0]) & (column <= bounds[-1]))

        if maximum:
            return _rows_where(dataset, column <= options)
        return _rows_where(dataset, column >= options)

    if num_of_options > 1:
        # collect row positions option by option so that the result stays grouped
        # in the order the options were given, then slice the frame once
        hits = [np.flatnonzero(np.asarray(column == choice, dtype=bool)) for choice in options]
        order = np.concatenate(hits) if hits else np.array([], dtype=int)
        return dataset.iloc[order].reset_index(drop=True)

    if num_of_options == 1:
        choice = options
        # accept a one-element list/tuple/array as well as a bare value
        if np.ndim(options) == 1 and len(options) == 1:
            choice = list(options)[0]
        return _rows_where(dataset, column == choice)

    print('please provide the number of options you are providing.'
          ' This is done to correctly subset')
    print('Returning the full dataset')
    return dataset


def _rows_where(dataset, mask):
    '''Return the rows of dataset flagged True in mask, with the index reset to 0..n-1.'''
    return dataset.loc[np.asarray(mask, dtype=bool)].reset_index(drop=True)

In [3]:
# Each call below is an independent example applied to the full table `df`;
# `md` is overwritten every time, so only the last subset remains in `md`.

# select only amp_units that are in cm
md = arctic_subset_variable(df,'amp_units','cm',1)

# subset by longitude
md = arctic_subset_variable(df,'lon',[20,40],1)

# subset by latitude
md = arctic_subset_variable(df,'lat',[70,90],1)

# select data_flag at certain value
md = arctic_subset_variable(df,'data_flag',0,1)

# select expert_flag at certain value
md = arctic_subset_variable(df,'expert_flag',0,1)

# select data with a minimum number of observations
md = arctic_subset_variable(df,'number_of_obs',100000,1,minimum=True)

# select data with a minimum and maximum number of observations
md = arctic_subset_variable(df,'number_of_obs',[100000,150000],1,minimum=True,maximum=True)

# select data from after a start date
md = arctic_subset_variable(df,'start','1980-01-01',1,minimum=True)

In [4]:
# display the subset currently held in `md` (the result of the last call in the previous cell)
md

Unnamed: 0,source_id,lon,lat,cons,amp,pha,start,end,number_of_obs,missing_obs,...,site,rec_length,sampling_rate,inference,data_flag,expert_flag,site_record,site_total,amp_units,notes
0,24,3.810,55.400,O1,3.842,154.299,2015-06-28,2020-10-06,254777.0,68.60,...,A121,2191.50,60.0,,0.0,0.0,1.0,1.0,cm,
1,24,3.810,55.400,K1,3.637,311.713,2015-06-28,2020-10-06,254777.0,68.60,...,A121,2191.50,60.0,,0.0,0.0,1.0,1.0,cm,
2,24,3.810,55.400,M2,27.119,130.965,2015-06-28,2020-10-06,254777.0,68.60,...,A121,2191.50,60.0,,0.0,0.0,1.0,1.0,cm,
3,24,3.810,55.400,S2,7.494,173.509,2015-06-28,2020-10-06,254777.0,68.60,...,A121,2191.50,60.0,,0.0,0.0,1.0,1.0,cm,
4,24,3.810,55.400,S4,0.287,335.690,2015-06-28,2020-10-06,254777.0,68.60,...,A121,2191.50,60.0,,0.0,0.0,1.0,1.0,cm,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26167,24,6.078,52.639,OO1,0.072,276.030,2014-08-04,2020-10-06,309389.0,9.08,...,Zwartsluis,2556.75,60.0,,0.0,0.0,1.0,1.0,cm,
26168,24,6.078,52.639,R2,0.214,175.632,2014-08-04,2020-10-06,309389.0,9.08,...,Zwartsluis,2556.75,60.0,,0.0,0.0,1.0,1.0,cm,
26169,24,6.078,52.639,MSF,0.612,87.474,2014-08-04,2020-10-06,309389.0,9.08,...,Zwartsluis,2556.75,60.0,,0.0,0.0,1.0,1.0,cm,
26170,24,6.078,52.639,N2,0.212,35.082,2014-08-04,2020-10-06,309389.0,9.08,...,Zwartsluis,2556.75,60.0,,0.0,0.0,1.0,1.0,cm,
