In [30]:
import pandas as pd
import os

---

# Test variables: demo input and output paths

In [81]:
# Every demo path below is built from the current working directory.
my_cwd = os.getcwd()

# Directory tree of data files handed to RelevantData as `data_directory`.
data_directory = os.path.join(my_cwd, "DemoDataFolder")

# Directory holding the demo inputs; the demo output csv is written here too.
reference_directory = os.path.join(my_cwd, "DemoDataIdentifyRelevantFiles")
output_path = os.path.join(reference_directory, "DemoOutput.csv")
coordinate_boundaries_filepath = os.path.join(reference_directory, "summary.csv")
new_coordinates = os.path.join(reference_directory, "TestingCoordinates")

# Function: find stored data files whose bounds intersect new coordinates

In [None]:
def _coordinate_bounds(csv_path):
    '''
    Read a coordinate csv and return its bounding box.

    Column headers are stripped of surrounding whitespace before lookup, so a
    header written as 'Coord_X ' is treated the same as 'Coord_X'.

    Args:
        csv_path (string): path to a csv with Coord_X and Coord_Y columns

    Returns:
        tuple: (x_min, x_max, y_min, y_max) taken over all rows of the file

    Raises:
        KeyError: if the file has no Coord_X or Coord_Y column after stripping
    '''
    coordinates = pd.read_csv(csv_path).rename(columns=str.strip)
    x_values, y_values = coordinates['Coord_X'], coordinates['Coord_Y']
    return x_values.min(), x_values.max(), y_values.min(), y_values.max()


def RelevantData(new_coordinates,data_directory,coordinate_boundaries_filepath,output_path):
    '''
    Given a file of coordinates, go through the file with coordinate bounds, and identify
    data files whose bounds intersect the bounding box of those coordinates.

    Args:
        new_coordinates (string): path to the csv containing the coordinates (Coord_X / Coord_Y columns)
        data_directory (string): the filepath for the directory containing the data; searched recursively for .csv files
        coordinate_boundaries_filepath (string): the filepath for the csv containing coordinate boundaries
            (file_name, x_min, x_max, y_min, y_max) of all the data we have collected
        output_path (string): the full filepath to save the output csv

    Returns:
        pandas.DataFrame: one row per matching data file, with its file_name, file_path and bounds.
        The same dataframe is also saved as a csv at output_path. It is empty (but keeps its
        columns) when nothing matches.
    '''
    bound_columns = ['x_min', 'x_max', 'y_min', 'y_max']
    merge_keys = ['file_name'] + bound_columns

    # load in the data
    established_boundaries = pd.read_csv(coordinate_boundaries_filepath)

    # converting the new x and y info into coordinate bounds
    x_min, x_max, y_min, y_max = _coordinate_bounds(new_coordinates)

    # returning only established data values that intersect with this new data:
    # a row is dropped when its box lies entirely to one side of the new box
    intersecting_bounds = established_boundaries[~(
        (established_boundaries['x_min'] > x_max)
        | (established_boundaries['x_max'] < x_min)
        | (established_boundaries['y_min'] > y_max)
        | (established_boundaries['y_max'] < y_min)
    )]

    # the inner merge below can only keep files named in the intersecting rows,
    # so any other csv is skipped without being opened
    candidate_names = set(intersecting_bounds['file_name'])

    # storing filepath and bounds information
    file_info = []
    for root, _, files in os.walk(data_directory):
        full_root_path = os.path.abspath(root)
        for file in files:
            if not file.endswith('.csv') or file not in candidate_names:
                continue

            file_path = os.path.join(full_root_path, file)
            file_x_min, file_x_max, file_y_min, file_y_max = _coordinate_bounds(file_path)

            file_info.append({
                'file_name': file,
                'file_path': file_path,
                'x_min': file_x_min,
                'x_max': file_x_max,
                'y_min': file_y_min,
                'y_max': file_y_max
            })

    # explicit columns keep the merge keys present even when no file was found
    metadata = pd.DataFrame(file_info, columns=['file_name', 'file_path'] + bound_columns)
    if metadata.empty:
        # an empty frame has object columns; align the key dtypes so merge accepts them
        metadata = metadata.astype({key: intersecting_bounds[key].dtype for key in merge_keys})

    # comparing all files with data we want; the bounds are part of the key, so a file on
    # disk only matches when its recomputed bounds equal the recorded ones exactly
    relevant_files = metadata.merge(intersecting_bounds, on=merge_keys, how="inner")
    relevant_files.to_csv(output_path, index=False)

    return relevant_files

In [80]:
RelevantData(
    new_coordinates=new_coordinates,
    data_directory=data_directory,
    coordinate_boundaries_filepath=coordinate_boundaries_filepath,
    output_path=output_path,
)

Unnamed: 0,file_name,file_path,x_min,x_max,y_min,y_max
0,coordinates_9.csv,c:\Users\dhahm\OneDrive\Desktop\MomOneDrive\On...,-93.168003,89.583659,-91.495306,98.545727
1,coordinates_6.csv,c:\Users\dhahm\OneDrive\Desktop\MomOneDrive\On...,-96.69592,98.795379,-98.423933,95.85775
