# Demo: File Sorting and Data Extraction

### The following code demonstrates how to sort through a set of log files, select only files of interest, and extract the lat/long coordinate pairs from each file. 

In [None]:
# Import the required packages.

import os, glob

import pandas as pd
import numpy as np

### This example assumes that all files are stored in the same directory/folder, and that the files of interest are all of the same file type. In this case, that file type is .csv. The file-search step will work for any other file type: simply change the search criteria from "*.csv" to the pattern for the appropriate file type. Note that the later steps (the file-name filter and pd.read_csv) are written for .csv files and would need to be adapted as well.

In [None]:
#Set the path to the directory/folder where the files are stored.
#Note: This example uses Mac path syntax but the same steps will work for Windows.

path="/Users/username/directory"

#Filter files to .csv files by building a "<path>/*.csv" search pattern.

search_criteria = "*.csv"
q = os.path.join(path, search_criteria)

#glob.glob returns matches in arbitrary order, so sort them.
#This keeps the file order (and therefore the row order of the
#final output) the same on every run and every machine.

csv_files = sorted(glob.glob(q))

#To check your work, print the results.
#This will return a list of only .csv files.

print(csv_files)

### In this example, the .csv files containing lat/long coordinates all have a file name that ends in a series of numbers. 
### To filter the list to only these files, import the re package. More information on how to filter and search with the re package can be found here: (https://www.w3schools.com/python/python_regex.asp)


In [None]:
#Filter to .csv files containing lat/lon coords.
#These files are named 'logfilename.csv'
#All logfilenames end with a number.
#Use re to search for these files
#and add the useful files to a new list.

import re

#Match a digit immediately before the '.csv' extension.
#The '$' anchors the match to the very end of the path; without it,
#a digit followed by '.csv' anywhere in the path (for example in a
#folder name) would wrongly mark every file in that folder as useful.

log_file_pattern = re.compile(r'[0-9][.]csv$')

useful_files = [file for file in csv_files if log_file_pattern.search(file)]

# Check your work.

print(useful_files)

#Check how many useful files there are.
#This is also the number of lat/long coordinates.

len(useful_files)

### Now that the files containing the lat/long coordinates are grouped in a single list, the next step is to extract a single coordinate pair from each file. 
### In this example, each .csv file contains a list of coordinate pairs. We would like to calculate the "average" coordinate pair so that we only extract *one* coordinate pair from each file. 
### This can be done by defining a function that calculates the average of the lats and longs in each file and returns a single pair.

In [None]:
# Load the first useful file and preview its first few rows
# to see how the data is laid out and what the columns are called.

first_useful_file = useful_files[0]
test = pd.read_csv(first_useful_file)
test.head()

### The columns containing the latitude and longitude values are named 'Lat [deg]' and 'Lon [deg]'. Use these column names to specify which values to average in the function, as shown below.

In [None]:
#Test: Define a function to read file,
#average the lat and lon coordinates,
#and return the average coords as a pair.

def coord_pair(f, lat_col='Lat [deg]', lon_col='Lon [deg]'):
    """Return the average latitude/longitude pair stored in one CSV file.

    Args:
        f: Path of the CSV file to read (passed straight to ``pd.read_csv``).
        lat_col: Name of the column holding the latitude values.
            Defaults to ``'Lat [deg]'``.
        lon_col: Name of the column holding the longitude values.
            Defaults to ``'Lon [deg]'``.

    Returns:
        A ``(avg_lat, avg_lon)`` tuple containing the arithmetic mean of
        each of the two columns.

    Raises:
        KeyError: If ``lat_col`` or ``lon_col`` is not a column of the file.
    """
    data = pd.read_csv(f)
    # Plain column means: each file collapses to a single coordinate pair.
    avg_lat = data[lat_col].mean()
    avg_lon = data[lon_col].mean()
    return (avg_lat, avg_lon)

# Try the function on the first useful file before applying it to all files.
coord_pair(useful_files[0])

In [None]:
#Run coord_pair on every useful file and
#collect the resulting lat/lon pairs in a list,
#in the same order as the files.

lat_lon_list = [coord_pair(log_file) for log_file in useful_files]
    

In [None]:
#Check list of lat/lon coords.
#Each entry is the pair returned by coord_pair for one useful file.

print(lat_lon_list)

In [None]:
#Check length of coordinate list
#to make sure it matches the number of useful files
#(one coordinate pair was added per file).

len(lat_lon_list)

In [None]:
#Build a two-column DataFrame from the coordinate list
#(one row per coordinate pair) and display it.

output_columns = ['Lat[deg]', 'Lon[deg]']
df = pd.DataFrame(data=lat_lon_list, columns=output_columns)
print(df)

In [None]:
# Convert the DataFrame to a .csv
# The file name is relative, so the file is written to the current working
# directory; index=False leaves the DataFrame's row index out of the file.
df.to_csv('CN_posts.csv', index=False)

### That's it! You've done it!