In [2]:
import sys
import os

# Put the parent of the current working directory on the import path so the
# `src` package can be imported from this notebook. The path is only added
# when it is missing, so re-running the cell does not create duplicates.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

import src.utils.features_penguins as ftpen

In [3]:
PENGUIN_DATA = '../data/penguin/'

# Build the cleaned tracking frame in a single pipeline:
#   1. load everything under PENGUIN_DATA into one frame,
#   2. resolve the date formatting issues,
#   3. keep one row per (track_id, datetime),
#   4. order the rows by track and time and renumber the index,
#   5. drop breed_status, which is redundant.
df = (
    ftpen.date_formatting(ftpen.load_penguins(PENGUIN_DATA))
    .drop_duplicates(subset=['track_id', 'datetime'])
    .sort_values(['track_id', 'datetime'])
    .reset_index(drop=True)
    .drop(columns='breed_status')
)

### Feature Engineering - adding penguin features

In [None]:
# Geodesic distance to colony (WGS-84 ellipsoid) comes first.
df = ftpen.geodesic_distance(df)

# Mean / standard deviation of the distance between consecutive points.
# Evaluated right after the geodesic step, before the remaining features.
mean_distance, std_distance = ftpen.mean_distance(df)

# Remaining per-row features. Each helper takes the frame and returns the
# updated frame, and they are applied strictly in the order listed here.
for add_feature in (
    ftpen.calculate_relative_position,   # position of the penguin relative to its colony
    ftpen.direction_of_travel,           # heading relative to the previous point
    ftpen.distance_since_last_measure,   # distance travelled since the last measure
    ftpen.distance_direction,            # distance in N/S/W/E direction since the last measure
    ftpen.time_difference,               # time elapsed between measures
):
    df = add_feature(df)


In [None]:
# Average the values per day and track_id -- this is the FINAL PENGUIN DATA.
df_penguins_final = ftpen.average_per_day(df)

# Directory that receives the exported tables.
SAVE_PATH = '../data/penguin/'

# Optional: also export the full, non-averaged frame
# (original note: "save df to csv and store in zip file").
#ftpen.save_df(df, SAVE_PATH, 'penguins_cleaned.csv')

# Export the daily averages.
ftpen.save_df(df_penguins_final, SAVE_PATH, 'penguins_final.csv')

# Optional: uncomment the two lines below to save the JSON file with all dates
# for which penguin data exists. It is needed for the API call that fetches
# the ERA5 data.
#PATH_JSON = '../data/era5/'
#ftpen.save_to_json(df_penguins_final, PATH_JSON, 'unique_dates_dict.json')

In [5]:
# Standard deviation of the penguins' distance to their colony.
# The species labels are read from the full feature frame `df`, while the
# statistic itself is computed on the daily averages in `df_penguins_final`.
# NOTE(review): the printed output shows one "<colony> <value>" line per colony
# in blank-line separated groups -- presumably one group per species; confirm
# against ftpen.spread_species.
species = df['common_name'].unique()
ftpen.spread_species(df_penguins_final, species)


King George Island 156.8922767009632
Hope Bay 204.50053560976153
Powell Island/Peng colony 241.1019912032257
Signy Island/Gourlay 234.41072806291984
Signy Island/North Point 397.9617597992179
Admiralty Bay 252.13850471057341

Laurie Island/Cape Geddes 33.81119977181794
Signy Island/Gourlay 15.947249101383004
Powell Island/Peng colony 20.863095171907656
Admiralty Bay 191.11673222630225
Cape Shirreff 898.6426014496931

Signy Island/North Point 14.226600107963876
Admiralty Bay 9.8093438462616
Cape Shirreff 48.9572812598909


In [6]:
# Mean deviation of the distance to the colony, per species and colony,
# computed on the daily averages in `df_penguins_final`.
# NOTE(review): "calculte" is the spelling the helper module exposes; renaming
# it has to happen in src/utils/features_penguins.py, not in this notebook.
ftpen.calculte_mean_deviation(df_penguins_final)


King George Island 129.4704820208191
Hope Bay 164.31657218913702
Powell Island/Peng colony 188.51774155572733
Signy Island/Gourlay 200.64823394539997
Signy Island/North Point 340.63002199213594
Admiralty Bay 187.6252134675522

Laurie Island/Cape Geddes 19.19342632712171
Signy Island/Gourlay 12.324104237789532
Powell Island/Peng colony 12.005098504707796
Admiralty Bay 96.47090182991843
Cape Shirreff 571.9082535242949

Signy Island/North Point 8.846322623042843
Admiralty Bay 6.105893753653458
Cape Shirreff 32.132810331351685
