### Imports and set-up
---

In [130]:
import numpy as np
import pandas as pd

# Locations of the two training tables (both cover 2014-2023 per their file names).
TRAIT_DATA_CSV = 'training_data/trait_data_2014-2023.csv'
META_DATA_CSV = 'training_data/meta_data_2014-2023.csv'

# Read both tables once; every later cell works from these two frames.
trait_data = pd.read_csv(TRAIT_DATA_CSV)
meta_data = pd.read_csv(META_DATA_CSV)

Helper functions:

In [131]:
R = 6371000 # Earth Radius (m)
def haversine_distance(lat1, lat2, lon1, lon2, radius=None):
    """Great-circle distance between two points using the haversine formula.

    Note the argument order: both latitudes first, then both longitudes.

    Parameters
    ----------
    lat1, lat2 : float or array-like
        Latitudes of the first and second point, in degrees.
    lon1, lon2 : float or array-like
        Longitudes of the first and second point, in degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to the
        module-level ``R`` (metres), looked up at call time.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, element-wise for array inputs.
    """
    if radius is None:
        radius = R

    lat1, lat2, lon1, lon2 = map(np.radians, [lat1, lat2, lon1, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius

#### Defining Useful Column Subsets
---

In [132]:
# Collapse the several recorded positions of each row (four, per the original note)
# into a single centre point: one averaged 'Latitude' and one averaged 'Longitude'.
lat_cols = [name for name in meta_data if name.startswith('Latitude')]
lon_cols = [name for name in meta_data if name.startswith('Longitude')]

# Row-wise means; pandas skips NaN here, so a row is NaN only if all its positions are.
center_lat = meta_data[lat_cols].mean(axis=1)
center_lon = meta_data[lon_cols].mean(axis=1)

meta_data.loc[:, 'Latitude'] = center_lat
meta_data.loc[:, 'Longitude'] = center_lon

In [133]:
# Identification and location columns that meta_data is narrowed down to below.
important_cols = [
    'Year', 'Env', 'Experiment_Code', 
    'City', 'Farm', 'Field',
    'Latitude', 'Longitude'
]

# important_cols plus the tillage, previous-crop and irrigation columns.
interesting_cols = [
    'Year', 'Env', 'Experiment_Code', 
    'City', 'Farm', 'Field',
    'Pre-plant_tillage_method(s)', 'In-season_tillage_method(s)',
    'Previous_Crop', 'Irrigated',
    'Latitude', 'Longitude'
]

# Smallest subset that still ties a location to an Env.
bare_minimum_cols = ['Env', 'Latitude', 'Longitude']

# Take an explicit copy: later cells assign columns on meta_data, and doing that on a
# column slice of the original frame can raise pandas' SettingWithCopyWarning.
meta_data = meta_data[important_cols].copy()
meta_data.head()

Unnamed: 0,Year,Env,Experiment_Code,City,Farm,Field,Latitude,Longitude
0,2014,DEH1_2014,DEH1,Georgetown,Elbert N. & Ann V. Carvel Research & Education...,27AB,,
1,2014,GAH1_2014,GAH1,Tifton,USDA - Bellflower experimental farm,18,,
2,2014,IAH1a_2014,IAH1,Ames,Worle,,,
3,2014,IAH1b_2014,IAH1,Ames,Worle,,,
4,2014,IAH1c_2014,IAH1,Ames,Worle,,,


#### Addressing Missing Locations
---

This works by first trying to match each Env with a missing location to the most similar Env whose location can be copied.

The remaining values are just selected by hand.

In [None]:
# Self-join meta_data on Experiment_Code and City.
# Every Env is paired with every Env (itself included) that shares both values;
# the right-hand side of each pair is the potential location "donor".
merged = meta_data.merge(
    meta_data,  # Join with itself
    on=['Experiment_Code', 'City'], 
    suffixes=('', '_other')
)

# Keep only pairs where the left Env has no location and the donor, from a different
# year, has a complete one (both latitude and longitude).
# NOTE(review): self-pairs are already excluded by the isna/notna tests, so the Year test
# only has the effect of dropping same-year sibling Envs as donors -- confirm that is intended.
needs_location = merged['Latitude'].isna()
donor_has_location = merged['Latitude_other'].notna() & merged['Longitude_other'].notna()
different_year = merged['Year'] != merged['Year_other']
merged = merged[different_year & needs_location & donor_has_location].copy()

# Most Envs with a missing location now have several candidate donors.
# Rank them: 1 for sharing only Experiment_Code/City, +1 if 'Farm' matches, +1 if 'Field'
# matches (so 3 = same Farm and Field, 2 = exactly one of them, 1 = neither).
merged['match_priority'] = (
    (merged['Field'] == merged['Field_other']).astype(int) +
    (merged['Farm'] == merged['Farm_other']).astype(int) +
    1
)

# Put the highest-priority donor of each Env first ...
merged = merged.sort_values(by=['Env', 'match_priority'], ascending=[True, False])
# ... and keep that whole row. drop_duplicates is used instead of groupby().first() because
# first() picks the first non-null value per column and could combine values of two donors.
best_matches = merged.drop_duplicates(subset='Env', keep='first')
best_matches = best_matches[['Env', 'Latitude_other', 'Longitude_other']].reset_index(drop=True)

# Envs that could not be matched automatically; locations picked by hand.
# Each Env is paired with a site in its own state (the Env prefix is the state code) and all
# longitudes are negative (western hemisphere).
# NOTE(review): pairing follows the per-row notes (e.g. "copied from ILH1_2016") -- confirm.
curated_locations = [
    ['ILH1_2017', 40.060724, -88.233881],                   # Choosing MF-500, copying loc from ILH1_2016
    ['INH1_2017', 40.478760, -86.989820],                   # Choosing Purdue Acre 54 North, copying from INH1_2016
    ['NEH2_2019', 40.86073680542686, -96.6139217242634],    # Choosing a larger field near the Havelock Research Farm in Lincoln, NE
    ['TXH2_2014', 34.19196292257452, -101.96587766472442],  # Chose a random center pivot field after searching 'Halfway, TX'
    ['TXH2_2015', 33.683219771598594, -101.8228099023082],  # \
    ['TXH2_2016', 33.683219771598594, -101.8228099023082],  # | For these three, I am just going with the Extension Center field, since there is nothing to indicate either way, and at least it is not a random guess.
    ['TXH2_2017', 33.683219771598594, -101.8228099023082],  # /
    ['TXH2_2018', 33.683219771598594, -101.8228099023082],  # Chose a field just south of the Texas A&M AgriLife Research & Extension Center at Lubbock
    ['TXH4_2019', 33.683219771598594, -101.8228099023082]   # Same location as TXH2_2018
]
curated_locations_df = pd.DataFrame(curated_locations, columns=['Env', 'Latitude_other', 'Longitude_other'])
assert (curated_locations_df['Longitude_other'] < 0).all(), "curated longitudes must be west of Greenwich"

# Append the curated locations. If an Env has both an automatic and a curated entry, the
# automatic one (listed first) wins, so that every Env appears exactly once.
best_matches = pd.concat([best_matches, curated_locations_df], ignore_index=True)
best_matches = best_matches.drop_duplicates(subset='Env', keep='first')

# Line the new locations up with meta_data and fill in the gaps.
# validate= guarantees one output row per meta_data row; the index is then copied over
# because fillna() aligns on index, not on position.
aligned_matches = meta_data.merge(best_matches, on='Env', how='left', validate='many_to_one')
aligned_matches.index = meta_data.index
meta_data['Latitude'] = meta_data['Latitude'].fillna(aligned_matches['Latitude_other'])
meta_data['Longitude'] = meta_data['Longitude'].fillna(aligned_matches['Longitude_other'])

# Show any rows still missing a coordinate; an empty table means all are taken care of.
meta_data[meta_data[['Latitude', 'Longitude']].isna().any(axis=1)]

Unnamed: 0,Year,Env,Experiment_Code,City,Farm,Field,Latitude,Longitude


#### Envirotyping Input
---

Input Variables:
| Variable | Description | Collection Method |
| --- | --- | --- |
| Site | User site Identifier | Use 'Env' column |
| Planting | Planting date (mm/dd/yyyy) | ??? |
| Latitude | Latitude of trial | use meta_data['Latitude'] |
| Longitude | Longitude of trial | use meta_data['Longitude'] |
| Crop | soybean or maize | 'maize' |
| Genetics | Soybean: maturity group (0-6, by 1), Maize: RM (80-130, by 5) | ??? |

In [None]:
# Columns taken from trait_data: trial identity plus planting date ...
et_trait_cols = ['Env', 'Year', 'Date_Planted']
# (No genetics columns such as 'Hybrid' / 'Hybrid_orig_name' are used: none of the
#  provided materials seem to list maturities for the hybrids.)
# ... and the timing measurements.
et_timing_cols = ['Pollen_DAP_days', 'Silk_DAP_days', 'Date_Harvested']

# Attach each record's trial location via its Env.
et_input = trait_data[et_trait_cols + et_timing_cols].merge(
    meta_data[bare_minimum_cols], on='Env'
)

# Parse both date columns (month/day/two-digit-year strings) into datetimes.
for date_col in ('Date_Planted', 'Date_Harvested'):
    et_input[date_col] = pd.to_datetime(et_input[date_col], format='%m/%d/%y')

# Whole days elapsed between planting and harvest.
et_input['Harvest_DAP_days'] = et_input['Date_Harvested'].sub(et_input['Date_Planted']).dt.days
et_input

Unnamed: 0,Env,Year,Date_Planted,Pollen_DAP_days,Silk_DAP_days,Date_Harvested,Latitude,Longitude,Harvest_DAP_days
0,DEH1_2014,2014,2014-05-05,63.0,67.0,2014-09-29,38.629357,-75.465693,147.0
1,DEH1_2014,2014,2014-05-05,61.0,63.0,2014-09-29,38.629357,-75.465693,147.0
2,DEH1_2014,2014,2014-05-05,63.0,65.0,2014-09-29,38.629357,-75.465693,147.0
3,DEH1_2014,2014,2014-05-05,61.0,63.0,2014-09-29,38.629357,-75.465693,147.0
4,DEH1_2014,2014,2014-05-05,63.0,65.0,2014-09-29,38.629357,-75.465693,147.0
...,...,...,...,...,...,...,...,...,...
173955,WIH3_2023,2023,2023-04-26,81.0,82.0,2023-11-14,44.115645,-89.544009,202.0
173956,WIH3_2023,2023,2023-04-26,70.0,70.0,2023-11-14,44.115645,-89.544009,202.0
173957,WIH3_2023,2023,2023-04-26,78.0,80.0,2023-11-14,44.115645,-89.544009,202.0
173958,WIH3_2023,2023,2023-04-26,84.0,100.0,2023-11-14,44.115645,-89.544009,202.0


In [164]:
# Write the envirotyping input table to disk (the row index is written as well).
ENVIROTYPE_INPUT_CSV = 'output_data/envirotype_input.csv'
et_input.to_csv(ENVIROTYPE_INPUT_CSV)

In [None]:
# Consider trying to fill in planting/harvest dates by looking at the other crops in the same plot/experiment/year