### Imports and set-up
---
> Uncomment as needed

In [2]:
import io, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm


# Training DataFrames:

trn_traits = pd.read_csv('training_data/trait_data.csv')
trn_metadata = pd.read_csv('training_data/meta_data.csv')
# trn_env_cov = pd.read_csv('training_data/env_cov.csv')
# trn_soil = pd.read_csv('training_data/soil_data.csv')
# trn_weather_season = pd.read_csv('training_data/weather_season.csv')
# trn_weather_year = pd.read_csv('training_data/weather_year.csv')


# Testing DataFrames:

tst_template = pd.read_csv('testing_data/template.csv')
# tst_metadata = pd.read_csv('testing_data/meta_data.csv')
# tst_env_cov = pd.read_csv('testing_data/env_cov.csv')
# tst_soil = pd.read_csv('testing_data/soil_data.csv')
# tst_weather_season = pd.read_csv('testing_data/weather_season.csv')
# tst_weather_year = pd.read_csv('testing_data/weather_year.csv')

Helper functions:

In [3]:
R = 6371000 # Earth Radius (m)
def haversine_distance(lat1, lat2, lon1, lon2):
    lat1, lat2, lon1, lon2 = map(np.radians, [lat1, lat2, lon1, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return c*R

In [4]:
def read_vcf(fn: str) -> pd.DataFrame:
    with open(fn, 'r') as file:
        lines = (l for l in file if not l.startswith('##'))
        return pd.read_csv(io.StringIO(''.join(lines)), sep='\t')

#### Defining Useful Column Subsets
---

This section is for meta_data

In [5]:
# Condensing 4 positions into 1 (center location)
lat_cols = [c for c in trn_metadata.columns if c.startswith('Latitude')]
lon_cols = [c for c in trn_metadata.columns if c.startswith('Longitude')]

trn_metadata.loc[:, 'Latitude'] = trn_metadata[lat_cols].mean(axis=1)
trn_metadata.loc[:, 'Longitude'] = trn_metadata[lon_cols].mean(axis=1)

In [6]:
metadata_cols_important = [
    'Year', 'Env', 'Experiment_Code', 
    'City', 'Farm', 'Field',
    'Latitude', 'Longitude'
]

metadata_cols_interesting = [
    'Year', 'Env', 'Experiment_Code', 
    'City', 'Farm', 'Field',
    'Pre-plant_tillage_method(s)', 'In-season_tillage_method(s)',
    'Previous_Crop', 'Irrigated',
    'Latitude', 'Longitude'
]

metadata_cols_minimal = ['Env', 'Latitude', 'Longitude']

trn_metadata = trn_metadata[metadata_cols_important]
trn_metadata.head()

Unnamed: 0,Year,Env,Experiment_Code,City,Farm,Field,Latitude,Longitude
0,2014,DEH1_2014,DEH1,Georgetown,Elbert N. & Ann V. Carvel Research & Education...,27AB,,
1,2014,GAH1_2014,GAH1,Tifton,USDA - Bellflower experimental farm,18,,
2,2014,IAH1a_2014,IAH1,Ames,Worle,,,
3,2014,IAH1b_2014,IAH1,Ames,Worle,,,
4,2014,IAH1c_2014,IAH1,Ames,Worle,,,


This section is for trait_data:

In [7]:
# TODO: define useful column subsets for trait_data

#### Addressing Missing Locations
---

This works by first trying to match each Env with a missing location to the most similar Env whose location can be copied.

The remaining values are just selected by hand.

In [8]:
# First, self-join meta_data on Experiment_Code and City.
# This makes a table where each Env has a number of rows that share at least Experiment_Code and City.
merged = trn_metadata.merge(
    trn_metadata,  # Join with itself
    on=['Experiment_Code', 'City'], 
    suffixes=('', '_other')
)

# Filter out self matches, and narrow down to rows that are missing their location, but the comparable Env still has a location.
merged = merged[(merged['Year'] != merged['Year_other']) & merged['Latitude'].isna() & merged['Latitude_other'].notna()]

# Now, most of the Envs with a missing location will have multiple comparable Envs
# Assign each of those other Envs a match_priority value, which determines how closely they match
# Two Envs that share a 'Field' will have priority value 3, sharing a 'Farm' will get 2, and only sharing 'City' will get 1.
merged['match_priority'] = (
    (merged['Field'] == merged['Field_other']).astype(int) +
    (merged['Farm'] == merged['Farm_other']).astype(int) +
    1
)

merged = merged.sort_values(by=['Env', 'match_priority'], ascending=[True, False]) # Sort to put the highest priority match for each Env first.
best_matches = merged.groupby('Env').first().reset_index() # Get one row per Env again, but only keep the highest priority match
best_matches = best_matches[['Env', 'Latitude_other', 'Longitude_other']] # Narrow down to just important columns

# I have manually determined which Envs were not able to find a match, and found a suitable location:
curated_locations = [
    ['ILH1_2017', 40.060724, 88.233881],                    # Choosing MF-500, copying from ILH1_2016
    ['INH1_2017', 40.478760, 86.989820],                    # Choosing Purdue Acre 54 North, copying from INH1_2016
    ['NEH2_2019', 40.86073680542686, -96.6139217242634],    # Choosing a larger field near the Havelock Research Farm in Lincoln, NE
    ['TXH2_2014', 34.19196292257452, -101.96587766472442],  # Chose a random center pivot field after searching 'Halfway, TX'
    ['TXH2_2015', 33.683219771598594, -101.8228099023082],  # | For these, I am just going with a field next to the
    ['TXH2_2016', 33.683219771598594, -101.8228099023082],  # | Texas A&M AgriLife Research & Extension Center at Lubbock
    ['TXH2_2017', 33.683219771598594, -101.8228099023082],  # | 
    ['TXH2_2018', 33.683219771598594, -101.8228099023082],  # | It should be accurate for the ones listed as Lubbock, and close enough for the others. 
    ['TXH4_2019', 33.683219771598594, -101.8228099023082]   # /
]
curated_locations_df = pd.DataFrame(curated_locations, columns=['Env', 'Latitude_other', 'Longitude_other'])
best_matches = pd.concat([best_matches, curated_locations_df], ignore_index=True) # Concatenate my curated locations to the matches.

# Finally, I align my new locations with the meta_data df, and fill in the missing values using my new values.
aligned_matches = trn_metadata.merge(best_matches, on='Env', how='left')
trn_metadata['Latitude'] = trn_metadata['Latitude'].fillna(aligned_matches['Latitude_other'])
trn_metadata['Longitude'] = trn_metadata['Longitude'].fillna(aligned_matches['Longitude_other'])

print('Missing Locations:')
trn_metadata[trn_metadata['Latitude'].isna()] # print the misisng locations to verify they are all taken care of 

Missing Locations:


Unnamed: 0,Year,Env,Experiment_Code,City,Farm,Field,Latitude,Longitude


#### Envirotyping Input
---

Input Variables:
| Variable | Description | Collection Method |
| --- | --- | --- |
| Site | User site Identifier | Use 'Env' column |
| Planting | Plating Date (mm/dd/yyyy) | use 'Date_Planted' column |
| Latitude | Latitude of trial | use meta_data['Latitude'] |
| Longitude | Longitude of trial | use meta_data['Longitude'] |
| Crop | soybean or maize | 'maize' |
| Genetics | Soybean: maturity group (0-6, by 1), Maize: RM (80-130, by 5) | ??? |

In [9]:
et_trait_cols = ['Env', 'Field_Location', 'Year', 'Date_Planted']
# et_genetic_cols = ['Hybrid', 'Hybrid_orig_name'] # Unfortunately, none of the materials provided seem to list maturities for the hybrids
et_timing_cols = ['Pollen_DAP_days', 'Silk_DAP_days', 'Date_Harvested']

et_input = trn_traits[et_trait_cols + et_timing_cols]
et_input = et_input.merge(trn_metadata[metadata_cols_minimal], on='Env')

# Convert Date columns to_datetime
et_input['Date_Planted'] = pd.to_datetime(et_input['Date_Planted'], format='%m/%d/%y')
if 'Date_Harvested' in et_input: et_input['Date_Harvested'] = pd.to_datetime(et_input['Date_Harvested'], format='%m/%d/%y')

et_input['Harvest_DAP_days'] = (et_input['Date_Harvested'] - et_input['Date_Planted']).dt.days
et_input

Unnamed: 0,Env,Field_Location,Year,Date_Planted,Pollen_DAP_days,Silk_DAP_days,Date_Harvested,Latitude,Longitude,Harvest_DAP_days
0,DEH1_2014,DEH1,2014,2014-05-05,63.0,67.0,2014-09-29,38.629357,-75.465693,147.0
1,DEH1_2014,DEH1,2014,2014-05-05,61.0,63.0,2014-09-29,38.629357,-75.465693,147.0
2,DEH1_2014,DEH1,2014,2014-05-05,63.0,65.0,2014-09-29,38.629357,-75.465693,147.0
3,DEH1_2014,DEH1,2014,2014-05-05,61.0,63.0,2014-09-29,38.629357,-75.465693,147.0
4,DEH1_2014,DEH1,2014,2014-05-05,63.0,65.0,2014-09-29,38.629357,-75.465693,147.0
...,...,...,...,...,...,...,...,...,...,...
173955,WIH3_2023,WIH3,2023,2023-04-26,81.0,82.0,2023-11-14,44.115645,-89.544009,202.0
173956,WIH3_2023,WIH3,2023,2023-04-26,70.0,70.0,2023-11-14,44.115645,-89.544009,202.0
173957,WIH3_2023,WIH3,2023,2023-04-26,78.0,80.0,2023-11-14,44.115645,-89.544009,202.0
173958,WIH3_2023,WIH3,2023,2023-04-26,84.0,100.0,2023-11-14,44.115645,-89.544009,202.0


# Info About Hybrids
---

In [10]:
# Notes on Hybrid Columns:
# there are 2239 different values in trn_traits['Hybrid_Parent1']
# there are 80   different values in trn_traits['Hybrid_Parent2']
# there are 5205 different values in trn_traits['Hybrid']
# there are 5491 different values in trn_traits['Hybrid_orig_name']
# there are 2257 unique parent hybrids in trn_traits.


# there are 1063 different values in tst_template['Hybrid']
# of the 1063 values in tst_template['Hybrid'], 104 are also in trn_traits['Hybrid'], trn_traits['Hybrid_orig_name'] also has 104 elements in common.

# there are 533  different values in tst_template['Hybrid'].str.split('/')[0]
# there are 20   different values in tst_template['Hybrid'].str.split('/')[1]
# there are 548  unique parents in tst_template['Hybrid']
# of the 533 p1 values in tst_template, 93 are shared with Hybrid_Parent1 in trn_traits 
# of the 533 p1 values in tst_template, 20 are shared with Hybrid_Parent2 in trn_traits 
# of the 20 p2 values in tst_template, 18 are found in Hybrid_Parent1 in trn_traits 
# of the 20 p2 values in tst_template, 19 are found in Hybrid_Parent2 in trn_traits
# only 93 parent values from p1 are found between Hybrid_Parent1 and Hybrid_Parent2 (so all in 1)
# all 20 parent values in p2 are found between Hybrid_Parent1 and Hybrid_Parent2

# Make a table:
# Make a list of the hybrids (one each for trainging data and one for testing data)
# How many times does the hybrid show up in the data
# Count the individual parent lines (separate for the same hybrid if it is P1 vs P2)
# Sort by the frequency
# So, in summary, what actual columns do I want?
#  - Hybrid name
#  - count in training set
#  - Parent1
#  - Parent1 count in training set
#  - Parent2 
#  - Parent2 count in training set
# Do a bit of cleaning as well.

# trn_traits['Hybrid']
# len(set(trn_traits['Hybrid_orig_name']))
# len(set(tst_template['Hybrid']).intersection(set(trn_traits['Hybrid_orig_name'])))
# ext_template = tst_template.copy()
# ext_template[['p1', 'p2']] = ext_template['Hybrid'].str.split('/', expand=True)
# len(set(ext_template['p1']))
# len(set(ext_template['p1']).intersection(set(trn_traits['Hybrid_Parent1']).union(set(trn_traits['Hybrid_Parent2']))))
# len(set(trn_traits['Hybrid_Parent1']).union(set(trn_traits['Hybrid_Parent2'])))
# len(set(ext_template['p1']).union(set(ext_template['p2'])))

# Hybrid_Count is also the number of Envs that each Hybrid was planted in (assuming each row in trn_traits is a combination of Env and Hybrid)
# hybrids = trn_traits.groupby(['Hybrid', 'Hybrid_Parent1', 'Hybrid_Parent2'], as_index=False).agg({'Hybrid': ['first', 'count'], 'Hybrid_Parent1': ['first', 'count'], 'Hybrid_Parent2': ['first', 'count']})

# hybrids = trn_traits.groupby(['Hybrid'], as_index=False).agg({'Hybrid': ['first', 'count'], 'Hybrid_Parent1': ['first', 'count'], 'Hybrid_Parent2': ['first', 'count']})


# parent1_counts = hybrids.groupby('Hybrid_Parent1').agg(count=('Hybrid_Parent1', 'count'))
# parent2_counts = hybrids.groupby('Hybrid_Parent2').agg(count=('Hybrid_Parent2', 'count'))

# hybrids
# parent2_counts

hybrids = trn_traits.groupby(['Hybrid', 'Hybrid_Parent1', 'Hybrid_Parent2'], as_index=False).agg(hybrid_occurrences=('Hybrid', 'count'))
# hybrids['alt_names'] = hybrids.apply(lambda row: row['alt_names'] - {row['Hybrid']}, axis=1)

# TODO: Make a hairball. Nodes = parent lines, Edges = Hybrids, Color tst/trn data differently.

hybrids = hybrids.merge(hybrids.groupby('Hybrid_Parent1', as_index=False).agg(p1_children=('Hybrid_Parent1', 'count')), how='left', on='Hybrid_Parent1')
hybrids = hybrids.merge(hybrids.groupby('Hybrid_Parent2', as_index=False).agg(p2_children=('Hybrid_Parent2', 'count')), how='left', on='Hybrid_Parent2')

# Genetic Distance: Dot product between genetic marker vectors or sum of XOR between markers, etc.

# missing_parents = trn_traits[trn_traits['Hybrid_Parent1'].isna()][['Hybrid', 'Hybrid_Parent1', 'Hybrid_Parent2']]
# missing_parents = missing_parents.groupby('Hybrid', as_index=False).agg(occurrences=('Hybrid', 'count'))
# missing_parents


# hybrids = trn_traits.groupby(['Hybrid', 'Hybrid_Parent1', 'Hybrid_Parent2'], as_index=False, dropna=False)
# merged = trn_traits.merge(trn_traits, on='Hybrid', suffixes=('', '_filled'))
# filtered = merged[merged['Hybrid_Parent1'].isna() & merged['Hybrid_Parent1_filled'].notna()]
# result = filtered[['Hybrid', 'Hybrid_Parent1', 'Hybrid_Parent2', 'Hybrid_Parent1_filled', 'Hybrid_Parent2_filled']]
# result            dtype=

hybrids

Unnamed: 0,Hybrid,Hybrid_Parent1,Hybrid_Parent2,hybrid_occurrences,p1_children,p2_children
0,2369/DK3IIH6,2369,DK3IIH6,116,4,153
1,2369/LH123HT,2369,LH123HT,753,4,11
2,2369/PHN82,2369,PHN82,62,4,14
3,2369/PHZ51,2369,PHZ51,108,4,670
4,2FACC/DK3IIH6,2FACC,DK3IIH6,70,1,153
...,...,...,...,...,...,...
5087,Z037E0054/LH162,Z037E0054,LH162,12,3,28
5088,Z037E0054/PHZ51,Z037E0054,PHZ51,32,3,670
5089,Z038E0057/DK3IIH6,Z038E0057,DK3IIH6,38,3,153
5090,Z038E0057/LH162,Z038E0057,LH162,12,3,28


# Genetic Distance
---

In [11]:
genetic_df = read_vcf('training_data/genotypes.vcf')
genetic_df = genetic_df[[col for col in genetic_df.columns if col not in ('#CHROM', 'POS', 'QUAL', 'FILTER', 'INFO', 'FORMAT')]]
genetic_df

Unnamed: 0,ID,REF,ALT,01CSI6/LH287,01DIB2/LH287,01DIB2/PHP02,2369/DK3IIH6,2369/LH123HT,2369/PHN82,2369/PHZ51,...,Z037E0054/DK3IIH6,Z037E0054/LH162,Z037E0054/PHZ51,Z038E0057/DK3IIH6,Z038E0057/LH162,Z038E0057/PHZ51,ZS01459/LH287,ZS01459/PHP02,ZS0510/LH287,ZS0510/PHP02
0,S1_1007742,T,C,0/1,1/1,1/1,0/1,0/1,0/1,0/1,...,0/1,0/1,0/1,0/1,0/1,0/1,1/1,1/1,1/1,1/1
1,S1_1020677,G,A,0/0,0/0,0/1,0/0,0/1,0/1,0/0,...,0/0,0/1,0/0,0/0,0/1,0/0,0/0,0/1,0/0,0/1
2,S1_2018002,T,C,0/0,0/0,0/1,0/1,0/0,0/1,0/0,...,0/1,0/1,0/0,0/1,0/1,0/0,0/0,0/1,0/0,0/1
3,S1_2101934,T,C,0/0,0/0,./.,0/0,0/1,./.,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,./.,0/0,./.
4,S1_2275970,A,C,0/0,0/0,0/0,0/0,0/1,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/0,1/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2420,S10_150646162,G,T,0/0,1/0,1/1,0/1,0/0,0/1,0/0,...,0/1,0/0,0/0,0/1,0/0,0/0,1/0,1/1,1/0,1/1
2421,S10_150711963,G,A,1/1,0/1,0/0,0/1,0/0,0/0,0/0,...,0/1,0/1,0/0,0/1,0/1,0/0,1/1,1/0,0/1,0/0
2422,S10_150733250,A,C,1/0,1/0,1/0,0/1,0/1,0/0,0/0,...,0/1,0/1,0/0,0/1,0/1,0/0,1/0,1/0,1/0,1/0
2423,S10_151045975,T,G,1/1,0/1,0/0,0/0,0/0,0/1,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/1,0/0,0/1,0/0


In [52]:
inbred_lines = np.asarray(list(dict.fromkeys(inbreed for col in genetic_df.columns[3:].str.split('/') for inbreed in col)))
inbred_data = pd.DataFrame(columns=genetic_df['ID'], index=inbred_lines, dtype=float)
for col in tqdm(genetic_df.columns[3:], desc='Reformatting Data'):
    p1, p2 = col.split('/')
    split_data = genetic_df[col].str.split('/', expand=True)
    split_data = split_data.replace('.', np.nan).astype(float).set_index(genetic_df['ID'])
    
    # if inbred_data.loc[p1].any():
    #     valid_indices = inbred_data.loc[p1].notna() & split_data[0].T.notna()
    #     same_data = split_data[0].T[valid_indices] == inbred_data.loc[p1][valid_indices]
    #     diff = pd.DataFrame({'old': inbred_data.loc[p1][valid_indices][~same_data], 'new': split_data[0].T[valid_indices][~same_data]})
    #     if not same_data.all():
    #         print(f'{p1=:<25} ({(~same_data).sum()} Swapped Values). Keeping Old Data.')
    
    # if inbred_data.loc[p2].any():
    #     valid_indices = inbred_data.loc[p2].notna() & split_data[1].T.notna()
    #     same_data = split_data[1].T[valid_indices] == inbred_data.loc[p2][valid_indices]
    #     diff = pd.DataFrame({'old': inbred_data.loc[p2][valid_indices][~same_data], 'new': split_data[1].T[valid_indices][~same_data]})
    #     if not same_data.all():
    #         print(f'{p2=:<25} ({(~same_data).sum()} Swapped Values). Keeping Old Data.')
    
    # inbred_data.loc[[p1, p2]] = inbred_data.loc[[p1, p2]].fillna(split_data.T)
    inbred_data.loc[p1] = inbred_data.loc[p1].fillna(split_data[0].T)
    inbred_data.loc[p2] = inbred_data.loc[p2].fillna(split_data[1].T)
    
inbred_data

Reformatting Data:   0%|          | 0/5899 [00:00<?, ?it/s]

ID,S1_1007742,S1_1020677,S1_2018002,S1_2101934,S1_2275970,S1_2800964,S1_2811950,S1_2888631,S1_3023078,S1_3027593,...,S10_149647241,S10_149717807,S10_149851103,S10_149865723,S10_150351135,S10_150646162,S10_150711963,S10_150733250,S10_151045975,S10_151157757
01CSI6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
LH287,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
01DIB2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
PHP02,1.0,1.0,1.0,,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
2369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z037E0012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z037E0054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Z038E0057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZS01459,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


In [None]:
# def matrix_hamming_dist(m: np.ndarray):
#     mask = ~m[:, None, :].notna()
    
    
#     diff = m2[:, None, :] ^ m2[None, :, :]
#     mask = (m[:, None, :] != -1) & (m[None, :, :] != -1)
#     diff &= mask
    
#     return diff.sum(axis=2)

# distance_matrix = matrix_hamming_dist(inbred_data.to_numpy(np.int8))
# distance_matrix
matrix = inbred_data.to_numpy(np.int8, na_value=-1)
# mask = matrix != -1

# diff = matrix[:, None, :] ^ matrix[None, :, :]

# diff &= mask[:, None, :] | mask[None, :, :]



# diff

dist_matrix = np.ndarray((matrix.shape[0], matrix.shape[0]), np.int32)
dist_matrix.fill(0)

for i, row in enumerate(matrix):
    for j, other in enumerate(matrix[i+1:, :]):
        mask = (row != -1) | (other != -1)
        dist = row ^ other
        dist_matrix[i, i+j+1] = (dist & mask).sum()
        
dist_df = pd.DataFrame(dist_matrix, index=inbred_lines, columns=inbred_lines)
dist_df

  arr = np.array(blk.values, dtype=dtype, copy=copy)


Unnamed: 0,01CSI6,LH287,01DIB2,PHP02,2369,DK3IIH6,LH123HT,PHN82,PHZ51,2FACC,...,Z034E0042,Z034E0064,Z035E0047,Z035E0060,Z036E0057,Z037E0012,Z037E0054,Z038E0057,ZS01459,ZS0510
01CSI6,0,775,966,825,942,865,848,823,867,898,...,1007,995,998,1006,1002,1001,1009,1005,993,844
LH287,0,0,973,992,1007,958,529,974,872,913,...,1048,1036,1041,1041,1047,1044,1046,1044,1026,989
01DIB2,0,0,0,1039,766,1047,972,1047,989,1016,...,701,713,684,698,722,681,683,689,933,1052
PHP02,0,0,0,0,1021,462,991,370,946,829,...,1050,1028,1049,1049,1063,1048,1056,1048,776,661
2369,0,0,0,0,0,1055,1000,973,1019,982,...,279,299,270,310,312,259,271,273,1035,1050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z037E0012,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,14,16,1038,1035
Z037E0054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,16,1040,1037
Z038E0057,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1032,1033
ZS01459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,787


In [62]:
dist_df.to_csv('output_data/genetic_distance.csv', index_label='Inbred_Line')

# In-Progress
---

Everything below here is notes and experiments in progress.

In [None]:
# The purpose of this cell is to try to fill in missing value in `cols_to_fill`
# It kinda works, but there are still 9 Pollen, and 10 Silk days that aren't filled since there is no data for IAH1/TXH4 to use.
agg_func = lambda x: x.mode().min() # Aggregate columns by finding the most common value for that group (and break ties with min)

cols_to_fill = ['Pollen_DAP_days', 'Silk_DAP_days', 'Date_Harvested']
per_env = et_input.groupby('Env', as_index=False).agg(agg_func)

# What this does is get a row for each location (e.g. ILH1) and has the (smallest) mode for that value,
# Then any gaps in the data that was grouped by 'Env' are filled by the data that was grouped by the larger 'Field_Location'
test = per_env.merge(et_input.groupby('Field_Location').agg({c: agg_func for c in cols_to_fill}), on='Field_Location', how='left')
for col in cols_to_fill:
    test[col] = test[f'{col}_x'].fillna(test[f'{col}_y'])
    test = test.drop(columns=[f'{col}_x', f'{col}_y'])

test

In [9]:
# Find the best values for these columns to fill in gaps by copying over valuse from the most similar trial(s)
per_env = et_input.groupby(['Field_Location', 'Env']).agg({
    'Date_Planted': lambda x: x.mode().min(), 
    'Pollen_DAP_days': lambda x: x.mode().min(),
    'Silk_DAP_days': lambda x: x.mode().min(),
    'Date_Harvested': lambda x: x.mode().min()
}).reset_index()


# Backup values to fill in the ones that are still missing after grouping by Env
per_floc = trn_traits.groupby('Field_Location').agg({
    'Pollen_DAP_days': lambda x: x.mode().min(),
    'Silk_DAP_days': lambda x: x.mode().min(),
    'Date_Harvested': lambda x: x.mode().min()
})

test = per_env.merge(per_floc, on='Field_Location', how='left')

merge_cols = ['Pollen_DAP_days', 'Silk_DAP_days', 'Date_Harvested']
for col in merge_cols:
    test[col] = test[f'{col}_x'].fillna(test[f'{col}_y'])
    test = test.drop(columns=[f'{col}_x', f'{col}_y'])

# per_env
# per_floc
# et_input['Date_Planted'] = et_input['Date_Planted'].fillna(et_input.merge(plantings, on='Env', how='left')['Date_Planted_y'])
# test

In [None]:
# Consider trying to fill in planting/harvest dates by looking at the other crops in the same plot/experiment/year
planting_dates = et_input[['Env', 'Date_Planted']].drop_duplicates(subset='Env', keep='first').reset_index(drop=True)

aligned_planting_dates = et_input.merge(planting_dates, on='Env', how='left')

aligned_planting_dates

In [None]:
test = et_input.groupby(['Env', 'Date_Planted']).agg({
    'Year': 'first', 'Latitude': 'first', 'Longitude': 'first',
    'Pollen_DAP_days': ['mean', 'std'], 'Silk_DAP_days': ['mean', 'std'], 'Date_Harvested': [set, 'nunique'], 'Harvest_DAP_days': ['mean', 'std']
}).reset_index()
test

In [12]:
et_input.to_csv('output_data/envirotype_input.csv')

In [13]:
# Goal 1: Get a single dataframe that contains all of the most important information for the competition
metadata_cols_important = [
    'Env', 
    'Location_Code',
    'Year',
    'Latitude',
    'Longitude',
    'Hybrid',
    'Date_Planted',
    # Date_Harvested, Pollen_DAP_days, Silk_DAP_days, Harvest_DAP_days?
    # Check trait_data for some useful phenotype measurements
    # Yield
    # Decide whether we want any of the columns from metadata, like irrigation, methods, etc.
]

# Or, maybe make a few dfs? 
# I could have:
#  - minimal meta_data
#  - meta_data extra info
#  - minimal trait_data
#  - trait_data extra info
#  - 

In [14]:
# Goal 2: Make a naive method of estimating yield by finding the year with the most similar conditons to 2024 and 
# copying over the yield values from that year for the given hybrids/locations