In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

### Load dataset

In [4]:
# Read the germination database; the file is decoded as Latin-1 rather than
# the default UTF-8.
df = pd.read_csv(
    filepath_or_buffer='/content/GerminationDatabaseFile.csv',
    encoding='latin1',
)

In [5]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4680 non-null   int64  
 1   id_test              4680 non-null   int64  
 2   accepted_binomial    4680 non-null   object 
 3   EUNIS_name           4680 non-null   object 
 4   original_name        4680 non-null   object 
 5   contributor_name     4680 non-null   object 
 6   contributor_email    4680 non-null   object 
 7   Cocontributors       4680 non-null   object 
 8   collection_year      4680 non-null   object 
 9   doi                  4680 non-null   object 
 10  collection_site      4680 non-null   object 
 11  collection_country   4680 non-null   object 
 12  iso_code             4680 non-null   object 
 13  latitude             4680 non-null   object 
 14  longitude            4680 non-null   float64
 15  coordinate_accuracy  4680 non-null   f

### Create germination rate as target

In [6]:
# Target variable: germinated seeds as a fraction of the seeds sown.
df["germination_rate"] = df["seeds_germinated"].div(df["seeds_sown"])

## Drop Unnecessary Columns

In [7]:
# Identifier and contributor-metadata columns that are not used as features.
drop_cols = [
    "Unnamed: 0", "id_test", "contributor_name", "contributor_email",
    "Cocontributors", "doi", "original_name"  # IDs and metadata
]
# errors="ignore" keeps this cell idempotent: because `df` is reassigned here,
# re-running the cell would otherwise raise KeyError on the columns that were
# already removed by the first run.
df = df.drop(columns=drop_cols, errors="ignore")

In [None]:
# Preview the first rows of the frame after the ID/metadata columns were dropped.
df.head()

Unnamed: 0,accepted_binomial,EUNIS_name,collection_year,collection_site,collection_country,iso_code,latitude,longitude,coordinate_accuracy,storage,...,stratification,other_treatments,water_potential,photoperiod,light_quality,T_day,T_night,seeds_sown,seeds_germinated,germination_rate
0,Brachypodium retusum,Brachypodium retusum,2016,Nemes_calcareus,France,FR,43.7847,4.259389,100.0,stored,...,none,none,0.0,12,white,20.0,15.0,160,123,0.76875
1,Brachypodium retusum,Brachypodium retusum,2016,Montpellier_Red Mediterranean,France,FR,43.60758,3.449269,100.0,stored,...,none,none,0.0,12,white,20.0,15.0,160,77,0.48125
2,Brachypodium retusum,Brachypodium retusum,2016,Merindol_Red Mediterranean,France,FR,43.750425,5.210495,100.0,stored,...,none,none,0.0,12,white,20.0,15.0,160,41,0.25625
3,Brachypodium retusum,Brachypodium retusum,2016,Caumont_Red Mediterranean,France,FR,43.898319,4.932395,100.0,stored,...,none,none,0.0,12,white,20.0,15.0,160,17,0.10625
4,Brachypodium retusum,Brachypodium retusum,2016,Nemes_Red Mediterranean,France,FR,43.722868,4.339264,100.0,stored,...,none,none,0.0,12,white,20.0,15.0,160,101,0.63125


In [None]:
# Display the frame itself; the rendered output elides the middle rows.
df

Unnamed: 0,accepted_binomial,EUNIS_name,collection_year,collection_site,collection_country,iso_code,latitude,longitude,coordinate_accuracy,storage,...,stratification,other_treatments,water_potential,photoperiod,light_quality,T_day,T_night,seeds_sown,seeds_germinated,germination_rate
0,Brachypodium retusum,Brachypodium retusum,2016,Nemes_calcareus,France,FR,43.7847,4.259389,100.0,stored,...,none,none,0.00,12,white,20.0,15.0,160,123,0.76875
1,Brachypodium retusum,Brachypodium retusum,2016,Montpellier_Red Mediterranean,France,FR,43.60758,3.449269,100.0,stored,...,none,none,0.00,12,white,20.0,15.0,160,77,0.48125
2,Brachypodium retusum,Brachypodium retusum,2016,Merindol_Red Mediterranean,France,FR,43.750425,5.210495,100.0,stored,...,none,none,0.00,12,white,20.0,15.0,160,41,0.25625
3,Brachypodium retusum,Brachypodium retusum,2016,Caumont_Red Mediterranean,France,FR,43.898319,4.932395,100.0,stored,...,none,none,0.00,12,white,20.0,15.0,160,17,0.10625
4,Brachypodium retusum,Brachypodium retusum,2016,Nemes_Red Mediterranean,France,FR,43.722868,4.339264,100.0,stored,...,none,none,0.00,12,white,20.0,15.0,160,101,0.63125
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4675,Plantago lanceolata,Plantago lanceolata,2014,Alcocer (Guadalajara),Spain,ES,40.494321,-2.585302,1000.0,stored,...,none,polyethylene glycol (PEG) 6000_10% w/v_recover...,-0.15,16,white,20.0,20.0,100,75,0.75000
4676,Plantago lanceolata,Plantago lanceolata,2014,Guadalix de la Sierra (Madrid),Spain,ES,40.790054,-3.681596,1000.0,stored,...,none,polyethylene glycol (PEG) 6000_40% w/v_recover...,-1.76,16,white,20.0,20.0,100,59,0.59000
4677,Plantago lanceolata,Plantago lanceolata,2014,Guadalix de la Sierra (Madrid),Spain,ES,40.790054,-3.681596,1000.0,stored,...,none,polyethylene glycol (PEG) 6000_40% w/v,-1.76,16,white,20.0,20.0,100,2,0.02000
4678,Plantago lanceolata,Plantago lanceolata,2014,Guadalix de la Sierra (Madrid),Spain,ES,40.790054,-3.681596,1000.0,stored,...,none,salinity_NaCl (mmol L-1)_300,-1.49,16,white,20.0,20.0,100,23,0.23000


In [None]:
# List the dtype of every remaining column.
df.dtypes

Unnamed: 0,0
accepted_binomial,object
EUNIS_name,object
collection_year,object
collection_site,object
collection_country,object
iso_code,object
latitude,object
longitude,float64
coordinate_accuracy,float64
storage,object


In [8]:
# Re-inspect column names, non-null counts and dtypes now that the
# ID/metadata columns have been dropped and the target has been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4680 entries, 0 to 4679
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   accepted_binomial    4680 non-null   object 
 1   EUNIS_name           4680 non-null   object 
 2   collection_year      4680 non-null   object 
 3   collection_site      4680 non-null   object 
 4   collection_country   4680 non-null   object 
 5   iso_code             4680 non-null   object 
 6   latitude             4680 non-null   object 
 7   longitude            4680 non-null   float64
 8   coordinate_accuracy  4680 non-null   float64
 9   storage              4680 non-null   object 
 10  substrate            4680 non-null   object 
 11  experiment_length    4680 non-null   object 
 12  scarification        4680 non-null   object 
 13  stratification       4680 non-null   object 
 14  other_treatments     4680 non-null   object 
 15  water_potential      4680 non-null   f

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
accepted_binomial,0
EUNIS_name,0
collection_year,0
collection_site,0
collection_country,0
iso_code,0
latitude,0
longitude,0
coordinate_accuracy,0
storage,0


## Identify Feature Types

### Target variable

In [9]:
# Regression target: the germinated/sown fraction computed earlier.
y = df["germination_rate"]

### Features

In [10]:
# Features are every column except the target and the two raw seed counts the
# target is computed from (keeping those would hand the model the answer).
X = df.drop(
    ["germination_rate", "seeds_germinated", "seeds_sown"],
    axis="columns",
)

### Separate categorical and numeric

In [11]:
# Numeric features: select every numeric dtype rather than only int64/float64,
# so columns stored as e.g. int32 or float32 are not missed.
numeric_cols = X.select_dtypes(include="number").columns.tolist()

# Every remaining column is treated as categorical. Taking the complement
# guarantees each feature lands in exactly one group; a column listed in
# neither group would be silently dropped by ColumnTransformer, whose default
# is remainder="drop".
# NOTE(review): df.info() reports latitude, collection_year and
# experiment_length as object dtype, so they end up here and get one-hot
# encoded. Consider parsing them to numbers upstream if they are meant to be
# numeric.
categorical_cols = [col for col in X.columns if col not in numeric_cols]

## Preprocessing

Numerical features → StandardScaler

Categorical features → OneHotEncoder

### Preprocessing for numeric + categorical

In [12]:
# Per-group preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones. handle_unknown="ignore" means a category not seen
# during fit is encoded as all zeros instead of raising at predict time.
feature_transformers = [
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

## Build Pipeline

In [13]:
# Estimator: a 200-tree random forest with a fixed seed. Any other regressor
# (XGBoost, etc.) can be swapped into the "model" step.
model = RandomForestRegressor(random_state=42, n_estimators=200)

# Chain preprocessing and the estimator so they are fitted and applied together.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

## Train/Test Split

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is row-wise, so rows sharing a species or site may
# fall on both sides. Confirm whether a grouped split is needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


## Train the Model

In [15]:
# Fit the whole pipeline (preprocessing + random forest) on the training split.
pipeline.fit(X_train, y_train)

## Evaluate Model

In [16]:
# Predict germination rates for the held-out test features.
y_pred = pipeline.predict(X_test)

### Evaluation

In [17]:
# Score the held-out predictions: mean absolute error and R².
mae, r2 = (
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [18]:
# Report the mean absolute error, in the same 0-1 units as germination_rate.
print("Mean Absolute Error:", mae)

Mean Absolute Error: 0.12095426879104726


In [19]:
# Report the coefficient of determination on the test split.
print("R² Score:", r2)

R² Score: 0.7582402911133187


## Example: Predict germination rate for a single row

In [20]:
# The list indexer [[0]] keeps the first test row as a one-row DataFrame
# (not a Series), which is the tabular input the pipeline was fitted on.
sample = X_test.iloc[[0]]  # take first test row
# predict returns one value per input row, so index 0 is this row's prediction.
pred_rate = pipeline.predict(sample)
print("Predicted Germination Rate:", pred_rate[0])

Predicted Germination Rate: 0.494587500000001
