# EcoCrop data cleaning and imputation

## load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import seaborn as sns
import networkx as nx
import warnings

from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list, ward
from scipy.spatial.distance import pdist
from scipy.sparse import csr_matrix
from scipy.cluster.hierarchy import fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score, calinski_harabasz_score
from collections import defaultdict

## download pre-scraped data and examine files

https://github.com/supersistence/EcoCrop-ScrapeR?tab=readme-ov-file

In [2]:
# Extra tokens that should be read as missing values (handed to `na_values`).
missing_markers = ['-', '---', 'nan', np.nan]

cropbasics_data = '../data/agricultural/EcoCrop/cropbasics_scrape.csv'
cropbasics = pd.read_csv(cropbasics_data, na_values=missing_markers)
cropbasics

Unnamed: 0,crop_code,species,Life.form,Habit,Life.span,Physiology,Category,Plant.attributes,temp_opt_min,Temp_Opt_Max,...,Subsystem,Companion.species,Level.of.mechanization,Labour.intensity,cycle_min,cycle_max,use.main,use.detailed,use.part,datasheet_url
0,289,Abelmoschus esculentus,herb,erect,annual,single stem,vegetables,grown on large scale,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
1,290,Abelmoschus manihot,shrub,erect,"annual, perennial",,"vegetables, ornamentals/turf, medicinals & aro...",grown on small scale,22.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2,291,Abelmoschus moschatus,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
3,295,Acacia auriculiformis,tree,erect,perennial,single stem,forest/wood,grown on large scale,24.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
4,297,Acacia farnesiana,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",grown on small scale,20.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2564,400001,Acacia polyacantha,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2565,400002,Prosopis affinis,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2566,400003,Vicia dasycarpa,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...


## look at cropbasics

In [3]:
# Summarise column names, non-null counts and dtypes of the freshly loaded table.
cropbasics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568 entries, 0 to 2567
Data columns (total 63 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   crop_code                   2568 non-null   int64  
 1   species                     2568 non-null   object 
 2   Life.form                   2306 non-null   object 
 3   Habit                       2272 non-null   object 
 4   Life.span                   2286 non-null   object 
 5   Physiology                  1895 non-null   object 
 6   Category                    2296 non-null   object 
 7   Plant.attributes            1290 non-null   object 
 8   temp_opt_min                2072 non-null   float64
 9   Temp_Opt_Max                2072 non-null   float64
 10  Temp_Abs_Min                2069 non-null   float64
 11  Temp_Abs_Max                2069 non-null   float64
 12  Rain_Opt_Min                2068 non-null   float64
 13  Rain_Opt_Max                2068 

In [4]:
# For every column, print its name followed by the distinct values it holds.
columns = cropbasics.columns
for col in columns:
    print(col, cropbasics[col].unique(), sep='\n')

crop_code
[   289    290    291 ... 400002 400003 400004]
species
['Abelmoschus esculentus' 'Abelmoschus manihot' 'Abelmoschus moschatus'
 ... 'Prosopis affinis' 'Vicia dasycarpa' 'Camelina sativa']
Life.form
['herb' 'shrub' 'herb, sub-shrub' 'tree' 'shrub, tree' nan 'grass'
 'herb, vine' 'herb, vine, sub-shrub' 'sub-shrub' 'herb, shrub'
 'vine, shrub' 'vine' 'herb, shrub, tree' 'herb, sub-shrub, shrub'
 'vine, shrub, tree' 'other' 'vine, sub-shrub' 'sub-shrub, shrub'
 'vine, tree']
Habit
['erect' 'prostrate/procumbent/semi-erect' nan
 'erect, acaulescent (or rosette plants)' 'climber/scrambler/scadent'
 'acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect'
 'erect, climber/scrambler/scadent'
 'prostrate/procumbent/semi-erect, climber/scrambler/scadent'
 'prostrate/procumbent/semi-erect, acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect, climber/scrambler/scadent']
Life.span
['annual' 'annual, perennial' 'annual, biennial, perennial' 'peren

## look at species

save to csv for later use

In [5]:
# Display the 'species' column as loaded (full scientific names at this point).
cropbasics['species']

0          Abelmoschus esculentus
1             Abelmoschus manihot
2           Abelmoschus moschatus
3           Acacia auriculiformis
4               Acacia farnesiana
                  ...            
2563    Chamaecrista rotundifolia
2564           Acacia polyacantha
2565             Prosopis affinis
2566              Vicia dasycarpa
2567              Camelina sativa
Name: species, Length: 2568, dtype: object

In [6]:
# Write the scientific-name column to its own CSV file.
species_csv_file = '../data/agricultural/EcoCrop/generated_data/species.csv'
species_data = cropbasics['species']
species_data.to_csv(species_csv_file)

## create separate columns for genus and species 

Rename the current `species` column to `genus_species`, then generate a separate `genus` column, then generate a separate `species` column.

In [7]:
# Rename the full-name column 'species' to 'genus_species'. The rename is
# skipped when 'genus_species' already exists, so re-running this cell after a
# new 'species' column has been created cannot produce two columns with the
# same name (which would break the .apply below).
if 'genus_species' not in cropbasics.columns:
    cropbasics = cropbasics.rename(columns={'species': 'genus_species'})

# Assuming names are in 'Genus species' format, the genus is the first
# whitespace-separated token; missing or blank names give None.
cropbasics['genus'] = cropbasics['genus_species'].apply(
    lambda full_name: full_name.split()[0]
    if pd.notna(full_name) and full_name.split()
    else None
)

In [8]:
# The 'species' column is the second whitespace-separated token of
# 'genus_species'; a missing name or a name with a single token gives None.
name_tokens = cropbasics['genus_species'].apply(
    lambda full_name: full_name.split() if pd.notna(full_name) else []
)
cropbasics['species'] = name_tokens.apply(
    lambda tokens: tokens[1] if len(tokens) > 1 else None
)
cropbasics['species']

0           esculentus
1              manihot
2            moschatus
3       auriculiformis
4           farnesiana
             ...      
2563      rotundifolia
2564       polyacantha
2565           affinis
2566         dasycarpa
2567            sativa
Name: species, Length: 2568, dtype: object

## fill in missing data with data from same genus 

In [9]:
def impute_within_group(group):
    """Fill missing values in one group using that group's own statistics.

    Columns whose dtype is float64 or int64 are filled with the group mean
    (if every value is missing the mean is NaN and the column stays empty).
    All other columns are filled with the group's most frequent value, or
    with the string 'Unknown' when the group has no non-missing value.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows of a single group (here: one genus).

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``group``. The input frame is not modified.
    """
    # Work on an explicit copy and assign the filled Series back. The previous
    # `group[col].fillna(..., inplace=True)` was chained in-place assignment:
    # it mutates the caller's data on older pandas and does nothing at all
    # under Copy-on-Write.
    filled = group.copy()
    for column in filled.columns:
        values = filled[column]
        if values.dtype in ['float64', 'int64']:  # Numeric columns
            fill_value = values.mean(skipna=True)
        else:  # Categorical columns
            # mode() is empty when every value is missing -> fall back to 'Unknown'.
            fill_value = values.mode().get(0, default='Unknown')
        filled[column] = values.fillna(fill_value)
    return filled

In [10]:
# Number of distinct values in the 'genus' column.
cropbasics['genus'].unique().size

1033

In [11]:
# Group rows by genus. group_keys=False asks pandas not to add the genus
# labels to the index of the .apply result, so rows keep their original index.
groupbygenus = cropbasics.groupby('genus', group_keys=False)

In [12]:
# Run the per-genus imputation on every group and display the combined result.
imputed_by_genus = groupbygenus.apply(impute_within_group)
imputed_by_genus

Unnamed: 0,crop_code,genus_species,Life.form,Habit,Life.span,Physiology,Category,Plant.attributes,temp_opt_min,Temp_Opt_Max,...,Level.of.mechanization,Labour.intensity,cycle_min,cycle_max,use.main,use.detailed,use.part,datasheet_url,genus,species
0,289,Abelmoschus esculentus,herb,erect,annual,single stem,vegetables,grown on large scale,20.000000,30.000000,...,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,esculentus
1,290,Abelmoschus manihot,shrub,erect,"annual, perennial","deciduous, multi stem","vegetables, ornamentals/turf, medicinals & aro...",grown on small scale,22.000000,30.000000,...,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,manihot
2,291,Abelmoschus moschatus,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",grown on large scale,20.000000,30.000000,...,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,moschatus
3,295,Acacia auriculiformis,tree,erect,perennial,single stem,forest/wood,grown on large scale,24.000000,32.000000,...,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,auriculiformis
4,297,Acacia farnesiana,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",grown on small scale,20.000000,32.000000,...,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,farnesiana
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,herb,prostrate/procumbent/semi-erect,"annual, perennial","deciduous, multi stem","forage/pasture, medicinals & aromatic",Unknown,20.000000,25.000000,...,Level of mechanization,Labour intensity,0,0,animal food (feed),vitamins,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Chamaecrista,rotundifolia
2564,400001,Acacia polyacantha,"shrub, tree",erect,perennial,"single stem, multi stem","forest/wood, environmental",harvested from wild,20.163043,32.228261,...,Level of mechanization,Labour intensity,0,0,fuels,fuelwood,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,polyacantha
2565,400002,Prosopis affinis,tree,erect,perennial,"deciduous, single stem, C3 photosynthesis",forest/wood,grown on small scale,21.000000,31.600000,...,Level of mechanization,Labour intensity,0,0,food & beverage,vitamins,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Prosopis,affinis
2566,400003,Vicia dasycarpa,herb,climber/scrambler/scadent,annual,multi stem,forage/pasture,grown on large scale,13.000000,22.636364,...,Level of mechanization,Labour intensity,0,0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Vicia,dasycarpa


In [13]:
# Check non-null counts after the per-genus imputation.
imputed_by_genus.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2568 entries, 0 to 2567
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   crop_code                   2568 non-null   int64  
 1   genus_species               2568 non-null   object 
 2   Life.form                   2568 non-null   object 
 3   Habit                       2568 non-null   object 
 4   Life.span                   2568 non-null   object 
 5   Physiology                  2568 non-null   object 
 6   Category                    2568 non-null   object 
 7   Plant.attributes            2568 non-null   object 
 8   temp_opt_min                2338 non-null   float64
 9   Temp_Opt_Max                2338 non-null   float64
 10  Temp_Abs_Min                2335 non-null   float64
 11  Temp_Abs_Max                2335 non-null   float64
 12  Rain_Opt_Min                2333 non-null   float64
 13  Rain_Opt_Max                2333 

## generate a comparison dataframe to see what was imputed

In [14]:
# Compare the table before and after the per-genus imputation, column by column.
df = cropbasics
imputed_df = imputed_by_genus

# For every column that changed, collect the original and the imputed values
# of the rows that differ.
comparison_data = {}

for column in df.columns:
    original_values = df[column]
    imputed_values = imputed_df[column]
    # NaN != NaN evaluates to True, so a cell that is missing on both sides
    # (i.e. was NOT imputed) would be reported as a change; exclude those.
    both_missing = original_values.isna() & imputed_values.isna()
    mask = (original_values != imputed_values) & ~both_missing  # Mask where changes occurred
    if mask.any():
        # Store original and imputed data directly in the dictionary
        comparison_data[f'{column}_original'] = df.loc[mask, column]
        comparison_data[f'{column}_imputed'] = imputed_df.loc[mask, column]

# Create the DataFrame from the dictionary; rows are the union of all changed
# rows, so a cell is NaN where that particular column did not change.
comparison_df = pd.DataFrame(comparison_data)

comparison_df

Unnamed: 0,Life.form_original,Life.form_imputed,Habit_original,Habit_imputed,Life.span_original,Life.span_imputed,Physiology_original,Physiology_imputed,Category_original,Category_imputed,...,Introduction.risks._original,Introduction.risks._imputed,Product..system_original,Product..system_imputed,use.main_original,use.main_imputed,use.detailed_original,use.detailed_imputed,use.part_original,use.part_imputed
0,,,,,,,,,,,...,,can become a weed,,Unknown,,,,,,
1,,,,,,,,"deciduous, multi stem",,,...,,can become a weed,,Unknown,,,,,,
2,,,,,,,,,,,...,,,,Unknown,,,,,,
3,,,,,,,,,,,...,,can become a weed,,Unknown,,,,,,
4,,,,,,,,,,,...,,,,Unknown,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,,herb,,prostrate/procumbent/semi-erect,,"annual, perennial",,"deciduous, multi stem",,"forage/pasture, medicinals & aromatic",...,,Unknown,,Unknown,,animal food (feed),,vitamins,,entire plant
2564,,"shrub, tree",,erect,,perennial,,"single stem, multi stem",,"forest/wood, environmental",...,,can become a weed,,Unknown,,fuels,,fuelwood,,bark
2565,,tree,,erect,,perennial,,"deciduous, single stem, C3 photosynthesis",,forest/wood,...,,can become a weed,,Unknown,,food & beverage,,vitamins,,bark
2566,,herb,,climber/scrambler/scadent,,annual,,multi stem,,forage/pasture,...,,can become a weed,,large scale/commercial,,animal food (feed),,minerals,,entire plant


## check imputed_by_genus columns


In [15]:
# Print each column name (preceded by a blank line) and its distinct values
# after the per-genus imputation.
for column in imputed_by_genus.columns:
    print(f"\n{column}")
    print(imputed_by_genus[column].unique())


crop_code
[   289    290    291 ... 400002 400003 400004]

genus_species
['Abelmoschus esculentus' 'Abelmoschus manihot' 'Abelmoschus moschatus'
 ... 'Prosopis affinis' 'Vicia dasycarpa' 'Camelina sativa']

Life.form
['herb' 'shrub' 'herb, sub-shrub' 'tree' 'shrub, tree' 'grass'
 'herb, vine' 'herb, vine, sub-shrub' 'sub-shrub' 'herb, shrub'
 'vine, shrub' 'vine' 'herb, shrub, tree' 'Unknown'
 'herb, sub-shrub, shrub' 'vine, shrub, tree' 'other' 'vine, sub-shrub'
 'sub-shrub, shrub' 'vine, tree']

Habit
['erect' 'prostrate/procumbent/semi-erect'
 'erect, acaulescent (or rosette plants)' 'climber/scrambler/scadent'
 'acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect'
 'erect, climber/scrambler/scadent'
 'prostrate/procumbent/semi-erect, climber/scrambler/scadent' 'Unknown'
 'prostrate/procumbent/semi-erect, acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect, climber/scrambler/scadent']

Life.span
['annual' 'annual, perennial' 'annual, bien

## further imputation

- convert numeric strings to numeric
- replace nan with mean of all values
- replace "no input" with mean
- replace "Unknown" with the mode of the column's other categories

In [16]:
def convert_and_impute(column):
    """Convert a column to numeric where possible and fill its remaining gaps.

    Behaviour by case:

    * dtype float64/int64: NaNs are replaced by the column mean.
    * otherwise the placeholder 'no input' is treated as missing and the
      values are parsed with ``pd.to_numeric(..., errors='coerce')``:

      - if nothing parses as a number, the column is treated as categorical
        and every 'Unknown' entry is replaced by the most frequent other
        value (the column is returned unchanged when there is no 'Unknown'
        or no other value to take a mode from);
      - if at least one value parses, the numeric version is returned with
        NaNs replaced by the mean. Note that any non-numeric text in such a
        column is coerced to NaN and therefore also replaced by the mean.
        The result is cast to int64 when every value is a whole number.

    Parameters
    ----------
    column : pandas.Series

    Returns
    -------
    pandas.Series
        The converted/imputed column, or ``column`` itself when it cannot be
        processed.
    """
    # Check if the column is already numeric
    if column.dtype in ['float64', 'int64']:
        return column.fillna(column.mean())

    # Attempt to convert non-numeric data. Only the parsing step is guarded,
    # and only for the error types it is expected to raise, so that genuine
    # bugs are no longer hidden behind a blanket `except Exception`.
    try:
        without_placeholder = column.mask(column == 'no input')
        numeric_values = pd.to_numeric(without_placeholder, errors='coerce')
    except (TypeError, ValueError) as e:
        print(f"Error processing column: {e}")
        return column  # Return as is if the column cannot be parsed

    if numeric_values.isna().all():
        # Purely categorical column: replace 'Unknown' by the mode of the rest.
        if 'Unknown' in column.values:
            known_modes = column[column != 'Unknown'].mode()
            if known_modes.empty:
                # Nothing but 'Unknown'/missing values: no mode to impute with.
                return column
            return column.replace('Unknown', known_modes.iloc[0])
        return column  # Return original if no numerics and no 'Unknown'

    # Fill NaNs with the mean value
    filled_column = numeric_values.fillna(numeric_values.mean())

    # Cast whole-number floats to integers. to_numeric may already have
    # produced an integer dtype (e.g. for ['1', '2', '3']); that case used to
    # crash in `apply(float.is_integer)` and silently return the raw strings.
    if pd.api.types.is_float_dtype(filled_column) and (filled_column % 1 == 0).all():
        filled_column = filled_column.astype('int64')
    return filled_column

In [17]:
# Inspect the non-null count and dtype of the 'genus_species' column.
imputed_by_genus['genus_species'].info()

<class 'pandas.core.series.Series'>
Int64Index: 2568 entries, 0 to 2567
Series name: genus_species
Non-Null Count  Dtype 
--------------  ----- 
2568 non-null   object
dtypes: object(1)
memory usage: 104.7+ KB


In [18]:
# Second imputation pass: run convert_and_impute on every column of the
# genus-imputed table, writing the results into a separate copy.
impute_numerics = imputed_by_genus.copy()
for column, column_values in imputed_by_genus.items():
    impute_numerics[column] = convert_and_impute(column_values)

In [19]:
# Check non-null counts and dtypes after the column-wise imputation pass.
impute_numerics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2568 entries, 0 to 2567
Data columns (total 65 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   crop_code                   2568 non-null   int64  
 1   genus_species               2568 non-null   object 
 2   Life.form                   2568 non-null   object 
 3   Habit                       2568 non-null   object 
 4   Life.span                   2568 non-null   object 
 5   Physiology                  2568 non-null   object 
 6   Category                    2568 non-null   object 
 7   Plant.attributes            2568 non-null   object 
 8   temp_opt_min                2568 non-null   float64
 9   Temp_Opt_Max                2568 non-null   float64
 10  Temp_Abs_Min                2568 non-null   float64
 11  Temp_Abs_Max                2568 non-null   float64
 12  Rain_Opt_Min                2568 non-null   float64
 13  Rain_Opt_Max                2568 

## save clean dataframe

In [20]:
# Persist the fully imputed table as a pickle file.
cropbasics_clean_df_file = '../data/agricultural/EcoCrop/generated_data/cropbasics_clean_df.pkl'
pd.to_pickle(impute_numerics, cropbasics_clean_df_file)