# EcoCrop data cleaning and imputation

## load libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import altair as alt
# import seaborn as sns
import networkx as nx
import warnings

from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list, ward
from scipy.spatial.distance import pdist
from scipy.sparse import csr_matrix
from scipy.cluster.hierarchy import fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples, davies_bouldin_score, calinski_harabasz_score
from collections import defaultdict

## download pre-scraped data and examine files

https://github.com/supersistence/EcoCrop-ScrapeR?tab=readme-ov-file

In [2]:
# cropbasics_data = '../data/agricultural/EcoCrop/ScrapeR/cropbasics_scrape.csv'
cropbasics_data = '../data/crops/cropbasics_scrape.csv'
cropbasics = pd.read_csv(cropbasics_data, na_values=['-', '---', 'nan', np.nan])
cropbasics

Unnamed: 0,crop_code,species,Life.form,Habit,Life.span,Physiology,Category,Plant.attributes,temp_opt_min,Temp_Opt_Max,...,Subsystem,Companion.species,Level.of.mechanization,Labour.intensity,cycle_min,cycle_max,use.main,use.detailed,use.part,datasheet_url
0,289,Abelmoschus esculentus,herb,erect,annual,single stem,vegetables,grown on large scale,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
1,290,Abelmoschus manihot,shrub,erect,"annual, perennial",,"vegetables, ornamentals/turf, medicinals & aro...",grown on small scale,22.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2,291,Abelmoschus moschatus,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
3,295,Acacia auriculiformis,tree,erect,perennial,single stem,forest/wood,grown on large scale,24.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
4,297,Acacia farnesiana,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",grown on small scale,20.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2564,400001,Acacia polyacantha,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2565,400002,Prosopis affinis,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2566,400003,Vicia dasycarpa,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...


## look at cropbasics

In [3]:
cropbasics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568 entries, 0 to 2567
Data columns (total 63 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   crop_code                   2568 non-null   int64  
 1   species                     2568 non-null   object 
 2   Life.form                   2306 non-null   object 
 3   Habit                       2272 non-null   object 
 4   Life.span                   2286 non-null   object 
 5   Physiology                  1895 non-null   object 
 6   Category                    2296 non-null   object 
 7   Plant.attributes            1290 non-null   object 
 8   temp_opt_min                2072 non-null   float64
 9   Temp_Opt_Max                2072 non-null   float64
 10  Temp_Abs_Min                2069 non-null   float64
 11  Temp_Abs_Max                2069 non-null   float64
 12  Rain_Opt_Min                2068 non-null   float64
 13  Rain_Opt_Max                2068 

In [4]:
columns = cropbasics.columns
for col in columns:
    print(col)
    # print(f"{cropbasics[col][:10]}\n")
    print(cropbasics[col].unique())

crop_code
[   289    290    291 ... 400002 400003 400004]
species
['Abelmoschus esculentus' 'Abelmoschus manihot' 'Abelmoschus moschatus'
 ... 'Prosopis affinis' 'Vicia dasycarpa' 'Camelina sativa']
Life.form
['herb' 'shrub' 'herb, sub-shrub' 'tree' 'shrub, tree' nan 'grass'
 'herb, vine' 'herb, vine, sub-shrub' 'sub-shrub' 'herb, shrub'
 'vine, shrub' 'vine' 'herb, shrub, tree' 'herb, sub-shrub, shrub'
 'vine, shrub, tree' 'other' 'vine, sub-shrub' 'sub-shrub, shrub'
 'vine, tree']
Habit
['erect' 'prostrate/procumbent/semi-erect' nan
 'erect, acaulescent (or rosette plants)' 'climber/scrambler/scadent'
 'acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect'
 'erect, climber/scrambler/scadent'
 'prostrate/procumbent/semi-erect, climber/scrambler/scadent'
 'prostrate/procumbent/semi-erect, acaulescent (or rosette plants)'
 'erect, prostrate/procumbent/semi-erect, climber/scrambler/scadent']
Life.span
['annual' 'annual, perennial' 'annual, biennial, perennial' 'peren

## rename columns

In [5]:
list(columns)

['crop_code',
 'species',
 'Life.form',
 'Habit',
 'Life.span',
 'Physiology',
 'Category',
 'Plant.attributes',
 'temp_opt_min',
 'Temp_Opt_Max',
 'Temp_Abs_Min',
 'Temp_Abs_Max',
 'Rain_Opt_Min',
 'Rain_Opt_Max',
 'Rain_Abs_Min',
 'Rain_Abs_Max',
 'Lat_Opt_Min',
 'Lat_Opt_Max',
 'Lat_Abs_Min',
 'Lat_Abs_Max',
 'Alt_Opt_Min',
 'Alt_Opt_Max',
 'Alt_Abs_Min',
 'Alt_Abs_Max',
 'pH_Opt_Min',
 'pH_Opt_Max',
 'pH_Abs_Min',
 'pH_Abs_Max',
 'Light_Opt_Min',
 'Light_Opt_Max',
 'Light_Abs_Min',
 'Light_Abs_Max',
 'Depth_Opt',
 'Depth_Abs',
 'Texture_Ops',
 'Texture_Abs',
 'Fertility_Ops',
 'Fertility_Abs',
 'Al_Toxicity_Opt',
 'Al_Toxicity_Abs',
 'Salinity_Ops',
 'Salinity_Abs',
 'drainage_opt',
 'drainage_abs',
 'Climate.Zone',
 'photoperiod',
 'Killing.temp..during.rest',
 'Killing.temp..early.growth',
 'Abiotic.toler.',
 'Abiotic.suscept.',
 'Introduction.risks.',
 'Product..system',
 'Cropping.system',
 'Subsystem',
 'Companion.species',
 'Level.of.mechanization',
 'Labour.intensity',
 'cyc

In [6]:
renamed_columms = [
    'Crop_Code',
    'Scientific_Name',
    'Life_Form',
    'Habit',
    'Life_Span',
    'Physiology',
    'Category',
    'Plant_Attributes',
    'Temp_Opt_Min',
    'Temp_Opt_Max',
    'Temp_Abs_Min',
    'Temp_Abs_Max',
    'Rain_Opt_Min',
    'Rain_Opt_Max',
    'Rain_Abs_Min',
    'Rain_Abs_Max',
    'Lat_Opt_Min',
    'Lat_Opt_Max',
    'Lat_Abs_Min',
    'Lat_Abs_Max',
    'Alt_Opt_Min',
    'Alt_Opt_Max',
    'Alt_Abs_Min',
    'Alt_Abs_Max',
    'pH_Opt_Min',
    'pH_Opt_Max',
    'pH_Abs_Min',
    'pH_Abs_Max',
    'Light_Opt_Min',
    'Light_Opt_Max',
    'Light_Abs_Min',
    'Light_Abs_Max',
    'Depth_Opt',
    'Depth_Abs',
    'Texture_Ops',
    'Texture_Abs',
    'Fertility_Ops',
    'Fertility_Abs',
    'Al_Toxicity_Opt',
    'Al_Toxicity_Abs',
    'Salinity_Ops',
    'Salinity_Abs',
    'Drainage_Opt',
    'Drainage_Abs',
    'Climate_Zone_Trewartha',
    'Photoperiod',
    'Killing_Temp_Rest',
    'Killing_Temp_Growth',
    'Abiotic_Tolererance',
    'Abiotic_Susceptibility',
    'Introduction_Risks',
    'Production_System',
    'Cropping_System',
    'Subsystem',
    'Companion_Species',
    'Level_of_Mechanization',
    'Labour_Intensity',
    'Crop_Cycle_Min',
    'Crop_Cycle_Max',
    'Use_Main',
    'Use_Detailed',
    'Use_Part',
    'Datasheet_URL'
]

In [7]:
cropbasics.columns = renamed_columms

In [8]:
# list(cropbasics.columns)

In [9]:
cropbasics

Unnamed: 0,Crop_Code,Scientific_Name,Life_Form,Habit,Life_Span,Physiology,Category,Plant_Attributes,Temp_Opt_Min,Temp_Opt_Max,...,Subsystem,Companion_Species,Level_of_Mechanization,Labour_Intensity,Crop_Cycle_Min,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL
0,289,Abelmoschus esculentus,herb,erect,annual,single stem,vegetables,grown on large scale,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
1,290,Abelmoschus manihot,shrub,erect,"annual, perennial",,"vegetables, ornamentals/turf, medicinals & aro...",grown on small scale,22.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2,291,Abelmoschus moschatus,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",,20.0,30.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
3,295,Acacia auriculiformis,tree,erect,perennial,single stem,forest/wood,grown on large scale,24.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
4,297,Acacia farnesiana,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",grown on small scale,20.0,32.0,...,Subsystem,Companion species,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2564,400001,Acacia polyacantha,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2565,400002,Prosopis affinis,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...
2566,400003,Vicia dasycarpa,,,,,,,,,...,Subsystem,Companion species,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...


In [10]:
ecocrop_openclim = pd.read_csv('../data/crops/EcoCrop_DB2_usda_hardiness.csv')
ecocrop_openclim

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,EcoPortCode,ScientificName,AUTH,FAMNAME,SYNO,COMNAME,LIFO,...,PROSY,GMIN,GMAX,Genus,Species,Variety,PFAF_URL,USDA_HARDINESS_ZONE,USDA_HARDINESS_ZONE_MIN,USDA_HARDINESS_ZONE_MAX
0,0,0,0,289,Abelmoschus esculentus,(L.) Moench,Magnoliopsida:Dilleniidae:Malvales:Malvaceae,Hibiscus esculentus L.,"abelmoskus, america-neri, bakhua mun, bamia, b...",herb,...,,50.0,180.0,Abelmoschus,esculentus,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,5-11,5.0,11.0
1,1,1,1,290,Abelmoschus manihot,(L.) Medic.,Magnoliopsida:Dilleniidae:Malvales:Malvaceae,"Hibiscus manihot L. (1753), Abelmoschus maniho...","neka (Simbo), bele (Fiji), pele (Tonga, Tuvalu...",shrub,...,,365.0,365.0,Abelmoschus,manihot,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11,8.0,11.0
2,2,2,2,291,Abelmoschus moschatus,Medic.,Magnoliopsida:Dilleniidae:Malvales:Malvaceae,Hibiscus abelmoschus L. (1753).,"abelmosk, musk mallow, mushkdan, muskdana, kas...","herb, sub-shrub",...,,0.0,0.0,Abelmoschus,moschatus,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11,8.0,11.0
3,3,3,3,295,Acacia auriculiformis,Cunn. ex Benth.,Magnoliopsida:Rosidae:Fabales:Leguminosae,Racosperma auriculiforme (A. Cunn. ex Benth.) ...,"Papuan wattle, auri, ear leaf acacia, tuhkehn ...",tree,...,,180.0,240.0,Acacia,auriculiformis,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12,10.0,12.0
4,4,4,4,297,Acacia farnesiana,(L.) Willd.,Magnoliopsida:Rosidae:Fabales:Leguminosae,"Mimosa farnesiana L., Mimosa acicularis Poir.,...","Ellington curse, klu, cassie, espino blanco, a...",tree,...,,60.0,240.0,Acacia,farnesiana,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,9-11,9.0,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,2563,2563,2563,400000,Chamaecrista rotundifolia,Persoon,Leguminosae,Cassia rotundifolia,Pasto rastiero,,...,,0.0,0.0,Chamaecrista,rotundifolia,,https://pfaf.org/user/Plant.aspx?LatinName=Cha...,9-11,9.0,11.0
2564,2564,2564,2564,400001,Acacia polyacantha,Willd.,Leguminosae,Acacia catechu,Black catechu,,...,,0.0,0.0,Acacia,polyacantha,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,,,
2565,2565,2565,2565,400002,Prosopis affinis,(Sw.) DC.,Leguminosae,Prosopis algarrobilla,Algarobilla,,...,,0.0,0.0,Prosopis,affinis,,https://pfaf.org/user/Plant.aspx?LatinName=Pro...,,,
2566,2566,2566,2566,400003,Vicia dasycarpa,Roth.,Leguminosae,Vicia villosa ssp. varia,Hairy vetch,,...,,0.0,0.0,Vicia,dasycarpa,,https://pfaf.org/user/Plant.aspx?LatinName=Vic...,,,


In [11]:
ecocrop_openclim.columns

Index(['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'EcoPortCode',
       'ScientificName', 'AUTH', 'FAMNAME', 'SYNO', 'COMNAME', 'LIFO', 'HABI',
       'LISPA', 'PHYS', 'CAT', 'PLAT', 'TOPMN', 'TOPMX', 'TMIN', 'TMAX',
       'ROPMN', 'ROPMX', 'RMIN', 'RMAX', 'PHOPMN', 'PHOPMX', 'PHMIN', 'PHMAX',
       'LATOPMN', 'LATOPMX', 'LATMN', 'LATMX', 'ALTMX', 'LIOPMN', 'LIOPMX',
       'LIMN', 'LIMX', 'DEP', 'DEPR', 'TEXT', 'TEXTR', 'FER', 'FERR', 'TOX',
       'TOXR', 'SAL', 'SALR', 'DRA', 'DRAR', 'KTMPR', 'KTMP', 'PHOTO', 'CLIZ',
       'ABITOL', 'ABISUS', 'INTRI', 'PROSY', 'GMIN', 'GMAX', 'Genus',
       'Species', 'Variety', 'PFAF_URL', 'USDA_HARDINESS_ZONE',
       'USDA_HARDINESS_ZONE_MIN', 'USDA_HARDINESS_ZONE_MAX'],
      dtype='object')

In [12]:
ecocrop_combined = cropbasics.copy()

In [13]:
ecocrop_combined[['Genus', 'Species', 'Variety', 'PFAF_URL', 'USDA_Hardiness_Zone']] = ecocrop_openclim[['Genus', 'Species', 'Variety',
       'PFAF_URL', 'USDA_HARDINESS_ZONE']]

In [14]:
ecocrop_combined

Unnamed: 0,Crop_Code,Scientific_Name,Life_Form,Habit,Life_Span,Physiology,Category,Plant_Attributes,Temp_Opt_Min,Temp_Opt_Max,...,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL,Genus,Species,Variety,PFAF_URL,USDA_Hardiness_Zone
0,289,Abelmoschus esculentus,herb,erect,annual,single stem,vegetables,grown on large scale,20.0,30.0,...,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,esculentus,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,5-11
1,290,Abelmoschus manihot,shrub,erect,"annual, perennial",,"vegetables, ornamentals/turf, medicinals & aro...",grown on small scale,22.0,30.0,...,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,manihot,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
2,291,Abelmoschus moschatus,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",,20.0,30.0,...,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Abelmoschus,moschatus,,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
3,295,Acacia auriculiformis,tree,erect,perennial,single stem,forest/wood,grown on large scale,24.0,32.0,...,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,auriculiformis,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12
4,297,Acacia farnesiana,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",grown on small scale,20.0,32.0,...,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,farnesiana,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,9-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,,,,,,,,,...,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Chamaecrista,rotundifolia,,https://pfaf.org/user/Plant.aspx?LatinName=Cha...,9-11
2564,400001,Acacia polyacantha,,,,,,,,,...,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Acacia,polyacantha,,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,
2565,400002,Prosopis affinis,,,,,,,,,...,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Prosopis,affinis,,https://pfaf.org/user/Plant.aspx?LatinName=Pro...,
2566,400003,Vicia dasycarpa,,,,,,,,,...,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,Vicia,dasycarpa,,https://pfaf.org/user/Plant.aspx?LatinName=Vic...,


In [15]:
list(ecocrop_combined.columns)

['Crop_Code',
 'Scientific_Name',
 'Life_Form',
 'Habit',
 'Life_Span',
 'Physiology',
 'Category',
 'Plant_Attributes',
 'Temp_Opt_Min',
 'Temp_Opt_Max',
 'Temp_Abs_Min',
 'Temp_Abs_Max',
 'Rain_Opt_Min',
 'Rain_Opt_Max',
 'Rain_Abs_Min',
 'Rain_Abs_Max',
 'Lat_Opt_Min',
 'Lat_Opt_Max',
 'Lat_Abs_Min',
 'Lat_Abs_Max',
 'Alt_Opt_Min',
 'Alt_Opt_Max',
 'Alt_Abs_Min',
 'Alt_Abs_Max',
 'pH_Opt_Min',
 'pH_Opt_Max',
 'pH_Abs_Min',
 'pH_Abs_Max',
 'Light_Opt_Min',
 'Light_Opt_Max',
 'Light_Abs_Min',
 'Light_Abs_Max',
 'Depth_Opt',
 'Depth_Abs',
 'Texture_Ops',
 'Texture_Abs',
 'Fertility_Ops',
 'Fertility_Abs',
 'Al_Toxicity_Opt',
 'Al_Toxicity_Abs',
 'Salinity_Ops',
 'Salinity_Abs',
 'Drainage_Opt',
 'Drainage_Abs',
 'Climate_Zone_Trewartha',
 'Photoperiod',
 'Killing_Temp_Rest',
 'Killing_Temp_Growth',
 'Abiotic_Tolererance',
 'Abiotic_Susceptibility',
 'Introduction_Risks',
 'Production_System',
 'Cropping_System',
 'Subsystem',
 'Companion_Species',
 'Level_of_Mechanization',
 'Labour_In

In [16]:
reordered_columns = [
    'Crop_Code',
    'Scientific_Name',
    'Genus',
    'Species',
    'Variety',
    'Life_Form',
    'Habit',
    'Life_Span',
    'Physiology',
    'Category',
    'Plant_Attributes',
    'Temp_Opt_Min',
    'Temp_Opt_Max',
    'Temp_Abs_Min',
    'Temp_Abs_Max',
    'Rain_Opt_Min',
    'Rain_Opt_Max',
    'Rain_Abs_Min',
    'Rain_Abs_Max',
    'Lat_Opt_Min',
    'Lat_Opt_Max',
    'Lat_Abs_Min',
    'Lat_Abs_Max',
    'Alt_Opt_Min',
    'Alt_Opt_Max',
    'Alt_Abs_Min',
    'Alt_Abs_Max',
    'pH_Opt_Min',
    'pH_Opt_Max',
    'pH_Abs_Min',
    'pH_Abs_Max',
    'Light_Opt_Min',
    'Light_Opt_Max',
    'Light_Abs_Min',
    'Light_Abs_Max',
    'Depth_Opt',
    'Depth_Abs',
    'Texture_Ops',
    'Texture_Abs',
    'Fertility_Ops',
    'Fertility_Abs',
    'Al_Toxicity_Opt',
    'Al_Toxicity_Abs',
    'Salinity_Ops',
    'Salinity_Abs',
    'Drainage_Opt',
    'Drainage_Abs',
    'Climate_Zone_Trewartha',
    'Photoperiod',
    'Killing_Temp_Rest',
    'Killing_Temp_Growth',
    'Abiotic_Tolererance',
    'Abiotic_Susceptibility',
    'Introduction_Risks',
    'Production_System',
    'Cropping_System',
    'Subsystem',
    'Companion_Species',
    'Level_of_Mechanization',
    'Labour_Intensity',
    'Crop_Cycle_Min',
    'Crop_Cycle_Max',
    'Use_Main',
    'Use_Detailed',
    'Use_Part',
    'Datasheet_URL',
    'PFAF_URL',
    'USDA_Hardiness_Zone'
]

In [17]:
ecocrop_combined = ecocrop_combined[reordered_columns]

In [18]:
ecocrop_combined

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,Variety,Life_Form,Habit,Life_Span,Physiology,Category,...,Level_of_Mechanization,Labour_Intensity,Crop_Cycle_Min,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL,PFAF_URL,USDA_Hardiness_Zone
0,289,Abelmoschus esculentus,Abelmoschus,esculentus,,herb,erect,annual,single stem,vegetables,...,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,5-11
1,290,Abelmoschus manihot,Abelmoschus,manihot,,shrub,erect,"annual, perennial",,"vegetables, ornamentals/turf, medicinals & aro...",...,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
2,291,Abelmoschus moschatus,Abelmoschus,moschatus,,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",...,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
3,295,Acacia auriculiformis,Acacia,auriculiformis,,tree,erect,perennial,single stem,forest/wood,...,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12
4,297,Acacia farnesiana,Acacia,farnesiana,,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",...,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,9-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,Chamaecrista,rotundifolia,,,,,,,...,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Cha...,9-11
2564,400001,Acacia polyacantha,Acacia,polyacantha,,,,,,,...,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,
2565,400002,Prosopis affinis,Prosopis,affinis,,,,,,,...,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Pro...,
2566,400003,Vicia dasycarpa,Vicia,dasycarpa,,,,,,,...,Level of mechanization,Labour intensity,0,0,,,,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Vic...,


## fill in missing data with data from same genus 

In [19]:
def impute_numeric(group):
    return group.transform(lambda x: x.fillna(x.mean()))

def impute_categorical(group):
    return group.transform(lambda x: x.fillna(x.mode().iloc[0] if not x.mode().empty else 'Unknown'))

In [20]:
len(ecocrop_combined['Genus'].unique())

1033

In [21]:
groupbygenus = ecocrop_combined.groupby('Genus')

In [22]:
# Impute numeric columns
imputed_by_genus = ecocrop_combined.copy()

numeric_cols = imputed_by_genus.select_dtypes(include=['float64', 'int64']).columns
imputed_by_genus[numeric_cols] = groupbygenus[numeric_cols].transform(impute_numeric)

# Impute categorical columns
categorical_cols = imputed_by_genus.select_dtypes(exclude=['float64', 'int64']).columns
imputed_by_genus[categorical_cols] = groupbygenus[categorical_cols].transform(impute_categorical)

imputed_by_genus

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,Variety,Life_Form,Habit,Life_Span,Physiology,Category,...,Level_of_Mechanization,Labour_Intensity,Crop_Cycle_Min,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL,PFAF_URL,USDA_Hardiness_Zone
0,289,Abelmoschus esculentus,Abelmoschus,esculentus,Unknown,herb,erect,annual,single stem,vegetables,...,Level of mechanization,Labour intensity,50,180,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,5-11
1,290,Abelmoschus manihot,Abelmoschus,manihot,Unknown,shrub,erect,"annual, perennial","deciduous, multi stem","vegetables, ornamentals/turf, medicinals & aro...",...,Level of mechanization,Labour intensity,365,365,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
2,291,Abelmoschus moschatus,Abelmoschus,moschatus,Unknown,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",...,Level of mechanization,Labour intensity,0,0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11
3,295,Acacia auriculiformis,Acacia,auriculiformis,Unknown,tree,erect,perennial,single stem,forest/wood,...,Level of mechanization,Labour intensity,180,240,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12
4,297,Acacia farnesiana,Acacia,farnesiana,Unknown,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",...,Level of mechanization,Labour intensity,60,240,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,9-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,Chamaecrista,rotundifolia,Unknown,herb,prostrate/procumbent/semi-erect,"annual, perennial","deciduous, multi stem","forage/pasture, medicinals & aromatic",...,Level of mechanization,Labour intensity,0,0,animal food (feed),vitamins,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Cha...,9-11
2564,400001,Acacia polyacantha,Acacia,polyacantha,Unknown,"shrub, tree",erect,perennial,"single stem, multi stem","forest/wood, environmental",...,Level of mechanization,Labour intensity,0,0,fuels,fuelwood,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12
2565,400002,Prosopis affinis,Prosopis,affinis,Unknown,tree,erect,perennial,"deciduous, single stem, C3 photosynthesis",forest/wood,...,Level of mechanization,Labour intensity,0,0,food & beverage,vitamins,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Pro...,10-12
2566,400003,Vicia dasycarpa,Vicia,dasycarpa,Unknown,herb,climber/scrambler/scadent,annual,multi stem,forage/pasture,...,Level of mechanization,Labour intensity,0,0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Vic...,Coming soon


In [23]:
imputed_by_genus.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568 entries, 0 to 2567
Data columns (total 68 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Crop_Code               2568 non-null   int64  
 1   Scientific_Name         2568 non-null   object 
 2   Genus                   2568 non-null   object 
 3   Species                 2568 non-null   object 
 4   Variety                 2568 non-null   object 
 5   Life_Form               2568 non-null   object 
 6   Habit                   2568 non-null   object 
 7   Life_Span               2568 non-null   object 
 8   Physiology              2568 non-null   object 
 9   Category                2568 non-null   object 
 10  Plant_Attributes        2568 non-null   object 
 11  Temp_Opt_Min            2338 non-null   float64
 12  Temp_Opt_Max            2338 non-null   float64
 13  Temp_Abs_Min            2335 non-null   float64
 14  Temp_Abs_Max            2335 non-null   

## generate a comparison dataframe to see what was imputed

In [24]:
comparison_df = pd.DataFrame()
df = ecocrop_combined
imputed_df = imputed_by_genus

# Initialize a dictionary to collect data
comparison_data = {}

for column in df.columns:
    mask = df[column] != imputed_df[column]  # Mask where changes occurred
    if mask.any():
        # Store original and imputed data directly in the dictionary
        comparison_data[f'{column}_original'] = df.loc[mask, column]
        comparison_data[f'{column}_imputed'] = imputed_df.loc[mask, column]

# Create the DataFrame from the dictionary
comparison_df = pd.DataFrame(comparison_data)

# Output the DataFrame to see the results
# print(comparison_df)
comparison_df

Unnamed: 0,Variety_original,Variety_imputed,Life_Form_original,Life_Form_imputed,Habit_original,Habit_imputed,Life_Span_original,Life_Span_imputed,Physiology_original,Physiology_imputed,...,Production_System_original,Production_System_imputed,Use_Main_original,Use_Main_imputed,Use_Detailed_original,Use_Detailed_imputed,Use_Part_original,Use_Part_imputed,USDA_Hardiness_Zone_original,USDA_Hardiness_Zone_imputed
0,,Unknown,,,,,,,,,...,,Unknown,,,,,,,,
1,,Unknown,,,,,,,,"deciduous, multi stem",...,,Unknown,,,,,,,,
2,,Unknown,,,,,,,,,...,,Unknown,,,,,,,,
3,,Unknown,,,,,,,,,...,,Unknown,,,,,,,,
4,,Unknown,,,,,,,,,...,,Unknown,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,,Unknown,,herb,,prostrate/procumbent/semi-erect,,"annual, perennial",,"deciduous, multi stem",...,,Unknown,,animal food (feed),,vitamins,,entire plant,,
2564,,Unknown,,"shrub, tree",,erect,,perennial,,"single stem, multi stem",...,,Unknown,,fuels,,fuelwood,,bark,,10-12
2565,,Unknown,,tree,,erect,,perennial,,"deciduous, single stem, C3 photosynthesis",...,,Unknown,,food & beverage,,vitamins,,bark,,10-12
2566,,Unknown,,herb,,climber/scrambler/scadent,,annual,,multi stem,...,,large scale/commercial,,animal food (feed),,minerals,,entire plant,,Coming soon


## check imputed_by_genus columns


In [25]:
for column in imputed_by_genus.columns:
    print()
    print(column)
    print(imputed_by_genus[column].unique())
    # [print(type(data), data) for data in (imputed_by_genus_non_null[column].unique())]


Crop_Code
[   289    290    291 ... 400002 400003 400004]

Scientific_Name
['Abelmoschus esculentus' 'Abelmoschus manihot' 'Abelmoschus moschatus'
 ... 'Prosopis affinis' 'Vicia dasycarpa' 'Camelina sativa']

Genus
['Abelmoschus' 'Acacia' 'Adenanthera' ... 'Thysanolaena' 'Triodia'
 'Camelina']

Species
['esculentus' 'manihot' 'moschatus' ... 'orbiculata' 'polyacantha'
 'dasycarpa']

Variety
['Unknown' 'aggregatum' 'dulce' 'cicla' 'acephala' 'cepa' 'botrytis'
 'capitata' 'gemmifera' 'italica' 'assamica' 'pendulum' 'matogrossense'
 'antiquorum' 'brevides' 'lenabatu' 'bicostata' 'altissima' 'reptans'
 'piperascens' 'coloratum' 'glabrum' 'edulis' 'flavicarpa' 'Br.'
 'oleiferus' 'levis' 'sericea' 'splendida' 'sweet' 'guianensis' 'crispa'
 'gongyloides' 'colurna' 'gerardii' 'rapaceum' 'crassa' 'flavescens'
 'napobrassica' 'rapifera' 'Murray' 'verrucosa' 'sinensis' 'coronarium'
 'motia' 'sofia' 'mahapengiri' 'longan' 'malesianus' 'pseudoglobulus'
 'commutata' 'rubra' 'willemettiana' 'sabdari

## further imputation

- convert numeric strings to numeric
- replace nan with mean of all values
- replace "no input" with mean
- replace "Unknown" with mode of categories

In [26]:
def convert_and_impute(column):
    # Check if the column is already numeric
    if column.dtype in ['float64', 'int64']:
        # Calculate mean and fill NaNs
        mean_value = column.mean()
        return column.fillna(mean_value)
    
    else:
        # Attempt to convert non-numeric data
        try:
            numeric_values = pd.to_numeric(column.replace('no input', np.nan), errors='coerce')
            if numeric_values.isna().all():
                # If all values are NaN, check if 'Unknown' needs to be replaced
                if 'Unknown' in column.values:
                    mode = column[column != 'Unknown'].mode().iloc[0] if not column[column != 'Unknown'].empty else 'Unknown'
                    return column.replace('Unknown', mode)
                return column  # Return original if no numerics and no 'Unknown'
            
            # Fill NaNs with the mean value
            mean_value = numeric_values.mean()
            filled_column = numeric_values.fillna(mean_value)

            # Ensure no NaN remains before converting to integer
            if filled_column.notna().all() and filled_column.apply(float.is_integer).all():
                filled_column = filled_column.astype(int)
            return filled_column

        except Exception as e:
            print(f"Error processing column: {e}")
            return column  # Return as is if exception

In [27]:
imputed_by_genus['Scientific_Name'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 2568 entries, 0 to 2567
Series name: Scientific_Name
Non-Null Count  Dtype 
--------------  ----- 
2568 non-null   object
dtypes: object(1)
memory usage: 20.2+ KB


In [28]:
impute_numerics = imputed_by_genus.copy()
for column in impute_numerics.columns:
    # print(column)
    impute_numerics[column] = convert_and_impute(impute_numerics[column]) 

In [29]:
impute_numerics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568 entries, 0 to 2567
Data columns (total 68 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Crop_Code               2568 non-null   int64  
 1   Scientific_Name         2568 non-null   object 
 2   Genus                   2568 non-null   object 
 3   Species                 2568 non-null   object 
 4   Variety                 2568 non-null   object 
 5   Life_Form               2568 non-null   object 
 6   Habit                   2568 non-null   object 
 7   Life_Span               2568 non-null   object 
 8   Physiology              2568 non-null   object 
 9   Category                2568 non-null   object 
 10  Plant_Attributes        2568 non-null   object 
 11  Temp_Opt_Min            2568 non-null   float64
 12  Temp_Opt_Max            2568 non-null   float64
 13  Temp_Abs_Min            2568 non-null   float64
 14  Temp_Abs_Max            2568 non-null   

## single value corrections

In [30]:
# Using .at to set a single value
impute_numerics.at[2259, 'USDA_Hardiness_Zone'] = '10-12'

In [31]:
impute_numerics[impute_numerics['Temp_Opt_Min']==2000][['Temp_Opt_Min', 'Temp_Opt_Max', 'Temp_Abs_Min', 'Temp_Abs_Max']]

Unnamed: 0,Temp_Opt_Min,Temp_Opt_Max,Temp_Abs_Min,Temp_Abs_Max
1302,2000.0,3500.0,1500.0,4000.0


In [32]:
impute_numerics.at[1302, 'Temp_Opt_Min'] = 20.0
impute_numerics.at[1302, 'Temp_Opt_Max'] = 35.0
impute_numerics.at[1302, 'Temp_Abs_Min'] = 15.0
impute_numerics.at[1302, 'Temp_Abs_Max'] = 40.0

In [33]:
impute_numerics[impute_numerics['Temp_Opt_Min']==2000][['Temp_Opt_Min', 'Temp_Opt_Max', 'Temp_Abs_Min', 'Temp_Abs_Max']]

Unnamed: 0,Temp_Opt_Min,Temp_Opt_Max,Temp_Abs_Min,Temp_Abs_Max


In [34]:
usda_mode = impute_numerics[impute_numerics['USDA_Hardiness_Zone'] != 'Coming Soon']['USDA_Hardiness_Zone'].mode()[0]
usda_mode

'10-12'

In [35]:
impute_numerics[impute_numerics['USDA_Hardiness_Zone'] == 'Coming soon']

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,Variety,Life_Form,Habit,Life_Span,Physiology,Category,...,Level_of_Mechanization,Labour_Intensity,Crop_Cycle_Min,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL,PFAF_URL,USDA_Hardiness_Zone
35,432,Apium graveolens var. dulce,Apium,graveolens,dulce,herb,"erect, acaulescent (or rosette plants)","annual, biennial",single stem,"vegetables, materials",...,Level of mechanization,Labour intensity,80.0,160.0,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Api...,Coming soon
42,481,Avena sativa,Avena,sativa,bicostata,grass,erect,annual,"deciduous, multi stem","cereals & pseudocereals, cover crop",...,Level of mechanization,Labour intensity,110.0,270.0,food & beverage,starch,seeds,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Ave...,Coming soon
53,517,Bidens pilosa,Bidens,pilosa,bicostata,herb,erect,"annual, perennial",multi stem,"vegetables, ornamentals/turf, medicinals & aro...",...,Level of mechanization,Labour intensity,0.0,0.0,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Bid...,Coming soon
70,555,Brassica oleracea var. gemmifera,Brassica,oleracea,gemmifera,herb,erect,"annual, biennial",single stem,vegetables,...,Level of mechanization,Labour intensity,100.0,130.0,food & beverage,vitamins,unspecified part,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Bra...,Coming soon
106,693,Cichorium endivia,Cichorium,endivia,bicostata,herb,acaulescent (or rosette plants),"annual, biennial",single stem,vegetables,...,Level of mechanization,Labour intensity,70.0,85.0,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Cic...,Coming soon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2535,376642,Lathyrus pubescens,Lathyrus,pubescens,bicostata,herb,prostrate/procumbent/semi-erect,annual,multi stem,forage/pasture,...,Level of mechanization,Labour intensity,0.0,0.0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Lat...,Coming soon
2542,376649,Ornithopus micranthus,Ornithopus,micranthus,bicostata,herb,"erect, prostrate/procumbent/semi-erect",annual,"deciduous, multi stem",forage/pasture,...,Level of mechanization,Labour intensity,0.0,0.0,animal food (feed),vitamins,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Orn...,Coming soon
2553,376660,Sporobolus helvolus,Sporobolus,helvolus,bicostata,grass,erect,perennial,multi stem,forage/pasture,...,Level of mechanization,Labour intensity,0.0,0.0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Spo...,Coming soon
2554,376661,Sporobolus marginatus,Sporobolus,marginatus,bicostata,grass,erect,perennial,multi stem,forage/pasture,...,Level of mechanization,Labour intensity,0.0,0.0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Spo...,Coming soon


In [36]:
list(impute_numerics['USDA_Hardiness_Zone'].unique())

['5-11',
 '8-11',
 '10-12',
 '9-11',
 '10-11',
 '8-10',
 '5-9',
 '4-10',
 '7-10',
 '3-11',
 '11-12',
 '9-12',
 'Coming soon',
 '6-10',
 '2-9',
 '4-8',
 '6-9',
 '3-10',
 '7-11',
 '2-11',
 '6-11',
 '7-9',
 '3-7',
 '8-12',
 '5-10',
 '6-12',
 '5-7',
 '3-8',
 '4-11',
 '1-12',
 '3-12',
 '4-9',
 '7-12',
 '5-8',
 '3-6',
 '4-7',
 '6-8',
 '4-6',
 '3-9',
 '4-12',
 '9-10',
 '2-6',
 '2-7',
 '5-12',
 '8-9',
 '2-5',
 '2-8']

In [37]:
impute_numerics['USDA_Hardiness_Zone'] = impute_numerics['USDA_Hardiness_Zone'].replace('Coming soon', usda_mode)

In [38]:
list(impute_numerics['USDA_Hardiness_Zone'].unique())

['5-11',
 '8-11',
 '10-12',
 '9-11',
 '10-11',
 '8-10',
 '5-9',
 '4-10',
 '7-10',
 '3-11',
 '11-12',
 '9-12',
 '6-10',
 '2-9',
 '4-8',
 '6-9',
 '3-10',
 '7-11',
 '2-11',
 '6-11',
 '7-9',
 '3-7',
 '8-12',
 '5-10',
 '6-12',
 '5-7',
 '3-8',
 '4-11',
 '1-12',
 '3-12',
 '4-9',
 '7-12',
 '5-8',
 '3-6',
 '4-7',
 '6-8',
 '4-6',
 '3-9',
 '4-12',
 '9-10',
 '2-6',
 '2-7',
 '5-12',
 '8-9',
 '2-5',
 '2-8']

In [39]:
# Function to split USDA hardiness zone into min and max
def split_hardiness_zone(zone):
    if pd.isna(zone):
        return pd.Series([None, None])
    if '-' in zone:
        min_zone, max_zone = zone.split('-')
        return pd.Series([min_zone, max_zone])

    else:
        return pd.Series([zone, zone])
    
# Apply the function to split the USDA hardiness zone
impute_numerics[['USDA_Hardiness_Zone_Min', 'USDA_Hardiness_Zone_Max']] = impute_numerics['USDA_Hardiness_Zone'].apply(split_hardiness_zone)

In [40]:
# Convert the new columns to integers
impute_numerics['USDA_Hardiness_Zone_Min'] = pd.to_numeric(impute_numerics['USDA_Hardiness_Zone_Min'], errors='coerce').astype('Int64')
impute_numerics['USDA_Hardiness_Zone_Max'] = pd.to_numeric(impute_numerics['USDA_Hardiness_Zone_Max'], errors='coerce').astype('Int64')

In [41]:
impute_numerics

Unnamed: 0,Crop_Code,Scientific_Name,Genus,Species,Variety,Life_Form,Habit,Life_Span,Physiology,Category,...,Crop_Cycle_Min,Crop_Cycle_Max,Use_Main,Use_Detailed,Use_Part,Datasheet_URL,PFAF_URL,USDA_Hardiness_Zone,USDA_Hardiness_Zone_Min,USDA_Hardiness_Zone_Max
0,289,Abelmoschus esculentus,Abelmoschus,esculentus,bicostata,herb,erect,annual,single stem,vegetables,...,50.0,180.0,food & beverage,vitamins,fruits,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,5-11,5,11
1,290,Abelmoschus manihot,Abelmoschus,manihot,bicostata,shrub,erect,"annual, perennial","deciduous, multi stem","vegetables, ornamentals/turf, medicinals & aro...",...,365.0,365.0,food & beverage,vitamins,leaves,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11,8,11
2,291,Abelmoschus moschatus,Abelmoschus,moschatus,bicostata,"herb, sub-shrub",prostrate/procumbent/semi-erect,"annual, biennial, perennial","deciduous, multi stem","ornamentals/turf, medicinals & aromatic",...,0.0,0.0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Abe...,8-11,8,11
3,295,Acacia auriculiformis,Acacia,auriculiformis,bicostata,tree,erect,perennial,single stem,forest/wood,...,180.0,240.0,material,dye/tannin,stems,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12,10,12
4,297,Acacia farnesiana,Acacia,farnesiana,bicostata,tree,erect,perennial,single stem,"materials, ornamentals/turf, medicinals & arom...",...,60.0,240.0,environmental,ornamental/turf,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,9-11,9,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2563,400000,Chamaecrista rotundifolia,Chamaecrista,rotundifolia,bicostata,herb,prostrate/procumbent/semi-erect,"annual, perennial","deciduous, multi stem","forage/pasture, medicinals & aromatic",...,0.0,0.0,animal food (feed),vitamins,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Cha...,9-11,9,11
2564,400001,Acacia polyacantha,Acacia,polyacantha,bicostata,"shrub, tree",erect,perennial,"single stem, multi stem","forest/wood, environmental",...,0.0,0.0,fuels,fuelwood,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Aca...,10-12,10,12
2565,400002,Prosopis affinis,Prosopis,affinis,bicostata,tree,erect,perennial,"deciduous, single stem, C3 photosynthesis",forest/wood,...,0.0,0.0,food & beverage,vitamins,bark,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Pro...,10-12,10,12
2566,400003,Vicia dasycarpa,Vicia,dasycarpa,bicostata,herb,climber/scrambler/scadent,annual,multi stem,forage/pasture,...,0.0,0.0,animal food (feed),minerals,entire plant,http://ecocrop.fao.org/ecocrop/srv/en/dataShee...,https://pfaf.org/user/Plant.aspx?LatinName=Vic...,10-12,10,12


In [42]:
impute_numerics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568 entries, 0 to 2567
Data columns (total 70 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Crop_Code                2568 non-null   int64  
 1   Scientific_Name          2568 non-null   object 
 2   Genus                    2568 non-null   object 
 3   Species                  2568 non-null   object 
 4   Variety                  2568 non-null   object 
 5   Life_Form                2568 non-null   object 
 6   Habit                    2568 non-null   object 
 7   Life_Span                2568 non-null   object 
 8   Physiology               2568 non-null   object 
 9   Category                 2568 non-null   object 
 10  Plant_Attributes         2568 non-null   object 
 11  Temp_Opt_Min             2568 non-null   float64
 12  Temp_Opt_Max             2568 non-null   float64
 13  Temp_Abs_Min             2568 non-null   float64
 14  Temp_Abs_Max            

## corrections and conversions

In [43]:
# don't want to impute the variety
impute_numerics[['Genus', 'Species', 'Variety']] = ecocrop_openclim[['Genus', 'Species', 'Variety']]

In [44]:
# Function to convert Celsius to Fahrenheit
def celsius_to_fahrenheit(celsius):
    return (celsius * 9/5) + 32

In [45]:
# Apply the conversion function to the appropriate columns
temperature_columns = ['Temp_Opt_Min', 'Temp_Opt_Max', 'Temp_Abs_Min', 'Temp_Abs_Max', 'Killing_Temp_Rest', 'Killing_Temp_Growth']
df = impute_numerics.copy()
for col in temperature_columns:
    df.loc[:, f'{col}_F'] = df[col].apply(celsius_to_fahrenheit)
impute_numerics = df.copy()

In [46]:
list(impute_numerics.columns)

['Crop_Code',
 'Scientific_Name',
 'Genus',
 'Species',
 'Variety',
 'Life_Form',
 'Habit',
 'Life_Span',
 'Physiology',
 'Category',
 'Plant_Attributes',
 'Temp_Opt_Min',
 'Temp_Opt_Max',
 'Temp_Abs_Min',
 'Temp_Abs_Max',
 'Rain_Opt_Min',
 'Rain_Opt_Max',
 'Rain_Abs_Min',
 'Rain_Abs_Max',
 'Lat_Opt_Min',
 'Lat_Opt_Max',
 'Lat_Abs_Min',
 'Lat_Abs_Max',
 'Alt_Opt_Min',
 'Alt_Opt_Max',
 'Alt_Abs_Min',
 'Alt_Abs_Max',
 'pH_Opt_Min',
 'pH_Opt_Max',
 'pH_Abs_Min',
 'pH_Abs_Max',
 'Light_Opt_Min',
 'Light_Opt_Max',
 'Light_Abs_Min',
 'Light_Abs_Max',
 'Depth_Opt',
 'Depth_Abs',
 'Texture_Ops',
 'Texture_Abs',
 'Fertility_Ops',
 'Fertility_Abs',
 'Al_Toxicity_Opt',
 'Al_Toxicity_Abs',
 'Salinity_Ops',
 'Salinity_Abs',
 'Drainage_Opt',
 'Drainage_Abs',
 'Climate_Zone_Trewartha',
 'Photoperiod',
 'Killing_Temp_Rest',
 'Killing_Temp_Growth',
 'Abiotic_Tolererance',
 'Abiotic_Susceptibility',
 'Introduction_Risks',
 'Production_System',
 'Cropping_System',
 'Subsystem',
 'Companion_Species',
 'Le

In [47]:
reordered_columns = [
'Crop_Code',
'Scientific_Name',
'Genus',
'Species',
'Variety',
'Life_Form',
'Habit',
'Life_Span',
'Physiology',
'Category',
'Plant_Attributes',

'Temp_Opt_Min',
'Temp_Opt_Max',
'Temp_Opt_Min_F',
'Temp_Opt_Max_F',
    
'Temp_Abs_Min',
'Temp_Abs_Max',
'Temp_Abs_Min_F',
'Temp_Abs_Max_F',

'Killing_Temp_Rest',
'Killing_Temp_Growth',
'Killing_Temp_Rest_F',
'Killing_Temp_Growth_F',

'Rain_Opt_Min',
'Rain_Opt_Max',
'Rain_Abs_Min',
'Rain_Abs_Max',
'Lat_Opt_Min',
'Lat_Opt_Max',
'Lat_Abs_Min',
'Lat_Abs_Max',
'Alt_Opt_Min',
'Alt_Opt_Max',
'Alt_Abs_Min',
'Alt_Abs_Max',
'pH_Opt_Min',
'pH_Opt_Max',
'pH_Abs_Min',
'pH_Abs_Max',
'Light_Opt_Min',
'Light_Opt_Max',
'Light_Abs_Min',
'Light_Abs_Max',
'Depth_Opt',
'Depth_Abs',
'Texture_Ops',
'Texture_Abs',
'Fertility_Ops',
'Fertility_Abs',
'Al_Toxicity_Opt',
'Al_Toxicity_Abs',
'Salinity_Ops',
'Salinity_Abs',
'Drainage_Opt',
'Drainage_Abs',
'Photoperiod',
'Abiotic_Tolererance',
'Abiotic_Susceptibility',
'Introduction_Risks',
'Production_System',
'Cropping_System',
'Subsystem',
'Companion_Species',
'Level_of_Mechanization',
'Labour_Intensity',
'Crop_Cycle_Min',
'Crop_Cycle_Max',
'Use_Main',
'Use_Detailed',
'Use_Part',
'Climate_Zone_Trewartha',
'USDA_Hardiness_Zone',
'USDA_Hardiness_Zone_Min',
'USDA_Hardiness_Zone_Max',
'Datasheet_URL',
'PFAF_URL',
]

In [48]:
impute_numerics = impute_numerics[reordered_columns]

## save clean dataframe

In [49]:
ecocrop_clean_df_file = '../data/crops/EcoCrop_Complete.pickle'
impute_numerics.to_pickle(ecocrop_clean_df_file)


In [50]:
ecocrop_final_data_file = '../data/crops/EcoCrop_Complete.csv'
impute_numerics.to_csv(ecocrop_final_data_file)