## Digital Green Crop Yield Estimate Challenge
* Smallholder farmers are crucial contributors to global food production, and in India often suffer most from poverty and malnutrition. These farmers face challenges such as limited access to modern agriculture, unpredictable weather, and resource constraints. To tackle this issue, Digital Green collected data via surveys, offering insights into farming practices, environmental conditions, and crop yields.

* The objective of this challenge is to create a machine learning solution to predict the crop yield per acre of rice or wheat crops in India. Our goal is to empower these farmers and break the cycle of poverty and malnutrition.

In [16]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

le = LabelEncoder()


import lightgbm as lgb
import xgboost as xgb


pd.options.display.max_columns = 500
pd.options.display.max_rows= 500

### Data LOading

In [17]:
from google.colab import drive
drive.mount("/content/gdrive/")

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [18]:
!unzip /content/gdrive/MyDrive/digital_green/digital-green-crop-yield-estimate-challenge20230912-4562-1bbmstk.zip

Archive:  /content/gdrive/MyDrive/digital_green/digital-green-crop-yield-estimate-challenge20230912-4562-1bbmstk.zip
replace manifest-791977bdaaa868446d787f0618161fe720230912-4562-s64syz.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace VariableDescription.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [19]:
path = "/content/"
train = pd.read_csv(path + "Train.csv")
test = pd.read_csv(path + "Test.csv")

### Setup Config

In [20]:
class cfg:
  id_col = "ID"
  target_col = "Yield"
  date_cols = [col for col in train.columns if any(substring in col for substring in ['date', 'Date']) or col in ['SeedingSowingTransplanting']]
  outlier_ids = ["ID_PMSOXFT4FYDW", ]#"ID_NBZ50NSHWKX4", "ID_MUP1OS8GZR5E"
  outlier_min = ["ID_UFPM6MLG1BJK"]

In [21]:
completely_different= [107, 2396]
train.drop(completely_different, inplace=True)

### Data preparation: Preprocessing + Feature Engineering

In [22]:
def cluster_col(col: str, data: pd.DataFrame):
  combinations = data[col]
  vectorizer = TfidfVectorizer(stop_words='english')
  X = vectorizer.fit_transform(combinations)
  kmeans = KMeans(n_clusters=5, random_state=42)
  data[f"{col}_cluster"] = kmeans.fit_predict(X)

  return data

def create_binary_columns(df, column_name, prefix):
    df[column_name] = df[column_name].replace(np.nan, '', regex=True)

    unique_methods = set(method for methods in df[column_name].str.split() for method in methods)

    for method in unique_methods:
        df[f"{prefix}{method}"] = df[column_name].str.contains(method).astype(int)

    df = df.drop(column_name, axis=1)

    return df

def get_date_features(date_cols: list, data: pd.DataFrame):
  for col in date_cols:
    data[col] = pd.to_datetime(data[col])
    data[f'{col}_year'] = data[col].dt.year
    data[f'{col}_month'] = data[col].dt.month
    data[f'{col}_day'] = data[col].dt.day

  # data['Time_to_harvest'] = (data['Harv_date'] - data['CropTillageDate']).dt.days
  # data['Nursery_to_Field_Time'] = (data['CropTillageDate'] - data['RcNursEstDate']).dt.days
  # data['Harvest_to_Threshing_Time'] = (data['Threshing_date']- data['Harv_date']).dt.days
  # data['Total_Growing_Season_Length'] = (data['Threshing_date'] - data['RcNursEstDate']).dt.days

  return data

def groupby_features(data: pd.DataFrame, stats_feats: list):
  for col in stats_feats:
    data[f"{col}__Acre_mean"] = data.groupby(col)['Acre'].transform('mean')
    data[f"{col}__Acre_std"] = data.groupby(col)['Acre'].transform('std')
    data[f"{col}__Acre_min"] = data.groupby(col)['Acre'].transform('min')
    data[f"{col}__Acre_max"] = data.groupby(col)['Acre'].transform('max')
    data[f"{col}__Acre_skew"] = data.groupby(col)['Acre'].transform('skew')
  return data


def process(train: pd.DataFrame, test: pd.DataFrame, id_col: str, target_col: str):
  drop_cols = []
  le_cols = []

  data = pd.concat([train, test])

  #deal with outliers
  outliers = [(4000.0, 0.2272727272727273), (6000.0, 0.2272727272727273),(4800.0, 0.2272727272727273), (16800.0, 1.0), (8000.0, 0.4545454545454545), (4780.0, 0.3636363636363636),
  (5300.0, 0.25), (14400.0, 0.8181818181818182), (7510.0, 0.4375), (4800.0, 0.2727272727272727), (6000, 0.2727272727272727),
  (3400.0, 0.312500), (3200.0, 0.4090909090909091), (2500.0 ,	0.15625), (1500.0, 0.0909090909090909),   (3200.0, 0.1818181818181818),

  (3200.0, 0.4090909090909091),    (5400.0, 0.4090909090909091)

  ]
  outliers_min = [(160,0.6818181818181818), (200, 1.136363636363636), (18.0 , 0.625), (150.0, 0.625), (128.0,0.625), (10.0 ,	0.3125), (10.0,	0.3125), (56.0, 0.3125), (58.0, 0.3125),(10.0, 0.2272727272727273), (40.0, 0.2272727272727273)]



  for yieldd, acre in outliers:
    mask = (data['Acre'] == acre) & (data['Yield'] == yieldd)
    data.loc[mask, 'Yield'] = data.loc[mask, 'Yield'] / 10.0

  for yieldd, acre in outliers_min:
    mask = (data['Acre'] == acre) & (data['Yield'] == yieldd)
    data.loc[mask, 'Yield'] = data.loc[mask, 'Yield'] * 10.0

  data['Yield_per_acre'] = data['Yield']/data['Acre']
  data['District_Block'] = data['District'] + '_' + data['Block']
  data['District'] = np.where(data['District_Block'] == 'Jamui_Gurua', 'Gaya', data['District'])
  object_cols = [col for col in data.select_dtypes(include = 'O').columns if col not in cfg.date_cols + [id_col]]
  special_cols = ['LandPreparationMethod', 'NursDetFactor','TransDetFactor', 'OrgFertilizers', 'CropbasalFerts' ]
  stats_feats = ["SeedlingsPerPit", "CropTillageDate_month", "1appDaysUrea",  "TransplantingIrrigationHours","Threshing_date_month","CropTillageDepth"]

  for col in special_cols:
    data = create_binary_columns(data, col, f"{col}")

  object_cols = [col for col in object_cols if col not in special_cols]




  data = get_date_features(cfg.date_cols, data)
  # data = groupby_features(data, stats_feats)



  drop_cols.extend(['2tdUrea', '2appDaysUrea', 'District_Block'])
  data = pd.get_dummies(data, columns = object_cols)


  drop_cols.extend(object_cols)
  drop_cols.extend(cfg.date_cols)



  constant_cols = [col for col in data.columns if data[col].nunique() == 1]
  drop_cols.extend(constant_cols)

  features = [col for col in data.columns if col not in drop_cols + [cfg.id_col, cfg.target_col, 'Yield_per_acre']]

  train = data[data[id_col].isin(train[id_col].unique())].reset_index(drop=True)
  test = data[data[id_col].isin(test[id_col].unique())].reset_index(drop=True)

  return train, test, features

df_train, df_test, features = process(train, test, cfg.id_col, cfg.target_col)
display(df_train.head(), df_test.head(), features)

Unnamed: 0,ID,CultLand,CropCultLand,CropTillageDate,CropTillageDepth,RcNursEstDate,SeedingSowingTransplanting,SeedlingsPerPit,TransplantingIrrigationHours,TransIrriCost,StandingWater,Ganaura,CropOrgFYM,NoFertilizerAppln,BasalDAP,BasalUrea,1tdUrea,1appDaysUrea,2tdUrea,2appDaysUrea,Harv_date,Harv_hand_rent,Threshing_date,Residue_length,Residue_perc,Acre,Yield,Yield_per_acre,LandPreparationMethodFourWheelTracRotavator,LandPreparationMethodWetTillagePuddling,LandPreparationMethodTractorPlough,LandPreparationMethodOther,LandPreparationMethodBullockPlough,NursDetFactorPreMonsoonShowers,NursDetFactorCalendarDate,NursDetFactorSeedAvailability,NursDetFactorIrrigWaterAvailability,NursDetFactorLabourAvailability,TransDetFactorSeedlingAge,TransDetFactorRainArrival,TransDetFactorCalendarDate,TransDetFactorLaborAvailability,TransDetFactorIrrigWaterAvailability,OrgFertilizersVermiCompost,OrgFertilizersJeevamrit,OrgFertilizersGanaura,OrgFertilizersPranamrit,OrgFertilizersGhanajeevamrit,OrgFertilizersFYM,OrgFertilizersPoultryManure,CropbasalFertsNPKS,CropbasalFertsSSP,CropbasalFertsNPK,CropbasalFertsMoP,CropbasalFertsOther,CropbasalFertsUrea,CropbasalFertsDAP,CropTillageDate_year,CropTillageDate_month,CropTillageDate_day,RcNursEstDate_year,RcNursEstDate_month,RcNursEstDate_day,SeedingSowingTransplanting_year,SeedingSowingTransplanting_month,SeedingSowingTransplanting_day,Harv_date_year,Harv_date_month,Harv_date_day,Threshing_date_year,Threshing_date_month,Threshing_date_day,District_Gaya,District_Jamui,District_Nalanda,District_Vaishali,Block_Chehrakala,Block_Garoul,Block_Gurua,Block_Jamui,Block_Khaira,Block_Lohra,Block_Mahua,Block_Noorsarai,Block_Rajgir,Block_Wazirganj,CropEstMethod_Broadcasting,CropEstMethod_LineSowingAfterTillage,CropEstMethod_Manual_PuddledLine,CropEstMethod_Manual_PuddledRandom,TransplantingIrrigationSource_Boring,TransplantingIrrigationSource_Canal,TransplantingIrrigationSource_Pond,TransplantingIrrigationSource_Rainfed,TransplantingIrrigationSource_TubeWell,TransplantingIrrigationSource_Well,TransplantingIrrigationPowerSource_Diesel,TransplantingIrrigationPowerSource_Electric,TransplantingIrrigationPowerSource_Solar,PCropSolidOrgFertAppMethod_Broadcasting,PCropSolidOrgFertAppMethod_RootApplication,PCropSolidOrgFertAppMethod_SoilApplied,PCropSolidOrgFertAppMethod_Spray,MineralFertAppMethod_Broadcasting,MineralFertAppMethod_RootApplication,MineralFertAppMethod_SoilApplied,MineralFertAppMethod_Spray,FirstTopDressFert_DAP,FirstTopDressFert_DAP NPK,FirstTopDressFert_DAP NPK Urea,FirstTopDressFert_DAP NPKS,FirstTopDressFert_DAP Urea,FirstTopDressFert_NPK,FirstTopDressFert_NPK Urea,FirstTopDressFert_NPKS Urea,FirstTopDressFert_Other Urea,FirstTopDressFert_SSP,FirstTopDressFert_SSP Urea NPKS,FirstTopDressFert_Urea,FirstTopDressFert_Urea DAP,FirstTopDressFert_Urea NPK,FirstTopDressFert_Urea NPKS,FirstTopDressFert_Urea NPKS SSP,FirstTopDressFert_Urea Other,FirstTopDressFert_Urea SSP,MineralFertAppMethod.1_Broadcasting,MineralFertAppMethod.1_RootApplication,MineralFertAppMethod.1_SoilApplied,Harv_method_hand,Harv_method_machine,Threshing_method_hand,Threshing_method_machine,Stubble_use_burned,Stubble_use_plowed_in_soil,District_Block_Gaya_Gurua,District_Block_Gaya_Wazirganj,District_Block_Jamui_Gurua,District_Block_Jamui_Jamui,District_Block_Jamui_Khaira,District_Block_Jamui_Lohra,District_Block_Nalanda_Noorsarai,District_Block_Nalanda_Rajgir,District_Block_Vaishali_Chehrakala,District_Block_Vaishali_Garoul,District_Block_Vaishali_Mahua
0,ID_GTFAC7PEVWQ9,45,40,2022-07-20,5,2022-06-27,2022-07-21,2.0,5.0,200.0,2.0,,,2,,20.0,15.0,18.0,,,2022-11-16,,2022-11-16,30,40,0.3125,600.0,1920.0,1,0,1,0,0,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2022,7,20,2022.0,6.0,27.0,2022,7,21,2022,11,16,2022,11,16,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
1,ID_TK40ARLSPOKS,26,26,2022-07-18,5,2022-06-20,2022-07-20,2.0,5.0,125.0,3.0,,,2,15.0,10.0,20.0,39.0,,,2022-11-25,3.0,2022-12-24,24,10,0.3125,600.0,1920.0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2022,7,18,2022.0,6.0,20.0,2022,7,20,2022,11,25,2022,12,24,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
2,ID_1FJY2CRIMLZZ,10,10,2022-06-30,6,2022-06-20,2022-08-13,2.0,4.0,80.0,2.0,1.0,1.0,2,4.0,,5.0,65.0,,,2022-12-12,480.0,2023-01-11,30,10,0.148148,225.0,1518.75,1,0,1,0,0,1,0,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,2022,6,30,2022.0,6.0,20.0,2022,8,13,2022,12,12,2023,1,11,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0
3,ID_I3IPXS4DB7NE,15,15,2022-06-16,6,2022-06-17,2022-07-17,2.0,,,,1.0,,2,6.0,3.0,5.0,5.0,,,2022-12-02,240.0,2022-12-29,26,10,0.222222,468.0,2106.0,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,2022,6,16,2022.0,6.0,17.0,2022,7,17,2022,12,2,2022,12,29,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,ID_4T8YQWXWHB4A,60,60,2022-07-19,4,2022-06-21,2022-07-20,2.0,9.0,300.0,2.0,,,2,15.0,30.0,30.0,26.0,,,2022-11-30,,2022-12-02,24,40,0.46875,550.0,1173.333333,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2022,7,19,2022.0,6.0,21.0,2022,7,20,2022,11,30,2022,12,2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0


Unnamed: 0,ID,CultLand,CropCultLand,CropTillageDate,CropTillageDepth,RcNursEstDate,SeedingSowingTransplanting,SeedlingsPerPit,TransplantingIrrigationHours,TransIrriCost,StandingWater,Ganaura,CropOrgFYM,NoFertilizerAppln,BasalDAP,BasalUrea,1tdUrea,1appDaysUrea,2tdUrea,2appDaysUrea,Harv_date,Harv_hand_rent,Threshing_date,Residue_length,Residue_perc,Acre,Yield,Yield_per_acre,LandPreparationMethodFourWheelTracRotavator,LandPreparationMethodWetTillagePuddling,LandPreparationMethodTractorPlough,LandPreparationMethodOther,LandPreparationMethodBullockPlough,NursDetFactorPreMonsoonShowers,NursDetFactorCalendarDate,NursDetFactorSeedAvailability,NursDetFactorIrrigWaterAvailability,NursDetFactorLabourAvailability,TransDetFactorSeedlingAge,TransDetFactorRainArrival,TransDetFactorCalendarDate,TransDetFactorLaborAvailability,TransDetFactorIrrigWaterAvailability,OrgFertilizersVermiCompost,OrgFertilizersJeevamrit,OrgFertilizersGanaura,OrgFertilizersPranamrit,OrgFertilizersGhanajeevamrit,OrgFertilizersFYM,OrgFertilizersPoultryManure,CropbasalFertsNPKS,CropbasalFertsSSP,CropbasalFertsNPK,CropbasalFertsMoP,CropbasalFertsOther,CropbasalFertsUrea,CropbasalFertsDAP,CropTillageDate_year,CropTillageDate_month,CropTillageDate_day,RcNursEstDate_year,RcNursEstDate_month,RcNursEstDate_day,SeedingSowingTransplanting_year,SeedingSowingTransplanting_month,SeedingSowingTransplanting_day,Harv_date_year,Harv_date_month,Harv_date_day,Threshing_date_year,Threshing_date_month,Threshing_date_day,District_Gaya,District_Jamui,District_Nalanda,District_Vaishali,Block_Chehrakala,Block_Garoul,Block_Gurua,Block_Jamui,Block_Khaira,Block_Lohra,Block_Mahua,Block_Noorsarai,Block_Rajgir,Block_Wazirganj,CropEstMethod_Broadcasting,CropEstMethod_LineSowingAfterTillage,CropEstMethod_Manual_PuddledLine,CropEstMethod_Manual_PuddledRandom,TransplantingIrrigationSource_Boring,TransplantingIrrigationSource_Canal,TransplantingIrrigationSource_Pond,TransplantingIrrigationSource_Rainfed,TransplantingIrrigationSource_TubeWell,TransplantingIrrigationSource_Well,TransplantingIrrigationPowerSource_Diesel,TransplantingIrrigationPowerSource_Electric,TransplantingIrrigationPowerSource_Solar,PCropSolidOrgFertAppMethod_Broadcasting,PCropSolidOrgFertAppMethod_RootApplication,PCropSolidOrgFertAppMethod_SoilApplied,PCropSolidOrgFertAppMethod_Spray,MineralFertAppMethod_Broadcasting,MineralFertAppMethod_RootApplication,MineralFertAppMethod_SoilApplied,MineralFertAppMethod_Spray,FirstTopDressFert_DAP,FirstTopDressFert_DAP NPK,FirstTopDressFert_DAP NPK Urea,FirstTopDressFert_DAP NPKS,FirstTopDressFert_DAP Urea,FirstTopDressFert_NPK,FirstTopDressFert_NPK Urea,FirstTopDressFert_NPKS Urea,FirstTopDressFert_Other Urea,FirstTopDressFert_SSP,FirstTopDressFert_SSP Urea NPKS,FirstTopDressFert_Urea,FirstTopDressFert_Urea DAP,FirstTopDressFert_Urea NPK,FirstTopDressFert_Urea NPKS,FirstTopDressFert_Urea NPKS SSP,FirstTopDressFert_Urea Other,FirstTopDressFert_Urea SSP,MineralFertAppMethod.1_Broadcasting,MineralFertAppMethod.1_RootApplication,MineralFertAppMethod.1_SoilApplied,Harv_method_hand,Harv_method_machine,Threshing_method_hand,Threshing_method_machine,Stubble_use_burned,Stubble_use_plowed_in_soil,District_Block_Gaya_Gurua,District_Block_Gaya_Wazirganj,District_Block_Jamui_Gurua,District_Block_Jamui_Jamui,District_Block_Jamui_Khaira,District_Block_Jamui_Lohra,District_Block_Nalanda_Noorsarai,District_Block_Nalanda_Rajgir,District_Block_Vaishali_Chehrakala,District_Block_Vaishali_Garoul,District_Block_Vaishali_Mahua
0,ID_F9XXEXN2ADR2,20,13,2022-07-24,4,2022-07-03,2022-07-29,1.0,2.0,200.0,2.0,,2.0,3,6.0,,8.0,23.0,6.0,63.0,2022-12-02,500.0,2022-12-28,22,10,0.272727,,,0,0,1,0,0,1,0,0,1,1,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2022,7,24,2022.0,7.0,3.0,2022,7,29,2022,12,2,2022,12,28,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0
1,ID_SO3VW2X4QO93,25,25,2022-07-24,5,2022-07-08,2022-08-02,4.0,2.0,,3.0,,,3,5.0,,7.0,23.0,8.0,65.0,2022-11-12,150.0,2022-12-20,28,10,0.227273,,,0,0,1,0,0,0,1,1,1,0,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2022,7,24,2022.0,7.0,8.0,2022,8,2,2022,11,12,2022,12,20,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,ID_UKUQ7JM8E894,30,30,2022-07-07,5,2022-06-24,2022-07-08,2.0,4.0,,10.0,2.0,,2,9.0,18.0,9.0,38.0,,,2022-11-24,1200.0,2022-12-05,28,10,0.28125,,,1,1,1,0,0,1,0,1,1,1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,2022,7,7,2022.0,6.0,24.0,2022,7,8,2022,11,24,2022,12,5,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
3,ID_QUISMWEZR2H4,15,10,2022-07-04,5,2022-06-20,2022-07-10,2.0,2.0,,,,5.0,2,15.0,,9.0,30.0,,,2022-10-15,300.0,2022-10-25,30,10,0.136364,,,1,1,1,0,1,1,1,0,1,1,1,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,2022,7,4,2022.0,6.0,20.0,2022,7,10,2022,10,15,2022,10,25,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,ID_25JGI455VKCZ,30,10,2022-07-21,3,2022-06-24,2022-07-22,1.0,6.0,400.0,,,,2,20.0,15.0,15.0,40.0,,,2022-11-20,,2022-11-20,26,40,0.3125,,,1,1,1,0,0,0,0,1,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,2022,7,21,2022.0,6.0,24.0,2022,7,22,2022,11,20,2022,11,20,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0


['CultLand',
 'CropCultLand',
 'CropTillageDepth',
 'SeedlingsPerPit',
 'TransplantingIrrigationHours',
 'TransIrriCost',
 'StandingWater',
 'Ganaura',
 'CropOrgFYM',
 'NoFertilizerAppln',
 'BasalDAP',
 'BasalUrea',
 '1tdUrea',
 '1appDaysUrea',
 'Harv_hand_rent',
 'Residue_length',
 'Residue_perc',
 'Acre',
 'LandPreparationMethodFourWheelTracRotavator',
 'LandPreparationMethodWetTillagePuddling',
 'LandPreparationMethodTractorPlough',
 'LandPreparationMethodOther',
 'LandPreparationMethodBullockPlough',
 'NursDetFactorPreMonsoonShowers',
 'NursDetFactorCalendarDate',
 'NursDetFactorSeedAvailability',
 'NursDetFactorIrrigWaterAvailability',
 'NursDetFactorLabourAvailability',
 'TransDetFactorSeedlingAge',
 'TransDetFactorRainArrival',
 'TransDetFactorCalendarDate',
 'TransDetFactorLaborAvailability',
 'TransDetFactorIrrigWaterAvailability',
 'OrgFertilizersVermiCompost',
 'OrgFertilizersJeevamrit',
 'OrgFertilizersGanaura',
 'OrgFertilizersPranamrit',
 'OrgFertilizersGhanajeevamrit',
 

In [23]:
df_train[features].head()

Unnamed: 0,CultLand,CropCultLand,CropTillageDepth,SeedlingsPerPit,TransplantingIrrigationHours,TransIrriCost,StandingWater,Ganaura,CropOrgFYM,NoFertilizerAppln,BasalDAP,BasalUrea,1tdUrea,1appDaysUrea,Harv_hand_rent,Residue_length,Residue_perc,Acre,LandPreparationMethodFourWheelTracRotavator,LandPreparationMethodWetTillagePuddling,LandPreparationMethodTractorPlough,LandPreparationMethodOther,LandPreparationMethodBullockPlough,NursDetFactorPreMonsoonShowers,NursDetFactorCalendarDate,NursDetFactorSeedAvailability,NursDetFactorIrrigWaterAvailability,NursDetFactorLabourAvailability,TransDetFactorSeedlingAge,TransDetFactorRainArrival,TransDetFactorCalendarDate,TransDetFactorLaborAvailability,TransDetFactorIrrigWaterAvailability,OrgFertilizersVermiCompost,OrgFertilizersJeevamrit,OrgFertilizersGanaura,OrgFertilizersPranamrit,OrgFertilizersGhanajeevamrit,OrgFertilizersFYM,OrgFertilizersPoultryManure,CropbasalFertsNPKS,CropbasalFertsSSP,CropbasalFertsNPK,CropbasalFertsMoP,CropbasalFertsOther,CropbasalFertsUrea,CropbasalFertsDAP,CropTillageDate_month,CropTillageDate_day,RcNursEstDate_month,RcNursEstDate_day,SeedingSowingTransplanting_month,SeedingSowingTransplanting_day,Harv_date_year,Harv_date_month,Harv_date_day,Threshing_date_year,Threshing_date_month,Threshing_date_day,District_Gaya,District_Jamui,District_Nalanda,District_Vaishali,Block_Chehrakala,Block_Garoul,Block_Gurua,Block_Jamui,Block_Khaira,Block_Lohra,Block_Mahua,Block_Noorsarai,Block_Rajgir,Block_Wazirganj,CropEstMethod_Broadcasting,CropEstMethod_LineSowingAfterTillage,CropEstMethod_Manual_PuddledLine,CropEstMethod_Manual_PuddledRandom,TransplantingIrrigationSource_Boring,TransplantingIrrigationSource_Canal,TransplantingIrrigationSource_Pond,TransplantingIrrigationSource_Rainfed,TransplantingIrrigationSource_TubeWell,TransplantingIrrigationSource_Well,TransplantingIrrigationPowerSource_Diesel,TransplantingIrrigationPowerSource_Electric,TransplantingIrrigationPowerSource_Solar,PCropSolidOrgFertAppMethod_Broadcasting,PCropSolidOrgFertAppMethod_RootApplication,PCropSolidOrgFertAppMethod_SoilApplied,PCropSolidOrgFertAppMethod_Spray,MineralFertAppMethod_Broadcasting,MineralFertAppMethod_RootApplication,MineralFertAppMethod_SoilApplied,MineralFertAppMethod_Spray,FirstTopDressFert_DAP,FirstTopDressFert_DAP NPK,FirstTopDressFert_DAP NPK Urea,FirstTopDressFert_DAP NPKS,FirstTopDressFert_DAP Urea,FirstTopDressFert_NPK,FirstTopDressFert_NPK Urea,FirstTopDressFert_NPKS Urea,FirstTopDressFert_Other Urea,FirstTopDressFert_SSP,FirstTopDressFert_SSP Urea NPKS,FirstTopDressFert_Urea,FirstTopDressFert_Urea DAP,FirstTopDressFert_Urea NPK,FirstTopDressFert_Urea NPKS,FirstTopDressFert_Urea NPKS SSP,FirstTopDressFert_Urea Other,FirstTopDressFert_Urea SSP,MineralFertAppMethod.1_Broadcasting,MineralFertAppMethod.1_RootApplication,MineralFertAppMethod.1_SoilApplied,Harv_method_hand,Harv_method_machine,Threshing_method_hand,Threshing_method_machine,Stubble_use_burned,Stubble_use_plowed_in_soil,District_Block_Gaya_Gurua,District_Block_Gaya_Wazirganj,District_Block_Jamui_Gurua,District_Block_Jamui_Jamui,District_Block_Jamui_Khaira,District_Block_Jamui_Lohra,District_Block_Nalanda_Noorsarai,District_Block_Nalanda_Rajgir,District_Block_Vaishali_Chehrakala,District_Block_Vaishali_Garoul,District_Block_Vaishali_Mahua
0,45,40,5,2.0,5.0,200.0,2.0,,,2,,20.0,15.0,18.0,,30,40,0.3125,1,0,1,0,0,0,1,1,1,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,7,20,6.0,27.0,7,21,2022,11,16,2022,11,16,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0
1,26,26,5,2.0,5.0,125.0,3.0,,,2,15.0,10.0,20.0,39.0,3.0,24,10,0.3125,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,7,18,6.0,20.0,7,20,2022,11,25,2022,12,24,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0
2,10,10,6,2.0,4.0,80.0,2.0,1.0,1.0,2,4.0,,5.0,65.0,480.0,30,10,0.148148,1,0,1,0,0,1,0,0,1,1,1,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,6,30,6.0,20.0,8,13,2022,12,12,2023,1,11,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0
3,15,15,6,2.0,,,,1.0,,2,6.0,3.0,5.0,5.0,240.0,26,10,0.222222,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,6,16,6.0,17.0,7,17,2022,12,2,2022,12,29,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,60,60,4,2.0,9.0,300.0,2.0,,,2,15.0,30.0,30.0,26.0,,24,40,0.46875,0,1,1,0,0,0,1,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,7,19,6.0,21.0,7,20,2022,11,30,2022,12,2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0


#### Setup CV

In [25]:
# skf = StratifiedKFold(n_splits=10)
skf = KFold(n_splits = 10)

df_train['fold'] = -1

for fold, (_, test_index) in enumerate(skf.split(df_train)):
    df_train.loc[test_index, 'fold'] = fold

In [26]:
df_train.groupby('fold')['Yield'].agg({'mean'})

Unnamed: 0_level_0,mean
fold,Unnamed: 1_level_1
0,555.81137
1,547.364341
2,588.894057
3,580.596899
4,620.837209
5,576.78553
6,561.299742
7,539.674419
8,571.88342
9,537.463731


### Modelling: Train Test Split Stage1

In [27]:
X = df_train.copy()
y= df_train['Yield']
model = xgb.XGBRegressor(
    learning_rate=0.5,
    n_estimators=1000,
    objective='reg:squarederror',
)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.15, random_state=123, stratify = X['fold'])
model.fit(X_train[features], y_train, eval_set = [(X_valid[features], y_valid)], early_stopping_rounds = 250, verbose = 250)


[0]	validation_0-rmse:262.38594
[250]	validation_0-rmse:122.35659
[277]	validation_0-rmse:122.37204


In [28]:
pd.options.display.max_rows = 2200
#define a function to create variable importance dataframe
def get_catt_varimp(model, train_columns, max_vars=350):

    # Scikit-learn API LGBMClassifier or LGBMRegressor was fitted,
    # so using feature_importances_ property
    feature_importances_perc = (model.feature_importances_ / sum(model.feature_importances_)) * 100
    cv_varimp_df = pd.DataFrame([train_columns, feature_importances_perc]).T

    cv_varimp_df.columns = ['feature_name', 'varimp (%)']

    cv_varimp_df.sort_values(by='varimp (%)', ascending=False, inplace=True)

    cv_varimp_df = cv_varimp_df.iloc[0:max_vars]

    return cv_varimp_df
#
get_catt_varimp(model, features)


Unnamed: 0,feature_name,varimp (%)
17,Acre,68.959938
98,FirstTopDressFert_DAP Urea,4.141641
61,District_Nalanda,2.382698
105,FirstTopDressFert_Urea,2.141635
92,MineralFertAppMethod_SoilApplied,1.780336
74,CropEstMethod_LineSowingAfterTillage,1.509116
80,TransplantingIrrigationSource_Rainfed,1.462747
88,PCropSolidOrgFertAppMethod_SoilApplied,1.234163
41,CropbasalFertsSSP,0.911107
62,District_Vaishali,0.84678


In [29]:
test['preds'] = model.predict(df_test[features])
sub_tts_stage_1 = test[[cfg.id_col, 'preds']]
sub_tts_stage_1.head()

Unnamed: 0,ID,preds
0,ID_F9XXEXN2ADR2,563.643921
1,ID_SO3VW2X4QO93,402.992645
2,ID_UKUQ7JM8E894,436.074188
3,ID_QUISMWEZR2H4,299.14502
4,ID_25JGI455VKCZ,570.89502


In [31]:
sub_tts_stage_1['preds'] = np.where(sub_tts_stage_1['ID'].isin(cfg.outlier_ids), sub_tts_stage_1['preds']*10, sub_tts_stage_1['preds'])
sub_tts_stage_1.to_csv("tts_stage1_no_outliers_(new).csv", index= False)
sub_tts_stage_1.describe()

Unnamed: 0,preds
count,1290.0
mean,594.705444
std,510.233307
min,-54.573891
25%,317.247231
50%,451.019531
75%,745.0009
max,9038.716797


In [32]:
sub_tts_stage_1[sub_tts_stage_1['ID'].isin(cfg.outlier_ids)]

Unnamed: 0,ID,preds
373,ID_PMSOXFT4FYDW,9038.716797


#### Selected_Features

In [33]:
feature_importance_df = pd.DataFrame(model.feature_importances_, columns=['importance'])
feature_importance_df['feature'] = features
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)
selected = list(feature_importance_df[feature_importance_df['importance']>0.0]['feature'].unique())
selected

['Acre',
 'FirstTopDressFert_DAP Urea',
 'District_Nalanda',
 'FirstTopDressFert_Urea',
 'MineralFertAppMethod_SoilApplied',
 'CropEstMethod_LineSowingAfterTillage',
 'TransplantingIrrigationSource_Rainfed',
 'PCropSolidOrgFertAppMethod_SoilApplied',
 'CropbasalFertsSSP',
 'District_Vaishali',
 'TransDetFactorCalendarDate',
 'LandPreparationMethodOther',
 'PCropSolidOrgFertAppMethod_Broadcasting',
 'BasalUrea',
 'Stubble_use_burned',
 'Block_Khaira',
 'MineralFertAppMethod_Broadcasting',
 'LandPreparationMethodWetTillagePuddling',
 'Harv_hand_rent',
 'TransplantingIrrigationSource_Boring',
 'Ganaura',
 'CropbasalFertsUrea',
 'TransplantingIrrigationPowerSource_Diesel',
 'LandPreparationMethodBullockPlough',
 'Block_Rajgir',
 'TransDetFactorSeedlingAge',
 'Threshing_date_month',
 'Harv_date_month',
 'NursDetFactorPreMonsoonShowers',
 'NursDetFactorCalendarDate',
 '1tdUrea',
 'SeedingSowingTransplanting_month',
 'CropOrgFYM',
 'Residue_length',
 'StandingWater',
 'PCropSolidOrgFertAppMet

In [34]:
len(selected),len(features)

(101, 132)

### Modelling: Cross Validation
#### Lightgbm

In [41]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Assuming you have 'features', 'df_train', and 'cfg.target_col' defined earlier
features = selected

# Best parameters obtained from Optuna
best_params = {'verbosity': -1, 'learning_rate': 0.053668513020289785, 'lambda_l1': 4.726963760132878e-06, 'lambda_l2': 4.696788389459892e-08, 'subsample': 0.7627217493239157, 'colsample_bytree': 0.7251409575910653, 'max_depth': 12, 'num_leaves': 23, 'min_child_samples': 21, 'feature_fraction': 0.6460683795597331}
fold_pred = []
oof_pred = []

# Adding a new column for predictions in df_train
df_train['predictions'] = np.nan

for fold in range(10):
    X_train = df_train[df_train.fold != fold].reset_index(drop=True)
    X_val = df_train[df_train.fold == fold].reset_index(drop=True)

    y_train = X_train.Yield
    y_val = X_val.Yield

    X_train = X_train[features]
    X_val = X_val[features]

    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)

    # Train a LightGBM model for the current fold using the best parameters
    model = lgb.train(
        best_params,
        lgb_train,
        num_boost_round=1000,
        valid_sets=[lgb_val],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=50),
            lgb.callback.log_evaluation(period=100),
        ],
    )


    preds = model.predict(X_val)
    oof_pred.append(mean_squared_error(y_val, preds, squared=False))
    print(f"Fold {fold} rmse: {mean_squared_error(y_val, preds, squared=False)}")
    print("-" * 50)
    print()

    # Store predictions in the 'predictions' column of df_train
    df_train.loc[df_train.fold == fold, 'predictions'] = model.predict(df_train[df_train.fold == fold][features])

    p2 = model.predict(df_test[features])
    fold_pred.append(p2)

print("Mean OOF Error:", np.mean(oof_pred))
print("Overall RMSE:", mean_squared_error(df_train['Yield'], df_train['predictions'], squared=False))


Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 10287.2
[200]	valid_0's l2: 9669.69
Early stopping, best iteration is:
[190]	valid_0's l2: 9595.13
Fold 0 rmse: 97.95475161842539
--------------------------------------------------

Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 18884.7
[200]	valid_0's l2: 17563.1
Early stopping, best iteration is:
[207]	valid_0's l2: 17472.7
Fold 1 rmse: 132.18420219198634
--------------------------------------------------

Training until validation scores don't improve for 50 rounds
[100]	valid_0's l2: 9732.04
[200]	valid_0's l2: 8741.96
[300]	valid_0's l2: 8527.3
[400]	valid_0's l2: 8362.86
[500]	valid_0's l2: 8199.53
[600]	valid_0's l2: 8121.65
[700]	valid_0's l2: 8106.02
Early stopping, best iteration is:
[656]	valid_0's l2: 8094.04
Fold 2 rmse: 89.96686715354379
--------------------------------------------------

Training until validation scores don't improve for 50 rounds
[100]	vali

In [45]:
preds = np.mean(fold_pred,axis=0)
test['preds'] = preds
sub_lgb = pd.DataFrame({
    cfg.id_col: test[cfg.id_col],
    cfg.target_col: preds
})

sub_lgb['Yield'] = np.where(sub_lgb['ID'].isin(cfg.outlier_ids), 8000, sub_lgb['Yield'])
sub_lgb.to_csv("lgb_106_107.csv", index= False)

In [46]:
sub_lgb[sub_lgb['ID']=="ID_PMSOXFT4FYDW"]

Unnamed: 0,ID,Yield
373,ID_PMSOXFT4FYDW,8000.0


#### XGBoost

In [47]:


# Assuming you have 'features', 'df_train', and 'cfg.target_col' defined earlier

params_xgb = {'learning_rate': 0.03578431849582851, 'reg_lambda': 31.700838394548214, 'reg_alpha': 3.456217283013418e-05, 'subsample': 0.48779814668483423, 'colsample_bytree': 0.7534598696968853, 'max_depth': 6}
fold_pred = []
oof_pred = []
features = selected
# Adding a new column for predictions in df_train
df_train['predictions'] = np.nan


for fold in range(10):

    print("-" * 50)
    X_train =  df_train[df_train.fold != fold].reset_index(drop=True)
    X_val = df_train[df_train.fold == fold].reset_index(drop=True)


    y_train = X_train.Yield
    y_val = X_val.Yield

    X_train = X_train[features]
    X_val = X_val[features]




    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    evals = [(dval, 'eval')]
    evals_result = {}

    model = xgb.train(params_xgb, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50, evals_result=evals_result, verbose_eval=False)

    preds = model.predict(dval)
    oof_pred.append(mean_squared_error(y_val, preds, squared=False))
    print(f"Fold {fold} rmse: {mean_squared_error(y_val, preds, squared=False)}")
    print("-" * 50)
    print()

    df_train.loc[df_train.fold == fold, 'predictions'] = model.predict(xgb.DMatrix(df_train[df_train.fold == fold][features]))

    p2 = model.predict(xgb.DMatrix(df_test[features]))
    fold_pred.append(p2)

print("Mean OOF Error:", np.mean(oof_pred))

print("Overall RMSE:", mean_squared_error(df_train['Yield'], df_train['predictions'], squared = False))

--------------------------------------------------
Fold 0 rmse: 95.8187439151289
--------------------------------------------------

--------------------------------------------------
Fold 1 rmse: 135.148737044329
--------------------------------------------------

--------------------------------------------------
Fold 2 rmse: 98.51917567120309
--------------------------------------------------

--------------------------------------------------
Fold 3 rmse: 142.00892493383313
--------------------------------------------------

--------------------------------------------------
Fold 4 rmse: 94.23616094781376
--------------------------------------------------

--------------------------------------------------
Fold 5 rmse: 85.12127726414604
--------------------------------------------------

--------------------------------------------------
Fold 6 rmse: 115.47731430379976
--------------------------------------------------

--------------------------------------------------
Fold 7 rmse

In [49]:
preds = np.mean(fold_pred,axis=0)
sub_xgb = pd.DataFrame({
    cfg.id_col: test[cfg.id_col],
    cfg.target_col: preds
})


sub_xgb['Yield'] = np.where(sub_xgb['ID'].isin(cfg.outlier_ids), sub_xgb['Yield']*10, sub_xgb['Yield'])
sub_xgb.to_csv("xgb_107_109.csv", index= False)

### Ensemble:
Lightgb + xgboost

In [50]:
ensemble = sub_lgb.copy()
ensemble['Yield'] = sub_lgb['Yield']*0.5 + sub_xgb['Yield']* 0.5
ensemble.to_csv("lgb_xgb(post).csv", index=False)

In [51]:
ensemble[ensemble['ID'].isin(cfg.outlier_ids)]

Unnamed: 0,ID,Yield
373,ID_PMSOXFT4FYDW,7930.819824


In [None]:
["ID_L3RTRS1QIY6D"]