# Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.model_selection import train_test_split
import os

# Load Dataset

In [2]:
# Define the path to the processed dataset
file_path = '../data/processed/merged_dataset.csv'

In [3]:
# Load the dataset
df = pd.read_csv(file_path)

In [4]:
# Drop unnecessary columns
# These columns are not needed for modeling and can be removed to simplify the dataset
# Missing indicators: WB_CCKP_HD40, WB_CCKP_HD42, WB_CCKP_HI37, WB_CCKP_HI39, WB_CCKP_HI41, WB_CCKP_HD50
df = df.drop(columns=[
    'WB_CCKP_HD40', 'WB_CCKP_HD42', 'WB_CCKP_HI37', 'WB_CCKP_HI39', 'WB_CCKP_HI41',
    'WB_CCKP_HD50', 'AREA', 'YEAR', 'AREA_HARVESTED', 'PRODUCTION_QUANTITY'
])

In [5]:
# Rename 'Maize (corn)' to 'Maize' in the ITEM column
df['ITEM'] = df['ITEM'].replace('Maize (corn)', 'Maize')

# Type Casting

In [6]:
# Display the data types of each column in the dataframe
print(df.dtypes)

ITEM                object
YIELD              float64
WB_CCKP_CDD        float64
WB_CCKP_CDD65      float64
WB_CCKP_CSDI       float64
WB_CCKP_CWD        float64
WB_CCKP_FD         float64
WB_CCKP_HD30       float64
WB_CCKP_HD35       float64
WB_CCKP_HD45       float64
WB_CCKP_HDD65      float64
WB_CCKP_HI35       float64
WB_CCKP_HURS       float64
WB_CCKP_ID         float64
WB_CCKP_PR         float64
WB_CCKP_R20MM      float64
WB_CCKP_R50MM      float64
WB_CCKP_R95PTOT    float64
WB_CCKP_RX1DAY     float64
WB_CCKP_RX5DAY     float64
WB_CCKP_SD         float64
WB_CCKP_TAS        float64
WB_CCKP_TASMAX     float64
WB_CCKP_TASMIN     float64
WB_CCKP_TNN        float64
WB_CCKP_TR         float64
WB_CCKP_TR23       float64
WB_CCKP_TR26       float64
WB_CCKP_TR29       float64
WB_CCKP_TR32       float64
WB_CCKP_TX84RR     float64
WB_CCKP_TXX        float64
WB_CCKP_WSDI       float64
dtype: object


In [7]:
# Check for non numeric columns (only non numeric should be 'ITEM')
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) == 0:
    print("All columns are numerical.")
else:
    print("The following columns are not numerical:", list(non_numeric_cols))

The following columns are not numerical: ['ITEM']


# Missing Value Handling

In [8]:
# Check for missing values in the dataframe
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("No missing values found in the dataframe.")
else:
    print("Columns with missing values:")
    print(missing_values[missing_values > 0])

No missing values found in the dataframe.


# Outlier Detection

In [9]:
# Detect outliers in each numeric column using the IQR method
outlier_info = {}
total_outliers = 0
n_rows = len(df)

for col in df.select_dtypes(include=[np.number]).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = ((df[col] < lower) | (df[col] > upper))
    n_outliers = outliers.sum()
    if n_outliers > 0:
        outlier_info[col] = n_outliers
        total_outliers += n_outliers

print(f"Total outliers: {total_outliers}")
print("Outliers per column:")
for col, count in outlier_info.items():
    print(f"  {col}: {count} ({count/n_rows*100:.2f}%)")
print(f"Percentage of outliers (all columns, not unique rows): {total_outliers/(n_rows*len(df.select_dtypes(include=[np.number]).columns))*100:.2f}%")

Total outliers: 4772
Outliers per column:
  YIELD: 32 (1.05%)
  WB_CCKP_CDD: 286 (9.39%)
  WB_CCKP_CDD65: 53 (1.74%)
  WB_CCKP_CSDI: 167 (5.48%)
  WB_CCKP_CWD: 100 (3.28%)
  WB_CCKP_FD: 93 (3.05%)
  WB_CCKP_HD30: 555 (18.22%)
  WB_CCKP_HD35: 60 (1.97%)
  WB_CCKP_HD45: 18 (0.59%)
  WB_CCKP_HDD65: 229 (7.52%)
  WB_CCKP_HI35: 110 (3.61%)
  WB_CCKP_HURS: 15 (0.49%)
  WB_CCKP_ID: 93 (3.05%)
  WB_CCKP_PR: 113 (3.71%)
  WB_CCKP_R20MM: 96 (3.15%)
  WB_CCKP_R50MM: 346 (11.36%)
  WB_CCKP_R95PTOT: 87 (2.86%)
  WB_CCKP_RX1DAY: 48 (1.58%)
  WB_CCKP_RX5DAY: 66 (2.17%)
  WB_CCKP_SD: 48 (1.58%)
  WB_CCKP_TAS: 173 (5.68%)
  WB_CCKP_TASMAX: 173 (5.68%)
  WB_CCKP_TASMIN: 173 (5.68%)
  WB_CCKP_TNN: 133 (4.37%)
  WB_CCKP_TR: 48 (1.58%)
  WB_CCKP_TR23: 59 (1.94%)
  WB_CCKP_TR26: 235 (7.72%)
  WB_CCKP_TR29: 498 (16.35%)
  WB_CCKP_TR32: 537 (17.63%)
  WB_CCKP_TX84RR: 35 (1.15%)
  WB_CCKP_TXX: 30 (0.98%)
  WB_CCKP_WSDI: 63 (2.07%)
Percentage of outliers (all columns, not unique rows): 4.90%


# Duplicate Removal

In [10]:
# Count duplicate rows in the dataframe
n_duplicates = df.duplicated().sum()
pct_duplicates = (n_duplicates / n_rows) * 100

print(f"Number of duplicate rows: {n_duplicates}")
print(f"Percentage of duplicate rows: {pct_duplicates:.2f}%")

# Remove duplicate rows
df = df.drop_duplicates()

Number of duplicate rows: 0
Percentage of duplicate rows: 0.00%


# One-Hot Encoding

In [11]:
# One-hot encode the 'ITEM' column for modeling
df_encoded = pd.get_dummies(df, columns=['ITEM'], prefix='ITEM')
display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HDD65,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,1000.0,27.05,548.49,5.6,14.53,20.12,0.0,0.0,0.0,4382.08,...,25.22,2.65,0.02,0.0,0.05,24.78,21.31,True,False,False
1,923.1,23.78,492.26,33.37,9.44,33.1,0.0,0.0,0.0,5171.71,...,21.68,2.48,0.0,0.0,0.03,24.22,0.38,True,False,False
2,1380.2,29.91,579.35,7.62,10.14,30.34,0.0,0.0,0.0,4862.13,...,27.46,5.56,0.11,0.0,0.08,25.25,0.0,True,False,False
3,1332.4,24.83,629.48,7.81,9.02,32.41,0.02,0.0,0.0,4799.37,...,30.26,6.19,0.26,0.0,0.1,26.22,6.07,True,False,False
4,2352.3,24.24,810.85,0.0,11.43,14.55,0.32,0.0,0.0,4272.51,...,42.15,11.38,0.88,0.08,0.15,27.95,9.24,True,False,False


# Scale Numeric Features

In [12]:
# Select numeric feature columns (excluding the target)
features_to_scale = df_encoded.select_dtypes(include=[np.number]).columns.drop('YIELD')

# Create a new scaler instance and scale only the features
scaler = StandardScaler()
df_encoded[features_to_scale] = scaler.fit_transform(df_encoded[features_to_scale])

# YIELD remains untouched
display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HDD65,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,1000.0,0.254177,0.349305,-0.142392,1.628061,-0.707982,-0.367223,-0.080685,-0.056228,-0.774895,...,0.203528,-0.379913,-0.438591,-0.221775,-1.382162,-0.160958,1.507791,True,False,False
1,923.1,0.014224,0.194211,3.532471,-0.322191,-0.407172,-0.367223,-0.080685,-0.056228,-0.430769,...,0.043177,-0.400767,-0.451262,-0.221775,-1.590963,-0.354197,-1.05065,True,False,False
2,1380.2,0.464043,0.434423,0.124919,-0.053984,-0.471135,-0.367223,-0.080685,-0.056228,-0.565686,...,0.304993,-0.022941,-0.381574,-0.221775,-1.068959,0.001226,-1.097101,True,False,False
3,1332.4,0.091273,0.572692,0.150062,-0.483116,-0.423163,-0.342747,-0.080685,-0.056228,-0.593037,...,0.431824,0.054342,-0.286545,-0.221775,-0.860158,0.335945,-0.355116,True,False,False
4,2352.3,0.047979,1.072949,-0.883452,0.440285,-0.837066,0.024392,-0.080685,-0.056228,-0.822647,...,0.970404,0.691003,0.106242,0.242744,-0.338154,0.932917,0.032378,True,False,False


# Split Data (Holdout Method)

In [99]:
# Stratify by the item columns to maintain balance
item_col = ['ITEM_Barley', 'ITEM_Maize', 'ITEM_Wheat']

# Find the item label for each row
item_labels = df_encoded[item_col].idxmax(axis=1)

X_train, X_test = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=42,
    stratify=item_labels
)

# Count items in each set
train_counts = X_train[item_col].sum()
test_counts = X_test[item_col].sum()

print("Training set item counts:")
print(train_counts)
print("\nTesting set item counts:")
print(test_counts)

Training set item counts:
ITEM_Barley    884
ITEM_Maize     675
ITEM_Wheat     877
dtype: int64

Testing set item counts:
ITEM_Barley    221
ITEM_Maize     169
ITEM_Wheat     220
dtype: int64


# Save Datasets

In [100]:
# Create the output directory if it doesn't exist
output_dir = "../data/final/"
os.makedirs(output_dir, exist_ok=True)

# Save the full encoded dataset
df_encoded.to_csv(os.path.join(output_dir, "modeling_dataset.csv"), index=False)

# Save the training and testing sets
X_train.to_csv(os.path.join(output_dir, "training_set.csv"), index=False)
X_test.to_csv(os.path.join(output_dir, "testing_set.csv"), index=False)