# Import Modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from IPython.display import display
from sklearn.model_selection import train_test_split
import os

# Load Dataset

In [2]:
# Define the path to the processed dataset
file_path = '../data/processed/merged_dataset.csv'

In [3]:
# Load the dataset
df = pd.read_csv(file_path)

In [4]:
# Drop unnecessary columns
# These columns are not needed for modeling and can be removed to simplify the dataset
# Missing indicators: WB_CCKP_HD40, WB_CCKP_HD42, WB_CCKP_HI37, WB_CCKP_HI39, WB_CCKP_HI41, WB_CCKP_HD50
df = df.drop(columns=[
    'WB_CCKP_HD40', 'WB_CCKP_HD42', 'WB_CCKP_HI37', 'WB_CCKP_HI39', 'WB_CCKP_HI41',
    'WB_CCKP_HD50', 'AREA', 'YEAR', 'AREA_HARVESTED', 'PRODUCTION_QUANTITY'
])

In [5]:
# Rename 'Maize (corn)' to 'Maize' in the ITEM column
df['ITEM'] = df['ITEM'].replace('Maize (corn)', 'Maize')

# Type Casting

In [6]:
# Display the data types of each column in the dataframe
print(df.dtypes)

ITEM                object
YIELD              float64
WB_CCKP_CDD        float64
WB_CCKP_CDD65      float64
WB_CCKP_CSDI       float64
WB_CCKP_CWD        float64
WB_CCKP_FD         float64
WB_CCKP_HD30       float64
WB_CCKP_HD35       float64
WB_CCKP_HD45       float64
WB_CCKP_HDD65      float64
WB_CCKP_HI35       float64
WB_CCKP_HURS       float64
WB_CCKP_ID         float64
WB_CCKP_PR         float64
WB_CCKP_R20MM      float64
WB_CCKP_R50MM      float64
WB_CCKP_R95PTOT    float64
WB_CCKP_RX1DAY     float64
WB_CCKP_RX5DAY     float64
WB_CCKP_SD         float64
WB_CCKP_TAS        float64
WB_CCKP_TASMAX     float64
WB_CCKP_TASMIN     float64
WB_CCKP_TNN        float64
WB_CCKP_TR         float64
WB_CCKP_TR23       float64
WB_CCKP_TR26       float64
WB_CCKP_TR29       float64
WB_CCKP_TR32       float64
WB_CCKP_TX84RR     float64
WB_CCKP_TXX        float64
WB_CCKP_WSDI       float64
dtype: object


In [7]:
# Check for non numeric columns (only non numeric should be 'ITEM')
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns
if len(non_numeric_cols) == 0:
    print("All columns are numerical.")
else:
    print("The following columns are not numerical:", list(non_numeric_cols))

The following columns are not numerical: ['ITEM']


# Missing Value Handling

In [8]:
# Check for missing values in the dataframe
missing_values = df.isnull().sum()
if missing_values.sum() == 0:
    print("No missing values found in the dataframe.")
else:
    print("Columns with missing values:")
    print(missing_values[missing_values > 0])

No missing values found in the dataframe.


# Outlier Detection

In [9]:
# Detect outliers in each numeric column using the IQR method
outlier_info = {}
total_outliers = 0
n_rows = len(df)

for col in df.select_dtypes(include=[np.number]).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    outliers = ((df[col] < lower) | (df[col] > upper))
    n_outliers = outliers.sum()
    if n_outliers > 0:
        outlier_info[col] = n_outliers
        total_outliers += n_outliers

print(f"Total outliers: {total_outliers}")
print("Outliers per column:")
for col, count in outlier_info.items():
    print(f"  {col}: {count} ({count/n_rows*100:.2f}%)")
print(f"Percentage of outliers (all columns, not unique rows): {total_outliers/(n_rows*len(df.select_dtypes(include=[np.number]).columns))*100:.2f}%")

Total outliers: 5066
Outliers per column:
  YIELD: 40 (1.23%)
  WB_CCKP_CDD: 299 (9.21%)
  WB_CCKP_CDD65: 57 (1.76%)
  WB_CCKP_CSDI: 170 (5.24%)
  WB_CCKP_CWD: 104 (3.20%)
  WB_CCKP_FD: 97 (2.99%)
  WB_CCKP_HD30: 588 (18.12%)
  WB_CCKP_HD35: 60 (1.85%)
  WB_CCKP_HD45: 18 (0.55%)
  WB_CCKP_HDD65: 243 (7.49%)
  WB_CCKP_HI35: 110 (3.39%)
  WB_CCKP_HURS: 15 (0.46%)
  WB_CCKP_ID: 97 (2.99%)
  WB_CCKP_PR: 145 (4.47%)
  WB_CCKP_R20MM: 105 (3.24%)
  WB_CCKP_R50MM: 360 (11.09%)
  WB_CCKP_R95PTOT: 96 (2.96%)
  WB_CCKP_RX1DAY: 48 (1.48%)
  WB_CCKP_RX5DAY: 69 (2.13%)
  WB_CCKP_SD: 50 (1.54%)
  WB_CCKP_TAS: 185 (5.70%)
  WB_CCKP_TASMAX: 185 (5.70%)
  WB_CCKP_TASMIN: 185 (5.70%)
  WB_CCKP_TNN: 141 (4.35%)
  WB_CCKP_TR: 50 (1.54%)
  WB_CCKP_TR23: 63 (1.94%)
  WB_CCKP_TR26: 248 (7.64%)
  WB_CCKP_TR29: 540 (16.64%)
  WB_CCKP_TR32: 555 (17.10%)
  WB_CCKP_TX84RR: 35 (1.08%)
  WB_CCKP_TXX: 42 (1.29%)
  WB_CCKP_WSDI: 66 (2.03%)
Percentage of outliers (all columns, not unique rows): 4.88%


# Duplicate Removal

In [10]:
# Count duplicate rows in the dataframe
n_duplicates = df.duplicated().sum()
pct_duplicates = (n_duplicates / n_rows) * 100

print(f"Number of duplicate rows: {n_duplicates}")
print(f"Percentage of duplicate rows: {pct_duplicates:.2f}%")

# Remove duplicate rows
df = df.drop_duplicates()

Number of duplicate rows: 0
Percentage of duplicate rows: 0.00%


# One-Hot Encoding

In [11]:
# One-hot encode the 'ITEM' column for modeling
df_encoded = pd.get_dummies(df, columns=['ITEM'], prefix='ITEM')
display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HDD65,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,1000.0,27.05,548.49,5.6,14.53,20.12,0.0,0.0,0.0,4382.08,...,25.22,2.65,0.02,0.0,0.05,24.78,21.31,True,False,False
1,923.1,23.78,492.26,33.37,9.44,33.1,0.0,0.0,0.0,5171.71,...,21.68,2.48,0.0,0.0,0.03,24.22,0.38,True,False,False
2,1380.2,29.91,579.35,7.62,10.14,30.34,0.0,0.0,0.0,4862.13,...,27.46,5.56,0.11,0.0,0.08,25.25,0.0,True,False,False
3,1332.4,24.83,629.48,7.81,9.02,32.41,0.02,0.0,0.0,4799.37,...,30.26,6.19,0.26,0.0,0.1,26.22,6.07,True,False,False
4,2352.3,24.24,810.85,0.0,11.43,14.55,0.32,0.0,0.0,4272.51,...,42.15,11.38,0.88,0.08,0.15,27.95,9.24,True,False,False


# Scale Numeric Features

In [12]:
# Select numeric feature columns (excluding the target)
features_to_scale = df_encoded.select_dtypes(include=[np.number]).columns.drop('YIELD')

# Create a new scaler instance and scale only the features
scaler = StandardScaler()
df_encoded[features_to_scale] = scaler.fit_transform(df_encoded[features_to_scale])

# YIELD remains untouched
display(df_encoded.head())

Unnamed: 0,YIELD,WB_CCKP_CDD,WB_CCKP_CDD65,WB_CCKP_CSDI,WB_CCKP_CWD,WB_CCKP_FD,WB_CCKP_HD30,WB_CCKP_HD35,WB_CCKP_HD45,WB_CCKP_HDD65,...,WB_CCKP_TR23,WB_CCKP_TR26,WB_CCKP_TR29,WB_CCKP_TR32,WB_CCKP_TX84RR,WB_CCKP_TXX,WB_CCKP_WSDI,ITEM_Barley,ITEM_Maize,ITEM_Wheat
0,1000.0,0.233778,0.35575,-0.135448,1.646696,-0.713337,-0.361975,-0.078157,-0.054472,-0.777041,...,0.211854,-0.371063,-0.43339,-0.219177,-1.378998,-0.152424,1.536358,True,False,False
1,923.1,-0.006406,0.199929,3.613478,-0.313387,-0.411892,-0.361975,-0.078157,-0.054472,-0.432414,...,0.05066,-0.392143,-0.446303,-0.219177,-1.589877,-0.34718,-1.035053,True,False,False
2,1380.2,0.443848,0.441268,0.137251,-0.043827,-0.47599,-0.361975,-0.078157,-0.054472,-0.567528,...,0.313853,-0.010211,-0.375281,-0.219177,-1.06268,0.011031,-1.081739,True,False,False
3,1332.4,0.070717,0.580185,0.1629,-0.475122,-0.427917,-0.337088,-0.078157,-0.054472,-0.594919,...,0.441351,0.067911,-0.278433,-0.219177,-0.851801,0.348376,-0.335993,True,False,False
4,2352.3,0.027381,1.082788,-0.891443,0.452932,-0.842694,0.03621,-0.078157,-0.054472,-0.824862,...,0.982764,0.711492,0.12187,0.246021,-0.324604,0.950033,0.053466,True,False,False


# Split Data (Holdout Method)

In [13]:
# Stratify by the item columns to maintain balance
item_col = ['ITEM_Barley', 'ITEM_Maize', 'ITEM_Wheat']

# Find the item label for each row
item_labels = df_encoded[item_col].idxmax(axis=1)

X_train, X_test = train_test_split(
    df_encoded,
    test_size=0.2,
    random_state=42,
    stratify=item_labels
)

# Count items in each set
train_counts = X_train[item_col].sum()
test_counts = X_test[item_col].sum()

print("Training set item counts:")
print(train_counts)
print("\nTesting set item counts:")
print(test_counts)

Training set item counts:
ITEM_Barley    942
ITEM_Maize     719
ITEM_Wheat     935
dtype: int64

Testing set item counts:
ITEM_Barley    235
ITEM_Maize     180
ITEM_Wheat     234
dtype: int64


# Save Datasets

In [14]:
# Create the output directory if it doesn't exist
output_dir = "../data/final/"
os.makedirs(output_dir, exist_ok=True)

# Save the full encoded dataset
df_encoded.to_csv(os.path.join(output_dir, "modeling_dataset.csv"), index=False)

# Save the training and testing sets
X_train.to_csv(os.path.join(output_dir, "training_set.csv"), index=False)
X_test.to_csv(os.path.join(output_dir, "testing_set.csv"), index=False)