<a href="https://colab.research.google.com/github/lehai0609/KagglePlayground/blob/main/PlaygroundS05E06_LightGBM_and_Improving_031.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import data

In [64]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Folder that holds every CSV file read by this notebook
base_folder = '/content/drive/MyDrive/Kaggle/Playground S5E6'

# Read the three inputs in one pass:
#   train.csv                 -> synthetic_df
#   test.csv                  -> test_df
#   Fertilizer Prediction.csv -> original_df
synthetic_df, test_df, original_df = (
    pd.read_csv(os.path.join(base_folder, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'Fertilizer Prediction.csv')
)

In [65]:
# Print column names, non-null counts and dtypes of original_df
# (the frame loaded from 'Fertilizer Prediction.csv').
original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Temparature      100000 non-null  int64 
 1   Humidity         100000 non-null  int64 
 2   Moisture         100000 non-null  int64 
 3   Soil Type        100000 non-null  object
 4   Crop Type        100000 non-null  object
 5   Nitrogen         100000 non-null  int64 
 6   Potassium        100000 non-null  int64 
 7   Phosphorous      100000 non-null  int64 
 8   Fertilizer Name  100000 non-null  object
dtypes: int64(6), object(3)
memory usage: 6.9+ MB


In [66]:
# Print column names, non-null counts and dtypes of synthetic_df
# (the frame loaded from 'train.csv').
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


## 2. Preparation for modelling

Use a basic LightGBM model as the baseline.
- Encode categorical features & target

In [67]:
# Handle categorical variables.
# Every frame is cast to the SAME CategoricalDtype. Casting each frame on its
# own with astype('category') derives the categories from that frame only, so
# a value missing from one frame would shift the integer codes of the others,
# and pd.concat of columns with differing categories falls back to object
# dtype. When all frames hold the same value set the result is identical to
# the per-frame cast, because categories are sorted in both cases.
categorical_features = ['Soil Type', 'Crop Type']
for col in categorical_features:
    # Union of the values observed in any of the three frames (NaN excluded,
    # it is not a category).
    observed_values = (
        set(synthetic_df[col].dropna().unique())
        | set(test_df[col].dropna().unique())
        | set(original_df[col].dropna().unique())
    )
    shared_dtype = pd.CategoricalDtype(categories=sorted(observed_values))

    synthetic_df[col] = synthetic_df[col].astype(shared_dtype)
    test_df[col] = test_df[col].astype(shared_dtype)
    original_df[col] = original_df[col].astype(shared_dtype)

In [68]:
# Define features and target.
# 'Fertilizer Name' is the target variable; it is label-encoded below.
target_col = 'Fertilizer Name'
# NOTE(review): 'Phosphorous' is a column of both synthetic_df and original_df
# but is not listed here — confirm that leaving it out is intentional.
feature_cols = ['Soil Type', 'Crop Type', 'Humidity', 'Temparature', 'Moisture', 'Nitrogen', 'Potassium']

X = synthetic_df[feature_cols]
X_original = original_df[feature_cols]

# Fit ONE encoder on the synthetic (competition) labels and reuse it for the
# original data. Fitting a second, independent encoder would (a) give the two
# label arrays different integer mappings whenever their class sets differ and
# (b) overwrite `target_encoder`, which is used later to decode predictions.
# The encoder is always fitted (no dtype guard) so `target_encoder` is defined
# whatever string dtype pandas assigns to the column.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(synthetic_df[target_col])
y = y_encoded

# transform() (not fit_transform) keeps the mapping learned above; it raises
# ValueError if original_df contains a label never seen in synthetic_df.
y_original_encoded = target_encoder.transform(original_df[target_col])
y_original = y_original_encoded


# LightGBM training configuration (multiclass, log-loss metric).
# The number of classes is taken from the distinct values of the encoded target.
params = dict(
    objective='multiclass',
    num_class=np.unique(y).size,
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,  # keep LightGBM quiet
    random_state=42,
)

# Custom MAP@3 calculation function
def mapk(y_true, y_pred_proba, k=3):
    """Calculate the MAP@k score for single-label classification.

    Each sample scores ``1 / rank`` when its true label is among the ``k``
    highest-probability classes (rank 1 = most probable) and ``0`` otherwise;
    the result is the mean over all samples.

    Parameters
    ----------
    y_true : array-like of int, length n_samples
        True class indices (column positions in ``y_pred_proba``).
    y_pred_proba : array-like, shape (n_samples, n_classes)
        Predicted score/probability per class.
    k : int, default 3
        Number of top predictions considered; must be >= 1. If ``k`` exceeds
        the number of classes, all classes are ranked.

    Returns
    -------
    float
        Mean of the per-sample scores.

    Raises
    ------
    ValueError
        If ``k < 1`` or if ``y_true`` and ``y_pred_proba`` disagree on the
        number of samples.
    """
    if k < 1:
        # A non-positive k would make the `-k:` slice below select every column.
        raise ValueError(f"k must be >= 1, got {k}")

    labels = np.asarray(y_true).reshape(-1)
    proba = np.asarray(y_pred_proba)
    if labels.shape[0] != proba.shape[0]:
        # Fail loudly instead of silently scoring the unmatched rows as 0.
        raise ValueError(
            f"y_true has {labels.shape[0]} samples but y_pred_proba has {proba.shape[0]}"
        )

    # Column indices of the k largest scores per row, best first.
    top_k = np.argsort(proba, axis=1)[:, -k:][:, ::-1]

    # Boolean matrix with at most one True per row: the position of the true label.
    hits = top_k == labels[:, None]

    # Position j (0-based) is worth 1 / (j + 1).
    rank_weights = 1.0 / np.arange(1, top_k.shape[1] + 1)

    return np.mean((hits * rank_weights).sum(axis=1))

In [None]:
# Stratified 5-fold cross-validation over the synthetic data (X, y)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Filled in by the loop below: one entry per completed fold
cv_scores = []
cv_map3_scores = []
models = []

# Each fold validates on synthetic rows only; the original dataset
# (X_original, y_original) is appended to the training part of every fold.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold {fold + 1} out of 5")

    # Training / validation slices of the synthetic data
    X_train_synthetic = X.iloc[train_idx]
    y_train_synthetic = y[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y[val_idx]

    # Training set = synthetic training slice + all original rows
    X_train_combined = pd.concat([X_train_synthetic, X_original], ignore_index=True)
    y_train_combined = np.concatenate([y_train_synthetic, y_original])

    # Wrap both sets for LightGBM
    train_data = lgb.Dataset(X_train_combined, label=y_train_combined)
    val_data = lgb.Dataset(X_val, label=y_val)

    # At most 500 rounds, with early stopping after 50 rounds without
    # improvement on the validation set; log_evaluation is called with 0
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(0),
    ]
    model = lgb.train(
        params,
        train_data,
        num_boost_round=500,
        valid_sets=[val_data],
        callbacks=training_callbacks,
    )

    # Class probabilities at the best iteration, and the arg-max class
    val_pred_proba = model.predict(X_val, num_iteration=model.best_iteration)
    val_pred_classes = val_pred_proba.argmax(axis=1)

    # Fold metrics
    accuracy = accuracy_score(y_val, val_pred_classes)
    map3_score = mapk(y_val, val_pred_proba, k=3)
    print(f"Accuracy: {accuracy}")
    print(f"MAP@3 Score: {map3_score}")

    # Keep the fold's results and its model for the test predictions
    cv_scores.append(accuracy)
    cv_map3_scores.append(map3_score)
    models.append(model)

# Summary over all completed folds
print("\n" + "=" * 50)
print("CROSS-VALIDATION RESULTS")
print("=" * 50)
print(f"Accuracy - Mean: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")
print(f"MAP@3 - Mean: {np.mean(cv_map3_scores):.4f} ± {np.std(cv_map3_scores):.4f}")
print("\nFold-by-fold MAP@3 scores:")
for i, score in enumerate(cv_map3_scores):
    print(f"Fold {i+1}: {score:.4f}")

Fold 1 out of 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 1.92198
Accuracy: 0.19229333333333334
MAP@3 Score: 0.327
Fold 2 out of 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 1.92161
Accuracy: 0.1931
MAP@3 Score: 0.3278155555555555
Fold 3 out of 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 1.92184
Accuracy: 0.19307333333333335
MAP@3 Score: 0.32740888888888886
Fold 4 out of 5
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 1.92223
Accuracy: 0.19274666666666668
MAP@3 Score: 0.3271333333333333
Fold 5 out of 5
Training until validation scores don't improve for 50 rounds


## 3. Generate the MAP@3 test submission

In [None]:
# Load the sample submission from base_folder and preview its first rows
sample_path = os.path.join(base_folder, 'sample_submission.csv')
sample_df = pd.read_csv(sample_path)
sample_df.head()

In [71]:
print("\nGenerating test predictions...")
# Test rows restricted to the feature columns used for training
test_features = test_df[feature_cols]

# Fold-averaged class scores: every stored model contributes an equal share
prediction_shape = (len(test_df), len(np.unique(y)))
test_pred_avg = np.zeros(prediction_shape)
for model in models:
    test_pred = model.predict(test_features, num_iteration=model.best_iteration)
    test_pred_avg += test_pred / len(models)

# Per row: column indices of the three highest averaged scores, best first
ascending_order = np.argsort(test_pred_avg, axis=1)
top3_indices = np.fliplr(ascending_order[:, -3:])

# Decode the indices back to fertilizer names, keeping three names per row
decoded_flat = target_encoder.inverse_transform(top3_indices.ravel())
fertilizer_names = decoded_flat.reshape(-1, 3)

# One row per test id; the three names are joined with single spaces
submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': list(map(' '.join, fertilizer_names)),
})

# Write the submission file and show its first rows
submission.to_csv('submission.csv', index=False)
print(submission.head())


Generating test predictions...
       id             Fertilizer Name
0  750000          28-28 10-26-26 DAP
1  750001     17-17-17 20-20 10-26-26
2  750002     20-20 14-35-14 10-26-26
3  750003  14-35-14 17-17-17 10-26-26
4  750004        20-20 28-28 10-26-26
