<a href="https://colab.research.google.com/github/lehai0609/KagglePlayground/blob/main/PlaygroundS05E06_LightGBM_and_Improving_031.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Import data

In [36]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

# Define the base folder path
base_folder = '/content/drive/MyDrive/Kaggle/Playground S5E6'

# Import train.csv as synthetic_df
synthetic_df = pd.read_csv(os.path.join(base_folder, 'train.csv'))
test_df = pd.read_csv(os.path.join(base_folder, 'test.csv'))
original_df = pd.read_csv(os.path.join(base_folder, 'Fertilizer Prediction.csv'))

In [37]:
synthetic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               750000 non-null  int64 
 1   Temparature      750000 non-null  int64 
 2   Humidity         750000 non-null  int64 
 3   Moisture         750000 non-null  int64 
 4   Soil Type        750000 non-null  object
 5   Crop Type        750000 non-null  object
 6   Nitrogen         750000 non-null  int64 
 7   Potassium        750000 non-null  int64 
 8   Phosphorous      750000 non-null  int64 
 9   Fertilizer Name  750000 non-null  object
dtypes: int64(7), object(3)
memory usage: 57.2+ MB


## 2. Preparation for modelling

Use basic lightGBM for baseline modelling.
- Encode categorical features & target

In [38]:
# Handle Categorical variables
categorical_features = ['Soil Type', 'Crop Type']
for col in categorical_features:
  synthetic_df[col] = synthetic_df[col].astype('category')
  test_df[col] = test_df[col].astype('category')

In [39]:
# Define features and target
# Fertilizer Name is my target variable. And it should be encoded as categorical data too.
target_col = 'Fertilizer Name'
feature_cols = ['Soil Type', 'Crop Type', 'Humidity', 'Temparature', 'Moisture', 'Nitrogen', 'Potassium']

X = synthetic_df[feature_cols]
y = synthetic_df[target_col]

# If target is categorical, encode it
if y.dtype == 'object':
    target_encoder = LabelEncoder()
    y_encoded = target_encoder.fit_transform(y)
    y = y_encoded

In [40]:
# Train_test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Training set: (600000, 7)
Test set: (150000, 7)


In [41]:
from unittest.mock import call
# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

# Basic parameters for baseline
params = {
    'objective': 'multiclass',  # or 'regression' if predicting continuous values
    'num_class': len(np.unique(y)),  # number of classes
    'metric': 'multi_logloss',  # or 'rmse' for regression
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

# Train the model
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 1.92116	valid_1's multi_logloss: 1.92962


In [42]:
# Prediction and Evaluation
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred_classes = np.argmax(y_pred, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy}")

# Classification report
print(classification_report(y_test, y_pred_classes))

Accuracy: 0.1823
              precision    recall  f1-score   support

           0       0.18      0.22      0.20     22777
           1       0.18      0.26      0.21     22887
           2       0.18      0.24      0.21     22491
           3       0.19      0.17      0.18     22178
           4       0.18      0.22      0.19     22232
           5       0.19      0.09      0.12     18972
           6       0.19      0.03      0.06     18463

    accuracy                           0.18    150000
   macro avg       0.18      0.18      0.17    150000
weighted avg       0.18      0.18      0.17    150000



## 3. Output the Map@3 testing

In [43]:
sample_df = pd.read_csv(os.path.join(base_folder, 'sample_submission.csv'))
sample_df.head()

Unnamed: 0,id,Fertilizer Name
0,750000,14-35-14 10-26-26 Urea
1,750001,14-35-14 10-26-26 Urea
2,750002,14-35-14 10-26-26 Urea
3,750003,14-35-14 10-26-26 Urea
4,750004,14-35-14 10-26-26 Urea


In [44]:
# Prepare Test features
test_features = test_df[feature_cols]

# Predict probablility
test_probs = model.predict(test_features, num_iteration=model.best_iteration)

# Get top 3 indices
top3_indices = np.argsort(test_probs, axis=1)[:, -3:][:, ::-1]

# Convert indices back to fertilizer names
fertilizer_names = target_encoder.inverse_transform(top3_indices.flatten()).reshape(-1, 3)

# Create submission DataFrame
submission = pd.DataFrame({
    'id': test_df['id'],
    'Fertilizer Name': [' '.join(row) for row in fertilizer_names]
})

# Save submission
submission.to_csv('submission.csv', index=False)
print(submission.head())

       id             Fertilizer Name
0  750000          DAP 10-26-26 28-28
1  750001     17-17-17 20-20 10-26-26
2  750002        20-20 14-35-14 28-28
3  750003  14-35-14 17-17-17 10-26-26
4  750004        20-20 28-28 17-17-17
