# **LightGBM with best parameters tuned using Optuna + Cross-valiodation + Feature extraction + Data augmentation**

**NOTE: The hyper parameter tuning using Optuna was done in a separate notebook**

In [None]:
!pip install loguru
!pip install optuna

Collecting loguru
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import optuna
from optuna.samplers import TPESampler
import pickle
import warnings

warnings.simplefilter("ignore")

In [None]:
metadata = pd.read_csv("/kaggle/input/tick-tick-bloom-data/metadata.csv")
train_labels = pd.read_csv("/kaggle/input/tick-tick-bloom-data/train_labels.csv")
image_features = pd.read_csv("/kaggle/input/tick-tick-bloom-data/image_features.csv")
image_features = image_features.rename(columns={'Unnamed: 0': 'uid'})

In [None]:
metadata.head()

In [None]:
# Select the rows that correspond to the training set
metadata_train = metadata.loc[metadata['split'] == 'train']

# Select the rows that correspond to the test set
metadata_test = metadata.loc[metadata['split'] == 'test']

In [None]:
train_labels.head()

In [None]:
image_features.head()

In [2]:
#Merge dataframes
train_data = pd.merge(metadata_train, image_features, on='uid')
train_data = pd.merge(train_data, train_labels, on='uid')
#drop useless columns
train_data = train_data.drop(columns=['split','density'])
#One hot encode region
#train_data = pd.get_dummies(train_data, columns=['region'])

#Convert date to to year and month
train_data['year'] = pd.to_datetime(train_data['date']).dt.year
train_data['month'] = pd.to_datetime(train_data['date']).dt.month
train_data = train_data.drop(columns=['date'])

NameError: name 'pd' is not defined

# Feature extraction

**Combinations**

In [None]:
# Overall average color value
train_data['overall_average'] = (train_data['red_average'] + train_data['green_average'] + train_data['blue_average']) / 3

# Overall median color value
train_data['overall_median'] = train_data[['red_median', 'green_median', 'blue_median']].median(axis=1)

# Standard deviation for each color channel
train_data['red_std'] = train_data[['red_median', 'red_average']].std(axis=1)
train_data['green_std'] = train_data[['green_median', 'green_average']].std(axis=1)
train_data['blue_std'] = train_data[['blue_median', 'blue_average']].std(axis=1)

# Overall std color value
train_data['overall_std'] = train_data[['red_std', 'green_std', 'blue_std']].median(axis=1)

In [None]:
train_data

# Data augmentation

In [None]:
# Define the augmentation function
def augment_data(data, mean, std):
    return data + np.random.normal(mean, std)

In [None]:
# Create a new dataframe to store the augmented data
augmented_df = pd.DataFrame(columns=train_data.columns)

In [3]:
# Apply the augmentation function to each image feature column
for i, row in train_data.iterrows():
    augmented_row = row.copy()
    for col in train_data.columns:
        if col in ['severity', 'region','uid']:
            continue
        data = row[col]
        augmented_data = augment_data(data, 0, 0.1)
        augmented_row[col] = augmented_data
    augmented_df = augmented_df.append(augmented_row, ignore_index=True)

NameError: name 'train_data' is not defined

In [None]:
augmented_df

In [None]:
from sklearn.model_selection import train_test_split

# Combine the original data and the augmented data
train_data = pd.concat([train_data, augmented_df], ignore_index=True)

train_data["severity"] = train_data["severity"].astype(int)
train_data["year"] = train_data["year"].astype(int)
train_data["month"] = train_data["month"].astype(int)

In [None]:
train_data.to_csv("train_new.csv")

In [16]:
train_data

Unnamed: 0,uid,latitude,longitude,red_average,green_average,blue_average,red_median,green_median,blue_median,region,severity,year,month,overall_average,overall_median,red_std,green_std,blue_std,overall_std
0,aabm,39.080319,-86.430867,223.455782,227.185941,216.442177,255.000000,255.000000,255.000000,midwest,1,2018,5,222.361300,255.000000,22.305130,19.667510,27.264498,22.305130
1,aacd,35.875083,-78.878434,46.590909,43.385281,25.987013,40.000000,38.000000,20.000000,south,1,2020,11,38.654401,38.000000,4.660477,3.807969,4.233457,4.233457
2,aaee,35.487000,-79.062133,25.006494,44.538961,20.872294,20.000000,40.000000,19.000000,south,1,2016,8,30.139250,20.000000,3.540126,3.209530,1.323912,3.209530
3,aaff,38.049471,-99.827001,49.880952,52.298701,32.225108,22.000000,25.000000,16.000000,midwest,3,2019,7,44.801587,22.000000,19.714810,19.303097,11.472884,19.303097
4,aafl,39.474744,-86.898353,95.807256,102.832200,74.993197,53.000000,71.000000,46.000000,midwest,4,2021,8,91.210884,53.000000,30.269301,22.508764,20.501286,22.508764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33799,zzsv,38.774495,-75.195796,255.072860,254.962871,255.102111,255.012097,255.131530,254.973501,south,3,2017,6,254.856865,254.862547,-0.025967,0.110441,-0.036297,0.057546
33800,zzuq,35.904342,-79.020370,0.129493,-0.077115,0.157920,-0.105595,-0.118381,-0.017720,south,3,2015,7,0.130549,-0.112245,0.138887,-0.003566,-0.093636,0.106404
33801,zzwo,39.748652,-99.996586,33.059657,49.738809,30.826056,26.026767,43.044100,25.905820,midwest,2,2017,6,37.731469,25.950664,5.051927,4.604028,3.297238,4.513391
33802,zzwq,35.392182,-79.009030,-0.140520,0.083303,-0.123086,0.029531,0.005583,0.011202,south,1,2015,3,0.047688,0.057797,0.079137,0.052640,0.046437,-0.011616


In [17]:
# split into train and validation
rng = np.random.RandomState(42)
train_data["split"] = rng.choice(
    ["train", "validation"], size=len(train_data), replace=True, p=[0.67, 0.33]
)
train_data.to_csv("train_data.csv", index=False)
train_data.head()

Unnamed: 0,uid,latitude,longitude,red_average,green_average,blue_average,red_median,green_median,blue_median,region,severity,year,month,overall_average,overall_median,red_std,green_std,blue_std,overall_std,split
0,aabm,39.080319,-86.430867,223.455782,227.185941,216.442177,255.0,255.0,255.0,midwest,1,2018,5,222.3613,255.0,22.30513,19.66751,27.264498,22.30513,train
1,aacd,35.875083,-78.878434,46.590909,43.385281,25.987013,40.0,38.0,20.0,south,1,2020,11,38.654401,38.0,4.660477,3.807969,4.233457,4.233457,validation
2,aaee,35.487,-79.062133,25.006494,44.538961,20.872294,20.0,40.0,19.0,south,1,2016,8,30.13925,20.0,3.540126,3.20953,1.323912,3.20953,validation
3,aaff,38.049471,-99.827001,49.880952,52.298701,32.225108,22.0,25.0,16.0,midwest,3,2019,7,44.801587,22.0,19.71481,19.303097,11.472884,19.303097,train
4,aafl,39.474744,-86.898353,95.807256,102.8322,74.993197,53.0,71.0,46.0,midwest,4,2021,8,91.210884,53.0,30.269301,22.508764,20.501286,22.508764,train


# Modeling

# LightGBM

In [23]:
%%writefile train_gbm_model.py
import lightgbm as lgb

import joblib
import numpy as np
from pathlib import Path
from loguru import logger
import typer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

def main(
    data_path="/kaggle/working/",
    model_save_path="/kaggle/working/lgb_classifier.txt",
    x_val_pth = "/kaggle/working/x_val.npy",
    y_val_pth = "/kaggle/working/y_val.npy"
):
    """
    Train a LightGBM model based on training features in features_path and
    training labels in labels_path. Save our the trained model to model_save_path"""
    #Read data
    data = pd.read_csv(data_path + "train_data.csv")
    
    #Split X and y data
    X = data.drop(columns=['uid','region'])
    y = data["severity"]
    
    #Split train, and val data
    feature_cols = [
    "red_average",
    "green_average",
    "blue_average",
    "red_median",
    "green_median",
    "blue_median",
    'latitude',
    'longitude',
    'year',
    'month',
    'overall_average',
    'overall_median',
    'red_std',
    'green_std',
    'blue_std',
    'overall_std'
 ]
    target_col = "severity"
    
    val_set_mask = data.split == "validation"
    X_train = data.loc[~val_set_mask, feature_cols].values
    y_train = data.loc[~val_set_mask, target_col]
    X_val = data.loc[val_set_mask, feature_cols].values
    y_val = data.loc[val_set_mask, target_col]
    
    #parameters
    params = {'boosting_type' : 'gbdt',
              'lambda_l1' : 0.40915119192853316,
              'lambda_l2' : 0.014743770312866007,
              'num_leaves' : 249,
              'feature_fraction' : 0.6076408208183803,
              'bagging_fraction' : 0.9895340184265567,
              'bagging_freq' : 7,
              'min_child_samples'  : 85,
              'learning_rate' : 0.034931535619483665,
              'n_estimators' : 1000,
              'subsample' : 0.9284487072134361,
              'colsample_bytree' : 0.9020281188207814,
              'min_split_gain' : 0.029129356107522004,
              'reg_alpha' : 0.4008369867940891,
              'reg_lambda': 0.4821401208357889,              
              'max_depth' : -1,
              'min_child_weight' : 3.0985039512038317,
              'min_data_in_leaf' : 21,
              'is_unbalance' : True,
              'scale_pos_weight' : 0.29629472545169483,
              'drop_rate' : 0.10841892913710545,
              'max_drop' : 32,
              'skip_drop' : 0.9715442284335818,
              'xgboost_dart_mode' : True,
              'drop_seed' : 1844532202,
              'bagging_seed' : 14579769,
              'feature_fraction_seed' : 1653468328,
              'data_random_seed' : 419446037,
              'num_class' : 5
             }

    logger.info(f"Loaded training features of shape {X_train.shape}")
    logger.info(f"Loading training labels of shape {y_train.shape}")
    logger.info(f"Loaded validation features of shape {X_val.shape}")
    logger.info(f"Loading validation labels of shape {y_val.shape}")
    
    # instantiate tree model
    model = lgb.LGBMClassifier(**params)
    
    # Define the cross-validation and early stopping
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    # Fit the model on the training data
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=50, eval_metric='multi_logloss', verbose=False)

    # Get the best iteration
    best_iteration = model.best_iteration_
    
    logger.info("Fitting LGBM model on the best iteration")
    # Fit the model on the best iteration
    model.n_estimators = best_iteration
    model.fit(X_train, y_train)

    
    # save out model weights
    joblib.dump(model, str(model_save_path))
    logger.success(f"Model weights saved to {model_save_path}")
    
    # save out validation features
    x_val_pth = "/kaggle/working/x_val.npy"

    logger.info("Saving validation features")
    
    with open(x_val_pth, "wb") as f:
        np.save(f, X_val)
    with open(y_val_pth, "wb") as f:
        np.save(f, y_val)

if __name__ == "__main__":
    typer.run(main)

Overwriting train_gbm_model.py


In [24]:
!python train_gbm_model.py

[32m2023-01-14 23:14:35.244[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m91[0m - [1mLoaded training features of shape (22721, 16)[0m
[32m2023-01-14 23:14:35.245[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m92[0m - [1mLoading training labels of shape (22721,)[0m
[32m2023-01-14 23:14:35.245[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m93[0m - [1mLoaded validation features of shape (11083, 16)[0m
[32m2023-01-14 23:14:35.245[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m94[0m - [1mLoading validation labels of shape (11083,)[0m
[32m2023-01-14 23:14:49.298[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m108[0m - [1mFitting LGBM model on the best iteration[0m
[32m2023-01-14 23:15:00.073[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mmain[0m:[36m116[0m - [32m[1mModel weights saved to /kaggle/working/lgb_classifier.txt[0m
[32m2023-01-14 23:15:00.073[0m | [1mINFO    [0m | [36m__

In [25]:
%%writefile predict_gbm_model.py
import lightgbm as lgb

import joblib
from loguru import logger
import numpy as np
from pathlib import Path
import typer



def main(
    model_weights_path="/kaggle/working/lgb_classifier.txt",
    features_path="/kaggle/working/x_val.npy",
    preds_save_path="/kaggle/working/val_preds.npy",
):
    """
    Generate predictions with a LightGBM model using weights saved at model_weights_path
    and features saved at features_path. Save out predictions to preds_save_path.
    """
    # load model weights
    lgb_model = joblib.load(model_weights_path)
    logger.info(f"Loaded model {lgb_model} from {model_weights_path}")

    # load the features
    with open(features_path, "rb") as f:
        X_val = np.load(f)
    logger.info(f"Loaded features of shape {X_val.shape} from {features_path}")

    # generate predictions
    preds = lgb_model.predict(X_val)

    # save out predictions
    with open(preds_save_path, "wb") as f:
        np.save(f, preds)
    logger.success(f"Predictions saved to {preds_save_path}")


if __name__ == "__main__":
    typer.run(main)

Overwriting predict_gbm_model.py


In [26]:
!python predict_gbm_model.py

[32m2023-01-14 23:15:02.083[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m22[0m - [1mLoaded model LGBMClassifier(bagging_fraction=0.9895340184265567, bagging_freq=7,
               bagging_seed=14579769, colsample_bytree=0.9020281188207814,
               data_random_seed=419446037, drop_rate=0.10841892913710545,
               drop_seed=1844532202, feature_fraction=0.6076408208183803,
               feature_fraction_seed=1653468328, is_unbalance=True,
               lambda_l1=0.40915119192853316, lambda_l2=0.014743770312866007,
               learning_rate=0.034931535619483665, max_drop=32,
               min_child_samples=85, min_child_weight=3.0985039512038317,
               min_data_in_leaf=21, min_split_gain=0.029129356107522004,
               n_estimators=224, num_class=5, num_leaves=249,
               reg_alpha=0.4008369867940891, reg_lambda=0.4821401208357889,
               scale_pos_weight=0.29629472545169483,
               skip_drop=0.9715442284335818,

In [27]:
preds_pth = "/kaggle/working/val_preds.npy"
with open(preds_pth, "rb") as f:
    val_preds = np.load(f)

In [28]:
# get the validation part of the training data
val_set = train_data[train_data.split == "validation"][
    ["uid", "region", "severity"]
].copy()
val_set["pred"] = val_preds

val_set.head()

Unnamed: 0,uid,region,severity,pred
1,aacd,south,1,1
2,aaee,south,1,1
7,aahy,south,1,1
9,aaig,south,3,3
11,aamg,west,4,4


In [29]:
from sklearn.metrics import mean_squared_error
region_scores = []
for region in val_set.region.unique():
    sub = val_set[val_set.region == region]
    region_rmse = mean_squared_error(sub.severity, sub.pred, squared=False)
    print(f"RMSE for {region} (n={len(sub)}): {round(region_rmse, 4)}")
    region_scores.append(region_rmse)

overall_rmse = np.mean(region_scores)
print(f"Final score: {overall_rmse}")

RMSE for south (n=6425): 0.8621
RMSE for west (n=2454): 0.3787
RMSE for midwest (n=1449): 0.774
RMSE for northeast (n=755): 0.8187
Final score: 0.7083620582036203


In [30]:
with open("/kaggle/working/y_val.npy", "rb") as f:
        y_val = np.load(f)


In [31]:
# what's our RMSE across all validation data points?
mean_squared_error(y_val, val_preds, squared=False)

0.7658812129186375

In [32]:
# how many times did each severity level show up in our predictions vs. the actual values?
val_results = pd.DataFrame({"pred": val_preds, "actual": y_val})

pd.concat(
    [
        val_results.pred.value_counts().sort_index().rename("predicted"),
        val_results.actual.value_counts().sort_index().rename("actual"),
    ],
    axis=1,
).rename_axis("severity_level_count")

Unnamed: 0_level_0,predicted,actual
severity_level_count,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6173,4770
2,1284,2136
3,1293,1841
4,2319,2298
5,14,38


In [33]:
# get the image features for the test set
submission_format = pd.read_csv('/kaggle/input/submission/submission_format.csv')
test_features = pd.merge(submission_format, image_features, how='left')
test_features = pd.merge(test_features, metadata_test, how='left')

#drop useless columns
test_features = test_features.drop(columns=['split'])

#Convert date to to year and month
test_features['year'] = pd.to_datetime(test_features['date']).dt.year
test_features['month'] = pd.to_datetime(test_features['date']).dt.month
test_features = test_features.drop(columns=['date'])

#add the missing features
# Overall average color value
test_features['overall_average'] = (test_features['red_average'] + test_features['green_average'] + test_features['blue_average']) / 3

# Overall median color value
test_features['overall_median'] = test_features[['red_median', 'green_median', 'blue_median']].median(axis=1)

# Standard deviation for each color channel
test_features['red_std'] = test_features[['red_median', 'red_average']].std(axis=1)
test_features['green_std'] = test_features[['green_median', 'green_average']].std(axis=1)
test_features['blue_std'] = test_features[['blue_median', 'blue_average']].std(axis=1)


# Overall std color value
test_features['overall_std'] = test_features[['red_std', 'green_std', 'blue_std']].median(axis=1)

# make sure our features are in the same order as the submission format
assert (test_features.index == submission_format.index).all()


test_features.head()

Unnamed: 0,uid,region,severity,red_average,green_average,blue_average,red_median,green_median,blue_median,latitude,longitude,year,month,overall_average,overall_median,red_std,green_std,blue_std,overall_std
0,aabn,west,1,0.0,0.0,0.0,0.0,0.0,0.0,36.5597,-121.51,2016,8,0.0,0.0,0.0,0.0,0.0,0.0
1,aair,west,1,156.71875,141.578125,143.578125,159.5,146.5,146.5,33.0426,-117.076,2014,11,147.291667,146.5,1.966641,3.480291,2.066078,2.066078
2,aajw,northeast,1,99.0,113.795918,66.346939,93.0,103.0,60.0,40.703968,-80.29305,2015,8,93.047619,93.0,4.242641,7.633867,4.487963,4.487963
3,aalr,midwest,1,255.0,255.0,255.0,255.0,255.0,255.0,38.9725,-94.67293,2019,8,255.0,255.0,0.0,0.0,0.0,0.0
4,aalw,west,1,127.997732,105.900227,84.965986,128.0,93.0,72.0,34.279,-118.905,2018,1,106.287982,93.0,0.001603,9.121838,9.168337,9.121838


In [34]:
test_features.isna().sum()

uid                  0
region               0
severity             0
red_average        105
green_average      105
blue_average       105
red_median         105
green_median       105
blue_median        105
latitude             0
longitude            0
year                 0
month                0
overall_average    105
overall_median     105
red_std            105
green_std          105
blue_std           105
overall_std        105
dtype: int64

In [35]:
# fill in missing values
for avg_col in ["red_average", "green_average", "blue_average","overall_average"]:
    test_features[avg_col] = test_features[avg_col].fillna(
        test_features[avg_col].mean()
    )
for median_col in ["red_median", "green_median", "blue_median","overall_median"]:
    test_features[median_col] = test_features[median_col].fillna(
        test_features[median_col].median()
    )
for std_col in ["red_std", "green_std", "blue_std",'overall_std']:
    test_features[std_col] = test_features[std_col].fillna(
        test_features[std_col].std()
    )

In [36]:
# select feature columns
feature_cols = [
   "red_average",
    "green_average",
    "blue_average",
    "red_median",
    "green_median",
    "blue_median",
    'latitude',
    'longitude',
    'overall_std',
    'year',
    'month',
    'overall_average',
    'overall_median',
    'red_std',
    'green_std',
    'blue_std']

X_test = test_features[feature_cols].values

print(X_test.shape)

X_test[1]

(6510, 16)


array([ 1.56718750e+02,  1.41578125e+02,  1.43578125e+02,  1.59500000e+02,
        1.46500000e+02,  1.46500000e+02,  3.30426000e+01, -1.17076000e+02,
        2.06607763e+00,  2.01400000e+03,  1.10000000e+01,  1.47291667e+02,
        1.46500000e+02,  1.96664074e+00,  3.48029119e+00,  2.06607763e+00])

In [37]:
# save out test features
x_test_pth = "/kaggle/working/x_test.npy"
with open(x_test_pth, "wb") as f:
    np.save(f, X_test)

In [38]:
test_preds_pth = "/kaggle/workingtest_preds.npy"

In [39]:
!python predict_gbm_model.py --features-path {x_test_pth} --preds-save-path {test_preds_pth}

[32m2023-01-14 23:15:05.121[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m22[0m - [1mLoaded model LGBMClassifier(bagging_fraction=0.9895340184265567, bagging_freq=7,
               bagging_seed=14579769, colsample_bytree=0.9020281188207814,
               data_random_seed=419446037, drop_rate=0.10841892913710545,
               drop_seed=1844532202, feature_fraction=0.6076408208183803,
               feature_fraction_seed=1653468328, is_unbalance=True,
               lambda_l1=0.40915119192853316, lambda_l2=0.014743770312866007,
               learning_rate=0.034931535619483665, max_drop=32,
               min_child_samples=85, min_child_weight=3.0985039512038317,
               min_data_in_leaf=21, min_split_gain=0.029129356107522004,
               n_estimators=224, num_class=5, num_leaves=249,
               reg_alpha=0.4008369867940891, reg_lambda=0.4821401208357889,
               scale_pos_weight=0.29629472545169483,
               skip_drop=0.9715442284335818,

In [40]:
# load our predictions
with open(test_preds_pth, "rb") as f:
    test_preds = np.load(f)

In [41]:
submission = submission_format.copy()
submission["severity"] = test_preds

submission.head()

Unnamed: 0,uid,region,severity
0,aabn,west,4
1,aair,west,1
2,aajw,northeast,1
3,aalr,midwest,3
4,aalw,west,4


In [42]:
# save out our formatted submission
submission_save_path = "/kaggle/working/submission_LGBM_CV_DA_FE.csv"
submission.to_csv(submission_save_path, index=False)

In [43]:
# make sure our saved csv looks correct
!cat {submission_save_path} | head -5

uid,region,severity
aabn,west,4
aair,west,1
aajw,northeast,1
aalr,midwest,3
cat: write error: Broken pipe
