In [None]:
#Primary Modules
import pandas as pd
import numpy as np

#Model_selection Tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score 

#Model
from lightgbm import LGBMClassifier as lgb

#Grid Search for Optimal Hyperparameters
from sklearn.model_selection import RandomizedSearchCV

#Data Object Output
import pickle

#Stacking Models
from vecstack import stacking

In [None]:
# Load the pre-processed train and test frames from Google Drive.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# produced by a trusted pipeline.
train_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        r"/content/drive/My Drive/Colab Data/Pet Adoption/modified_train.pickle",
        r"/content/drive/My Drive/Colab Data/Pet Adoption/modified_test.pickle",
    )
)

In [None]:
# Columns used as model inputs: *_b feeds the breed_category model,
# *_p feeds the pet_category model.
Final_features_b = [
    'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2', 'diff',
]
Final_features_p = [
    'condition_impute', 'color_type', 'pet_pred', 'X1', 'X2', 'diff',
]

# Take independent copies so later edits never write back into train_data.
X_b, X_p = (
    train_data[feature_columns].copy()
    for feature_columns in (Final_features_b, Final_features_p)
)


In [None]:
# Build the two target vectors as flat pandas Series.
# Wrapping the raw NumPy array in a new Series gives each target a fresh
# default 0..n-1 index instead of train_data's own index.

# Target for the breed model.
y_b = train_data['breed_category'].to_numpy()
y_b = y_b.reshape(y_b.shape[0], )
y_b = pd.Series(y_b)

# Target for the pet model. The reshaped array is now assigned back to y_p;
# previously it landed in a misspelled, unused variable (`yd_p`), so the pet
# target skipped the step that the breed target goes through.
y_p = train_data['pet_category'].to_numpy()
y_p = y_p.reshape(y_p.shape[0], )
y_p = pd.Series(y_p)

In [None]:
# Hold out 30% of the training rows for validation. Both targets are split
# with the same seed and the same settings.
split_options = dict(test_size=0.3, random_state=21)

X_b_train, X_b_valid, y_b_train, y_b_valid = train_test_split(X_b, y_b, **split_options)
X_p_train, X_p_valid, y_p_train, y_p_valid = train_test_split(X_p, y_p, **split_options)

# Test-set feature matrices, selected with the same column lists as training.
X_b_test, X_p_test = (
    test_data[feature_columns].copy()
    for feature_columns in (Final_features_b, Final_features_p)
)

In [None]:
#LGBM Benchmark

def _fit_benchmark(X_train, y_train, X_valid, y_valid, num_class, verbose):
    """Fit a baseline LightGBM classifier and score it on the validation split.

    Early stopping is monitored on the held-out validation rows. The previous
    version passed the training rows as ``eval_set``, which gives the 200-round
    patience no held-out signal to react to.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_valid, y_valid : held-out features and labels, used both for early
        stopping and for the reported score.
    num_class : value forwarded to LightGBM's ``num_class`` parameter.
    verbose : verbosity level forwarded to the LightGBM constructor.

    Returns
    -------
    tuple
        ``(model, eval_set, pred, f1)``: the fitted classifier, the evaluation
        set it was monitored on, its validation predictions, and the weighted
        F1 of those predictions.
    """
    model = lgb(n_estimators=10000, random_state=21, max_bin=64, extra_trees=True,
                objective='multiclass', num_class=num_class, verbose=verbose,
                early_stopping_rounds=200)
    monitored = [(X_valid, y_valid)]
    model.fit(X_train, y_train, eval_metric="error", eval_set=monitored, verbose=False)

    predictions = model.predict(X_valid)
    # NOTE: the same split drives early stopping and scoring, so this F1 is a
    # slightly optimistic baseline rather than an unbiased estimate.
    score = f1_score(y_valid, predictions, average="weighted")
    return model, monitored, predictions, score


# Breed-category baseline.
benchmark_model, eval_set, pred, f1 = _fit_benchmark(
    X_b_train, y_b_train, X_b_valid, y_b_valid, num_class=3, verbose=1)

print(f1)


# Pet-category baseline.
benchmark_model, eval_set, pred, f1 = _fit_benchmark(
    X_p_train, y_p_train, X_p_valid, y_p_valid, num_class=4, verbose=2)

print(f1)


Found `early_stopping_rounds` in params. Will use it instead of argument



0.8975409226942965
0.8916145224636047


In [None]:
# Randomized hyperparameter search for the breed_category classifier.

# Candidate values sampled by the search. Single-element lists pin a setting
# for every candidate.
gbm_param_grid = {
    'boosting_type':['gbdt'],
    'n_estimators': range(512,2048,128),
    'max_depth': range(3,6),
    'class_weight' : ['balanced'],
    'reg_alpha': np.linspace(0.0, 4, 16),
    'reg_lambda': np.linspace(0.0, 4, 16),
    'min_data_in_leaf': [20,30,40,50],
    'bagging_freq': [3, 4, 5, 6, 7],
    # `subsample` is a LightGBM alias of `bagging_fraction`. Searching both
    # wasted a dimension on a value that is ignored, so only the canonical
    # name is kept.
    'bagging_fraction': np.linspace(0.6, 0.95, 10),
    'num_leaves': range(256,512,4),
    'learning_rate': np.linspace(0.05,0.95,10),
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'early_stopping_rounds': [50],
    'n_jobs':[4]
}

# Early stopping is monitored on the held-out validation split. The previous
# eval_set was the full data set, which contains every CV fold's test rows and
# so leaked them into the stopping decision.
eval_set = [(X_b_valid, y_b_valid)]

# Instantiate the classifier: lgbm_random
lgbm_random = lgb(random_state=21,objective='multiclass')

# Configure the randomized search (weighted F1). It is seeded so the sampled
# candidates are reproducible.
lgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=lgbm_random,
                                verbose=20, scoring="f1_weighted", n_iter=100, n_jobs=4,
                                random_state=21)


# Run the search on the training split only, keeping the validation split
# disjoint from every CV fold.
lgb_random.fit(X_b_train, y_b_train, eval_metric="error", eval_set=eval_set, verbose=False)

# Print the best parameters and the best cross-validated weighted F1
print("Best parameters found: ", lgb_random.best_params_)
print("Best f1_weighted found: ", lgb_random.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   25.1s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   31.8s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   36.0s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   54.7s
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:   57.6s
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:  2.2min
[Parallel(

[1]	training's multi_error: 0.306811	training's multi_logloss: 1.06697
Training until validation scores don't improve for 50 rounds.
[2]	training's multi_error: 0.0760832	training's multi_logloss: 1.02106
[3]	training's multi_error: 0.072992	training's multi_logloss: 0.987693
[4]	training's multi_error: 0.0882798	training's multi_logloss: 0.971018
[5]	training's multi_error: 0.0741402	training's multi_logloss: 0.917535
[6]	training's multi_error: 0.0823652	training's multi_logloss: 0.895282
[7]	training's multi_error: 0.0853652	training's multi_logloss: 0.867389
[8]	training's multi_error: 0.0870148	training's multi_logloss: 0.842183
[9]	training's multi_error: 0.086727	training's multi_logloss: 0.822562
[10]	training's multi_error: 0.0840717	training's multi_logloss: 0.780707
[11]	training's multi_error: 0.0746699	training's multi_logloss: 0.741901
[12]	training's multi_error: 0.0735103	training's multi_logloss: 0.705852
[13]	training's multi_error: 0.0734364	training's multi_logloss:

In [None]:
# Final breed_category model, configured from the recorded search result
# below (values rounded by hand).
#Best parameters found:  {'subsample': 0.8, 'reg_lambda': 1.8666666666666667, 'reg_alpha': 2.1333333333333333, 'num_leaves': 288, 'n_jobs': 4, 'n_estimators': 1536, 'min_data_in_leaf': 5, 'min_child_weight': 50.0, 'max_depth': 3, 'learning_rate': 0.05, 'early_stopping_rounds': 50, 'colsample_bytree': 0.6, 'class_weight': 'balanced', 'boosting_type': 'gbdt', 'bagging_freq': 6, 'bagging_fraction': 0.7555555555555555}
#Best f1_weighted found:  0.9045974535906183

# The objective is now spelled 'multiclass' (it was 'mutlticlass'), so the
# request no longer depends on how the installed LightGBM version treats an
# unrecognised objective string.
model=lgb(objective='multiclass',max_bin=64,early_stopping_rounds=100,subsample = 0.8, 
          reg_lambda = 1.86, reg_alpha = 2.13, num_leaves = 288, n_jobs = 4, 
          n_estimators = 1536,min_data_in_leaf = 5,
          max_depth = 3, learning_rate = 0.05, colsample_bytree = 0.6, 
          boosting_type = 'gbdt',class_weight='balanced',random_state=21)

# NOTE(review): the evaluation set is the training data itself, so
# early_stopping_rounds has no held-out signal here. Confirm whether a
# validation split was intended for the final fit.
eval_set = [(X_b, y_b)]
model.fit(X_b, y_b,verbose=False,eval_metric="error", eval_set=eval_set)

# Predicted breed_category labels for the test set.
predic_breed = model.predict(X_b_test)


Found `early_stopping_rounds` in params. Will use it instead of argument



In [None]:
# Display the breed model's test-set predictions (notebook output only).
predic_breed

array([1., 0., 0., ..., 1., 2., 1.])

In [None]:

# Randomized hyperparameter search for the pet_category classifier.

# Candidate values sampled by the search. Single-element lists pin a setting
# for every candidate. (A stray double comma after 'min_data_in_leaf' made
# this dict a syntax error; it has been removed.)
gbm_param_grid = {
    'boosting_type':['gbdt'],
    'n_estimators': range(512,1024,128),
    'max_depth': range(3,6),
    'class_weight' : ['balanced'],
    'reg_alpha': np.linspace(0.0,1, 10),
    'reg_lambda': np.linspace(0.0,1, 10),
    'min_data_in_leaf': [20,30,40,50],
    'num_leaves': range(256,512,4),
    'learning_rate': np.linspace(0.05,0.95,10),
    'subsample':[0.8,0.9,1],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1],
    'early_stopping_rounds': [100],
    'n_jobs':[4]
}

# Early stopping is monitored on the held-out validation split. The previous
# eval_set was the full data set, which contains every CV fold's test rows and
# so leaked them into the stopping decision.
eval_set = [(X_p_valid, y_p_valid)]

# Instantiate the classifier: lgbm_random
lgbm_random = lgb(random_state=21,objective='multiclass')

# Configure the randomized search (weighted F1). It is seeded so the sampled
# candidates are reproducible.
lgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid, estimator=lgbm_random,
                                verbose=20, scoring="f1_weighted", n_iter=5, n_jobs=4,
                                random_state=21)


# Run the search on the training split only, keeping the validation split
# disjoint from every CV fold.
lgb_random.fit(X_p_train, y_p_train, eval_metric="error", eval_set=eval_set, verbose=False)

# Print the best parameters and the best cross-validated weighted F1
print("Best parameters found: ", lgb_random.best_params_)
print("Best f1_weighted found: ", lgb_random.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   32.8s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   41.1s
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:   47.9s
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed:   55.2s
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed:  1.4min
[Parallel(n_jobs=4)]: Done   8 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done   9 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done  11 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  12 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  13 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done  14 tasks      | elapsed:  2.5min
[Parallel(n_jobs=4)]: Done  15 tasks      | elapsed:  2.5min
[Parallel(

Best parameters found:  {'subsample': 0.9, 'reg_lambda': 0.2222222222222222, 'reg_alpha': 0.8888888888888888, 'num_leaves': 316, 'n_jobs': 4, 'n_estimators': 640, 'min_data_in_leaf': 10, 'min_child_weight': 44.0, 'max_depth': 4, 'learning_rate': 0.65, 'early_stopping_rounds': 100, 'colsample_bytree': 0.7, 'class_weight': 'balanced', 'boosting_type': 'gbdt', 'bagging_freq': 3, 'bagging_fraction': 0.95}
Best f1_weighted found:  0.8827601869157908


In [None]:
 
# Final pet_category model, fitted on all training rows with fixed
# hyperparameters. The objective is now spelled 'multiclass' (it was
# 'mutlticlass'), so the request no longer depends on how the installed
# LightGBM version treats an unrecognised objective string.
model=lgb(objective='multiclass',max_bin=64,subsample = 1, reg_lambda = 0.4, reg_alpha = 0.6,
          num_leaves = 210, n_jobs = 4, n_estimators = 792, min_data_in_leaf = 20,
          max_depth = 5, learning_rate = 0.05, colsample_bytree = 0.6,
          boosting_type = 'gbdt',random_state=21)
model.fit(X_p, y_p,verbose=False)

# Predicted pet_category labels for the test set.
predic_pet = model.predict(X_p_test)


In [None]:
# Force the pet predictions into a flat vector with one entry per test row,
# then display them.
predic_pet = np.reshape(predic_pet, (predic_pet.shape[0],))
predic_pet

array([2, 1, 2, ..., 2, 4, 2])

In [None]:
# Assemble the submission table: one row per test pet with both predicted
# labels. Write it without the index column.
submission_columns = {
    'pet_id': test_data['pet_id'],
    'breed_category': predic_breed,
    'pet_category': predic_pet,
}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission_62.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
