# Cat boost model 

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from catboost import CatBoostRegressor
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

In [77]:
data = pd.read_csv('data/train.csv')
display(data.head())

# parent species column is categorical, so we need to convert it to numerical
data['parentspecies'] = data['parentspecies'].astype('category')
data['parentspecies'] = data['parentspecies'].cat.codes

# make psat_pa log base 10
data['pSat_Pa'] = np.log10(data['pSat_Pa'])

# set ID as index
data.set_index('Id', inplace=True)

#set target and features
target = data['pSat_Pa']
features = data.drop(['pSat_Pa'], axis=1)

# There are no missing values or outliers in this data set to clean
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.01, random_state=42)

# model to bead
best_model = None
best_rmse = 10
best_r2 = 0

data

Unnamed: 0,Id,MW,NumOfAtoms,NumOfC,NumOfO,NumOfN,NumHBondDonors,NumOfConf,NumOfConfUsed,parentspecies,...,ether..alicyclic.,nitrate,nitro,aromatic.hydroxyl,carbonylperoxynitrate,peroxide,hydroperoxide,carbonylperoxyacid,nitroester,pSat_Pa
0,0,30.010565,4,1,1,0,0,1,1,apin_decane_toluene,...,0,0,0,0,0,0,0,0,0,641974.491
1,1,74.995643,6,1,3,1,0,1,1,toluene,...,0,0,1,0,0,0,0,0,0,10295.712
2,2,102.990558,8,2,4,1,0,2,2,toluene,...,0,0,1,0,0,0,0,0,0,13517.575
3,3,118.985472,9,2,5,1,1,3,1,toluene,...,0,0,1,0,0,0,0,0,0,241.63
4,4,134.980387,10,2,6,1,1,3,3,toluene,...,0,0,1,0,0,0,0,1,0,315.357


Unnamed: 0_level_0,MW,NumOfAtoms,NumOfC,NumOfO,NumOfN,NumHBondDonors,NumOfConf,NumOfConfUsed,parentspecies,C.C..non.aromatic.,...,ether..alicyclic.,nitrate,nitro,aromatic.hydroxyl,carbonylperoxynitrate,peroxide,hydroperoxide,carbonylperoxyacid,nitroester,pSat_Pa
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,30.010565,4,1,1,0,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,5.807518
1,74.995643,6,1,3,1,0,1,1,6,0,...,0,0,1,0,0,0,0,0,0,4.012656
2,102.990558,8,2,4,1,0,2,2,6,0,...,0,0,1,0,0,0,0,0,0,4.130899
3,118.985472,9,2,5,1,1,3,1,6,0,...,0,0,1,0,0,0,0,0,0,2.383151
4,134.980387,10,2,6,1,1,3,3,6,0,...,0,0,1,0,0,0,0,1,0,2.498802
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166370,300.007724,28,6,12,2,2,72,11,6,0,...,0,2,0,0,0,1,0,1,0,-3.584787
166376,331.997553,30,6,14,2,3,197,25,6,0,...,0,1,0,0,1,1,1,0,0,-4.551207
166385,295.090331,37,10,9,1,3,302,40,0,0,...,0,1,0,0,0,0,2,0,0,-6.746709
166409,303.007389,29,6,13,1,4,283,6,6,0,...,0,0,0,0,1,1,2,0,0,-4.456638


In [78]:
#catboostregressor

# first we check that there is no obvious underfitting or overfitting before tuning any other parameters.

model = CatBoostRegressor(
    iterations=10000,  # Large number of iterations
    learning_rate=0.01,
    depth=10,
    random_state=42,
    od_type='Iter',
    eval_metric='RMSE',  # Evaluation metric for validation set
    use_best_model=True  # Use the best model encountered during training
)

In [89]:
# with the last model, we found overfitting, so we will decrease the learning rate

model = CatBoostRegressor(
    iterations=501,  # Large number of iterations
    depth=10,
    random_state=42,
    od_type='Iter',
    eval_metric='RMSE',  # Evaluation metric for validation set
    use_best_model=True  # Use the best model encountered during training
)


In [91]:
# Initialize and fit CatBoost Regressor
model.fit(X_train, y_train, 
          eval_set=(X_test, y_test),
          verbose=100
          )

# Predict and evaluate
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

print('RMSE: {:.3f}'.format(rmse))
print('R2: {:.3f}'.format(r2))
print('learning rate: {:.3f}'.format(model.get_all_params()['learning_rate']))

Learning rate set to 0.020956
0:	learn: 2.1490041	test: 2.2405892	best: 2.2405892 (0)	total: 31.1ms	remaining: 5m 10s
100:	learn: 1.2066396	test: 1.1893931	best: 1.1893931 (100)	total: 2.13s	remaining: 3m 29s
200:	learn: 1.1059280	test: 1.0996476	best: 1.0996476 (200)	total: 4.32s	remaining: 3m 30s
300:	learn: 1.0672509	test: 1.0751441	best: 1.0751441 (300)	total: 6.43s	remaining: 3m 27s
400:	learn: 1.0435910	test: 1.0653506	best: 1.0653506 (400)	total: 8.65s	remaining: 3m 27s
500:	learn: 1.0215840	test: 1.0586080	best: 1.0586080 (500)	total: 10.8s	remaining: 3m 24s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.058607961
bestIteration = 500

Shrink model to first 501 iterations.
RMSE: 1.059
R2: 0.782
learning rate: 0.021


In [75]:

# Get feature importances
feature_importances = model.get_feature_importance()

# Create a DataFrame of feature importances
features = pd.DataFrame({'feature': X_train.columns, 'importance': feature_importances})

# Sort features by importance
features.sort_values(by='importance', ascending=True, inplace=True)

# Display the features with their importance scores
print(features)

n_top_features = 20  # Example: keep the top 20 features
top_features = features.tail(n_top_features)['feature']

# Select only the top features for your model
X_train_reduced = X_train[top_features]
X_test_reduced = X_test[top_features]

# Initialize and fit CatBoost Regressor
model = CatBoostRegressor(iterations=5000, learning_rate=0.01, depth=10)
model.fit(X_train_reduced, y_train, eval_set=(X_test_reduced, y_test), verbose=100)


                         feature  importance
19             aromatic.hydroxyl    0.012264
10  C.C.C.O.in.non.aromatic.ring    0.051744
24                    nitroester    0.116342
18                         nitro    0.529609
17                       nitrate    0.617236
16             ether..alicyclic.    0.923404
15                         ester    1.123752
4                         NumOfN    1.220203
0                             MW    1.241600
22                 hydroperoxide    1.454435
3                         NumOfO    1.761840
12                      aldehyde    1.781642
11              hydroxyl..alkyl.    1.872020
9             C.C..non.aromatic.    2.040696
23            carbonylperoxyacid    2.153926
1                     NumOfAtoms    2.273758
8                  parentspecies    2.385977
21                      peroxide    2.453619
13                        ketone    3.227774
14               carboxylic.acid    4.446176
7                  NumOfConfUsed    4.850992
20        

KeyboardInterrupt: 

In [65]:
# Predict and evaluate
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)




RMSE: 1.074
R2: 0.776


In [92]:
test_data = pd.read_csv('data/test.csv')

# parentspecies column is categorical, so we need to convert it to numerical
test_data['parentspecies'] = test_data['parentspecies'].astype('category')
test_data['parentspecies'] = test_data['parentspecies'].cat.codes

# remove Id column
test_data_noID = test_data.drop(['Id'], axis=1)

preds = model.predict(test_data_noID)

# create submission file
submission = pd.DataFrame({'Id': test_data['Id'], 'target': preds})

submission.to_csv('submission.csv', index=False)