In [10]:
import pandas as pd
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline

# Read the provided train and test tables (relative paths, resolved against the working directory)
train_given, test_given = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Target is the 'log_pSat_Pa' column; the features are every remaining column except 'ID'
y_train = train_given['log_pSat_Pa']
X_train = train_given.drop(columns=['ID', 'log_pSat_Pa'])
X_test = test_given.drop(columns=['ID'])
print(X_train)



               MW  NumOfAtoms  NumOfC  NumOfO  NumOfN  NumHBondDonors  \
0      224.016832          23       6       9       0               4   
1      310.064845          35       9      10       2               1   
2      368.033938          37      10      13       2               1   
3      299.012475          29       7      12       1               4   
4      202.011353          20       7       7       0               1   
...           ...         ...     ...     ...     ...             ...   
26632  221.017166          22       6       8       1               1   
26633  222.001182          21       6       9       0               3   
26634  287.012475          28       6      12       1               4   
26635  284.996825          26       6      12       1               3   
26636  267.022645          27       7      10       1               2   

       NumOfConf  NumOfConfUsed parentspecies  C=C (non-aromatic)  ...  ester  \
0          485.0           40.0       tolu

In [11]:
# Handle categorical columns: object-dtype columns are treated as categorical,
# and missing entries are replaced by the explicit label 'unknown'.
cat_col = X_train.select_dtypes(include=['object']).columns
X_train[cat_col] = X_train[cat_col].fillna('unknown')
X_test[cat_col] = X_test[cat_col].fillna('unknown')

# Encode categorical features as dense one-hot columns.
# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old spelling was removed in 1.4,
# so try the current name first and fall back for older installations.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)
# The encoder is fitted on the training categories only; with
# handle_unknown='ignore', categories first seen in the test set encode as all zeros.
X_train_enc = enc.fit_transform(X_train[cat_col])
X_test_enc = enc.transform(X_test[cat_col])

# Handle numeric features: everything that is not categorical, with missing values set to 0
X_train_numeric = X_train.drop(columns=cat_col).fillna(0)
X_test_numeric = X_test.drop(columns=cat_col).fillna(0)

# Combine encoded and numeric features (numeric columns first, one-hot columns last)
X_train_preproc = np.hstack((X_train_numeric.values, X_train_enc))
X_test_preproc = np.hstack((X_test_numeric.values, X_test_enc))

# Fail early if the two matrices ended up with different widths, instead of
# surfacing the mismatch later at predict time.
assert X_train_preproc.shape[1] == X_test_preproc.shape[1], \
    "train and test feature matrices have different numbers of columns"

# Split training data into train and validation sets (80/20, fixed seed for repeatability)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_preproc, y_train, test_size=0.2, random_state=42
)

# Baseline: DummyRegressor with the 'mean' strategy, fitted on the training split
dummy_model = DummyRegressor(strategy='mean').fit(X_train_split, y_train_split)
y_val_pred_dummy = dummy_model.predict(X_val_split)

# Hold-out metrics on the validation split
r2_dummy = r2_score(y_true=y_val_split, y_pred=y_val_pred_dummy)
mse_dummy = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred_dummy)

# 5-fold cross-validated R² on the full preprocessed training matrix
cv_scores_dummy = cross_val_score(
    estimator=dummy_model, X=X_train_preproc, y=y_train, scoring='r2', cv=5
)

print(
    f"R² score of Dummy model: {r2_dummy:.4f}",
    f"MSE of Dummy model: {mse_dummy:.4f}",
    f"Cross-Validation R² scores for Dummy model: {cv_scores_dummy}",
    sep="\n",
)


R² score of Dummy model: -0.0014
MSE of Dummy model: 9.7658
Cross-Validation R² scores for Dummy model: [-2.69698919e-04 -3.37360710e-04 -2.34260205e-06 -2.00823926e-04
 -1.97508581e-04]




In [12]:

# Ordinary least squares: fit on the training split, predict the validation split
ols_model = LinearRegression().fit(X_train_split, y_train_split)
y_val_pred_ols = ols_model.predict(X_val_split)

# Hold-out metrics on the validation split
r2_ols = r2_score(y_true=y_val_split, y_pred=y_val_pred_ols)
mse_ols = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred_ols)

# 5-fold cross-validated R² on the full preprocessed training matrix
cv_scores_ols = cross_val_score(
    estimator=ols_model, X=X_train_preproc, y=y_train, scoring='r2', cv=5
)

print(
    f"R² score of OLS model: {r2_ols:.4f}",
    f"MSE of OLS model: {mse_ols:.4f}",
    f"Cross-Validation R² scores for OLS model: {cv_scores_ols}",
    sep="\n",
)


R² score of OLS model: 0.7032
MSE of OLS model: 2.8941
Cross-Validation R² scores for OLS model: [0.71787076 0.69696504 0.70634604 0.69692332 0.71127702]


In [13]:

# Support Vector Regression: a pipeline that standardizes the features and then
# fits an RBF-kernel SVR, so the scaler is re-fitted on whatever data the pipeline is fitted on
svr_model = make_pipeline(
    StandardScaler(),
    SVR(kernel='rbf', C=1.0, epsilon=0.1),
).fit(X_train_split, y_train_split)
y_val_pred_svr = svr_model.predict(X_val_split)

# Hold-out metrics on the validation split
r2_svr = r2_score(y_true=y_val_split, y_pred=y_val_pred_svr)
mse_svr = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred_svr)

# 5-fold cross-validated R² on the full preprocessed training matrix
cv_scores_svr = cross_val_score(
    estimator=svr_model, X=X_train_preproc, y=y_train, scoring='r2', cv=5
)

print(
    f"R² score of SVR model: {r2_svr:.4f}",
    f"MSE of SVR model: {mse_svr:.4f}",
    f"Cross-Validation R² scores for SVR model: {cv_scores_svr}",
    sep="\n",
)


R² score of SVR model: 0.7401
MSE of SVR model: 2.5348
Cross-Validation R² scores for SVR model: [0.75511792 0.73485884 0.74398503 0.73103913 0.75049404]


In [15]:


# Ridge regression (alpha=1.0): fit on the training split, predict the validation split.
# NOTE(review): the model is given the preprocessed matrix directly, with no scaling step
# in this cell; the penalty is scale-sensitive, so consider standardizing first - confirm intent.
ridge_model = Ridge(alpha=1.0).fit(X_train_split, y_train_split)
y_val_pred_ridge = ridge_model.predict(X_val_split)

# Hold-out metrics on the validation split
r2_ridge = r2_score(y_true=y_val_split, y_pred=y_val_pred_ridge)
mse_ridge = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred_ridge)

# 5-fold cross-validated R² on the full preprocessed training matrix
cv_scores_ridge = cross_val_score(
    estimator=ridge_model, X=X_train_preproc, y=y_train, scoring='r2', cv=5
)

print(
    f"R² score of Ridge model: {r2_ridge:.4f}",
    f"MSE of Ridge model: {mse_ridge:.4f}",
    f"Cross-Validation R² scores for Ridge model: {cv_scores_ridge}",
    sep="\n",
)


R² score of Ridge model: 0.7033
MSE of Ridge model: 2.8939
Cross-Validation R² scores for Ridge model: [0.71786362 0.69697693 0.70641042 0.69691826 0.71129616]


In [16]:

# Lasso regression (alpha=0.01): fit on the training split, predict the validation split.
# NOTE(review): the model is given the preprocessed matrix directly, with no scaling step
# in this cell; the penalty is scale-sensitive, so consider standardizing first - confirm intent.
lasso_model = Lasso(alpha=0.01).fit(X_train_split, y_train_split)
y_val_pred_lasso = lasso_model.predict(X_val_split)

# Hold-out metrics on the validation split
r2_lasso = r2_score(y_true=y_val_split, y_pred=y_val_pred_lasso)
mse_lasso = mean_squared_error(y_true=y_val_split, y_pred=y_val_pred_lasso)

# 5-fold cross-validated R² on the full preprocessed training matrix
cv_scores_lasso = cross_val_score(
    estimator=lasso_model, X=X_train_preproc, y=y_train, scoring='r2', cv=5
)

print(
    f"R² score of Lasso model: {r2_lasso:.4f}",
    f"MSE of Lasso model: {mse_lasso:.4f}",
    f"Cross-Validation R² scores for Lasso model: {cv_scores_lasso}",
    sep="\n",
)


R² score of Lasso model: 0.7007
MSE of Lasso model: 2.9186
Cross-Validation R² scores for Lasso model: [0.71609473 0.69538063 0.70379822 0.69384497 0.70970758]
