In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Load the dataset from the project-relative CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/dataset.csv')

In [None]:
# Separate the regression target (column 'MW') from the feature columns:
# y is the target series, X is every other column of df.
y = df['MW']
X = df.drop(columns='MW')

In [None]:
# Hold out 20% of the rows as the test set. shuffle=False keeps the original
# row order, so the split is a head/tail cut rather than a random sample.
# NOTE(review): per the scikit-learn docs random_state only controls
# shuffling, so it has no effect here; it is kept so the call is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
    random_state=42,
)
print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both splits so test-set statistics never enter the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Candidate regressors keyed by display name. The key is reused later as the
# "Model" column of the results table and (with spaces replaced) as the
# pickle file name, and the dict order is the evaluation order.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=0.1),
    "Random Forest": RandomForestRegressor(
        n_estimators=100,
        max_depth=None,
        max_features='log2',
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
    ),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        subsample=0.8,
        random_state=42,
    ),
    "XGBoost": XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        colsample_bytree=0.8,
        random_state=42,
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        num_leaves=31,
        random_state=42,
    ),
    # SVR is intentionally left disabled (reason not recorded in this notebook):
    # "SVR": SVR(C=10.0, gamma='scale', kernel='rbf'),
    "KNN": KNeighborsRegressor(
        n_neighbors=7,
        weights='distance',
        algorithm='auto',
        leaf_size=30,
        p=2,
    ),
    "Decision Tree": DecisionTreeRegressor(
        random_state=42,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        criterion='squared_error',
        splitter='best',
    ),
}

In [16]:
# Train every candidate model, score it with 3-fold cross-validation on the
# training split and on the held-out test split, pickle the fitted estimator
# under ./models, and collect one metrics row per model in `results`.

# Create the output directory up front so joblib.dump does not fail with
# FileNotFoundError when ./models does not exist yet (e.g. a fresh checkout).
os.makedirs('./models', exist_ok=True)

results = []

for name, model in models.items():
    print(f"Evaluating {name}...")
    model.fit(X_train_scaled, y_train)

    # The scorer returns negative MSE per fold, so negate before the square
    # root. This is the RMSE of the fold-averaged MSE, not the mean of the
    # per-fold RMSEs.
    # NOTE(review): an integer cv presumably yields contiguous, unshuffled
    # folds; if the rows are time-ordered (the train/test split uses
    # shuffle=False), TimeSeriesSplit may be more appropriate — confirm.
    cv_scores = cross_val_score(model, X_train_scaled, y_train, scoring="neg_mean_squared_error", cv=3, n_jobs=-1)
    cv_rmse = np.sqrt(-cv_scores.mean())

    # Hold-out metrics on the test split.
    y_pred = model.predict(X_test_scaled)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    # NOTE(review): the models are fit on scaled features but only the model
    # is saved here; anything loading these pickles needs the same `scaler`.
    model_filename = f'./models/{name.replace(" ", "_")}_model.pkl'
    joblib.dump(model, model_filename)
    print(f"Model saved as {model_filename}")

    results.append({"Model": name, "CV RMSE": cv_rmse, "Test RMSE": rmse, "R² Score": r2})

Evaluating Linear Regression...
Model saved as ./models/Linear_Regression_model.pkl
Evaluating Ridge Regression...
Model saved as ./models/Ridge_Regression_model.pkl
Evaluating Random Forest...
Model saved as ./models/Random_Forest_model.pkl
Evaluating Gradient Boosting...
Model saved as ./models/Gradient_Boosting_model.pkl
Evaluating XGBoost...
Model saved as ./models/XGBoost_model.pkl
Evaluating LightGBM...
Model saved as ./models/LightGBM_model.pkl
Evaluating KNN...
Model saved as ./models/KNN_model.pkl
Evaluating Decision Tree...
Model saved as ./models/Decision_Tree_model.pkl


In [17]:
# Turn the list of per-model metric dicts into a table (one row per model)
# and show it as the cell output.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,CV RMSE,Test RMSE,R² Score
0,Linear Regression,598.857813,829.129713,0.995925
1,Ridge Regression,598.856941,829.130038,0.995925
2,Random Forest,3598.173831,843.216644,0.995785
3,Gradient Boosting,4654.809351,418.667695,0.998961
4,XGBoost,3845.147595,631.855955,0.997633
5,LightGBM,3827.655982,580.862715,0.998
6,KNN,4164.137054,2236.596753,0.970348
7,Decision Tree,4682.667498,1281.55944,0.990265


In [None]:
# Write the comparison table to results.csv (relative to the current working
# directory), omitting the row index.
results_df.to_csv(path_or_buf='results.csv', index=False)