## Load Data (With Cleaning)

In [2]:
from gdsc_cleaning import load_and_clean_gdsc
# Columns that are kept out of the feature matrix: the same list is passed to
# the loader here and reused when X is built in the regression cell.
# NOTE(review): what the loader itself does with `excluded_columns` and
# `drop_first` is defined in gdsc_cleaning, not here -- confirm there.
excluded_columns = [
    'LN_IC50',
    'AUC',
    'Z_SCORE',
    'DRUG_ID',
    'COSMIC_ID',
    'DRUG_NAME',
    'CELL_LINE_NAME',
]
df = load_and_clean_gdsc(drop_first=True, excluded_columns=excluded_columns)

## Regression with Packages

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression as SKLinearReg
from sklearn.tree import DecisionTreeRegressor as SKTreeReg
from sklearn.ensemble import RandomForestRegressor as SKForestReg
from sklearn.ensemble import GradientBoostingRegressor as SKGBReg

In [None]:
# Experiment configuration: the column to predict, the hold-out fraction, and
# the single seed shared by the split and every stochastic model below.
target = 'LN_IC50'
test_size = 0.2
random_state = 42


# Define X and y
y = df[target]
# Always remove the target together with the excluded columns, so the label can
# never end up in X even if `target` is later changed to a column that is not
# listed in `excluded_columns`.
non_feature_columns = set(excluded_columns) | {target}
X = df.drop(columns=[c for c in df.columns if c in non_feature_columns])
assert target not in X.columns, "target column leaked into the feature matrix"

# Train/test split (seeded, so the same rows are held out on every run)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
)
assert len(X_train) + len(X_test) == len(X), "split lost or duplicated rows"

# Initialize models
# The ensembles are deliberately tiny (5 estimators); the same settings are
# used for the from-scratch models so the two sets of scores are comparable.
models = {
    "Linear Regression": SKLinearReg(),
    "Decision Tree": SKTreeReg(random_state=random_state),
    "Random Forest": SKForestReg(n_estimators=5, random_state=random_state),
    "Gradient Boosting": SKGBReg(n_estimators=5, learning_rate=0.1, random_state=random_state),
}
# Maps model name -> {"MSE": ..., "R2": ...} measured on the held-out split.
results = {}

# Fit each model on the training split and score it on the test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    results[name] = {"MSE": mse, "R2": r2}
    print(f"{name} -- MSE: {mse:.4f}, R2: {r2:.4f}")


Linear Regression -- MSE: 2.1262, R2: 0.6944
Decision Tree -- MSE: 2.2038, R2: 0.6832


## Regression from Scratch

In [7]:
from scratch_regression import LinearRegressionScratch, RandomForestRegressorScratch, GradientBoostingRegressorScratch

In [None]:
# Initialize scratch models
# Hyperparameters match the scikit-learn ensembles (5 estimators, learning rate
# 0.1). Every seeded model uses the shared `random_state`, so one variable
# controls reproducibility and the from-scratch gradient boosting is compared
# with its scikit-learn counterpart under the same seed (it was previously
# hard-coded to 0).
scratch_models = {
    "LinearRegressionScratch": LinearRegressionScratch(),
    "RandomForestScratch": RandomForestRegressorScratch(n_estimators=5, random_state=random_state),
    "GradientBoostingRegressorScratch": GradientBoostingRegressorScratch(
        n_estimators=5, learning_rate=0.1, random_state=random_state
    ),
}

# Maps model name -> {"MSE": ..., "R2": ...} measured on the held-out split.
scratch_results = {}

# Same protocol as the scikit-learn baselines: fit on the training split,
# score on the test split, record and print both metrics.
for name, model in scratch_models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)
    scratch_results[name] = {"MSE": mse, "R2": r2}
    print(f"{name} -- MSE: {mse:.4f}, R2: {r2:.4f}")

LinearRegressionScratch -- MSE: 2.1255, R2: 0.6945
RandomForestScratch -- MSE: 2.1752, R2: 0.6873
GradientBoostingRegressorScratch -- MSE: 5.7666, R2: 0.1710
