In [2]:

# Mount Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

# Required libs
import pandas as pd
import numpy as np
from itertools import product
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Dataset: read the startups CSV from the mounted Drive folder and one-hot
# encode its categorical columns (drop_first=True drops the first level of each).
dataset_path = "/content/gdrive/MyDrive/Colab Notebooks/Copy of 50_Startups.csv"
dataset = pd.get_dummies(pd.read_csv(dataset_path), drop_first=True)

# Independent & dependent variable declaration
feature_columns = [
    'R&D Spend',
    'Administration',
    'Marketing Spend',
    'State_Florida',
    'State_New York',
]
target_columns = ['Profit']
X = dataset[feature_columns]
y = dataset[target_columns]  # list selection keeps y as a one-column DataFrame

# Train/Test split: one third of the rows is held out; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, random_state=0
)

# MLR: multiple linear regression baseline, fitted on the unscaled training features
from sklearn.linear_model import LinearRegression

lr = LinearRegression().fit(X_train, y_train)
lr_test_predictions = lr.predict(X_test)
r2_lr = r2_score(y_test, lr_test_predictions)  # R² on the held-out split
print(f"\nMULTIPLE LINEAR REGRESSION  →  R² = {r2_lr:.4f}")

# SVM: support vector regression over a grid of C values and kernels
from sklearn.svm import SVR

# Standardise the features: the scaler is fitted on the training split and
# the same fitted scaler is then applied to the test split.
sc = StandardScaler()
X_train_svm = sc.fit_transform(X_train)
X_test_svm = sc.transform(X_test)

# Common C values and kernels shared by every fit in the grid
C_values = [10, 100, 500, 1000, 2000, 3000]
kernels = ['linear', 'rbf', 'poly', 'sigmoid']

# Flatten the one-column target frame to 1-D once, outside the loop.
y_train_flat = y_train.values.ravel()

# Fit one SVR per (C, kernel) pair and record its R² on the test split.
svm_results = []
for c_value, kernel_name in product(C_values, kernels):
    svr = SVR(C=c_value, kernel=kernel_name).fit(X_train_svm, y_train_flat)
    test_r2 = r2_score(y_test, svr.predict(X_test_svm))
    svm_results.append([c_value, kernel_name, test_r2])

svm_df = pd.DataFrame(svm_results, columns=["C", "Kernel", "R²"])
print("\nSUPPORT VECTOR MACHINE RESULTS")
# Rows are C values, columns are kernels, cells are R² rounded to 4 decimals.
svm_table = svm_df.pivot(index="C", columns="Kernel", values="R²").round(4)
print(svm_table)

# NOTE(review): the winning configuration is picked by its test-split R²,
# so the reported best score may be optimistic — confirm this is intended.
best_svm_index = svm_df['R²'].idxmax()
best_svm = svm_df.loc[best_svm_index]
print(f"Best SVM: C={best_svm.C}, Kernel={best_svm.Kernel}, R²={best_svm['R²']:.4f}")

# Decision tree: grid over split criterion, max_features rule and splitter
from sklearn.tree import DecisionTreeRegressor

criteria = ['squared_error', 'absolute_error', 'friedman_mse']
# NOTE(review): X_train has five columns, for which 'sqrt' and 'log2' likely
# resolve to the same number of candidate features — consider adding None
# (all features) if the grid is meant to compare different settings.
max_features = ['sqrt', 'log2']
splitters = ['best', 'random']

# Pass the target as a 1-D array, consistent with the SVR and random-forest
# fits in this notebook, and flatten it once instead of on every iteration.
y_train_1d = y_train.values.ravel()

# Fit one tree per (criterion, max_features, splitter) combination and record
# its R² on the test split; random_state=0 keeps the random choices repeatable.
dt_results = []
for crit, feat, split in product(criteria, max_features, splitters):
    reg = DecisionTreeRegressor(criterion=crit, splitter=split, max_features=feat, random_state=0)
    reg.fit(X_train, y_train_1d)
    r2 = r2_score(y_test, reg.predict(X_test))
    dt_results.append([crit, feat, split, r2])

dt_df = pd.DataFrame(dt_results, columns=['Criterion', 'Max_Features', 'Splitter', 'R²'])
print("\nDECISION TREE RESULTS")
# Rows are (criterion, max_features) pairs, columns are splitters.
print(dt_df.pivot_table(index=['Criterion', 'Max_Features'], columns='Splitter', values='R²').round(4))

# idxmax returns the first row holding the maximum, so ties go to the
# combination that appears earliest in the grid order.
best_dt = dt_df.loc[dt_df['R²'].idxmax()]
print(f"Best Decision Tree: {best_dt.to_dict()}")

# Random forest: grid over criterion, max_features rule and number of trees
from sklearn.ensemble import RandomForestRegressor

# Note: `criteria` and `max_features` rebind the names used by the
# decision-tree section above with this section's own grids.
criteria = ['squared_error', 'absolute_error']
max_features = ['sqrt', 'log2']
n_estimators = [10, 100]

# Flatten the one-column target frame to 1-D once, outside the loop.
rf_target = y_train.values.ravel()

# Fit one forest per combination and record its R² on the test split.
rf_results = []
for crit_name, feat_rule, tree_count in product(criteria, max_features, n_estimators):
    forest = RandomForestRegressor(
        criterion=crit_name,
        max_features=feat_rule,
        n_estimators=tree_count,
        random_state=0,
    )
    forest.fit(X_train, rf_target)
    forest_r2 = r2_score(y_test, forest.predict(X_test))
    rf_results.append([crit_name, feat_rule, tree_count, forest_r2])

rf_df = pd.DataFrame(rf_results, columns=['Criterion', 'Max_Features', 'N_Estimators', 'R²'])
print("\nRANDOM FOREST RESULTS")
# Rows are (criterion, max_features) pairs, columns are tree counts.
rf_table = rf_df.pivot_table(
    index=['Criterion', 'Max_Features'],
    columns='N_Estimators',
    values='R²',
).round(4)
print(rf_table)

# Row with the highest test-split R² (first one on ties).
best_rf_index = rf_df['R²'].idxmax()
best_rf = rf_df.loc[best_rf_index]
print(f"Best Random Forest: {best_rf.to_dict()}")

# SUMMARY OF ALL MODELS: one row per model family with its (best) test-split R²
summary_scores = {
    'Multiple Linear Regression': r2_lr,
    'SVM (Best)': best_svm['R²'],
    'Decision Tree (Best)': best_dt['R²'],
    'Random Forest (Best)': best_rf['R²'],
}
# dict preserves insertion order, so rows appear in the order listed above.
summary = pd.DataFrame(list(summary_scores.items()), columns=['Model', 'Best R²'])

print("\n FINAL SUMMARY")
print(summary)


Mounted at /content/gdrive

MULTIPLE LINEAR REGRESSION  →  R² = 0.8753

SUPPORT VECTOR MACHINE RESULTS
Kernel  linear    poly     rbf  sigmoid
C                                      
10     -0.1069 -0.1216 -0.1253  -0.1228
100     0.0430 -0.0842 -0.1208  -0.0964
500     0.5352  0.0718 -0.1015   0.0065
1000    0.7943  0.2113 -0.0800   0.1377
2000    0.8703  0.4921 -0.0306   0.3607
3000    0.8584  0.6603  0.0193   0.4987
Best SVM: C=2000, Kernel=linear, R²=0.8703

DECISION TREE RESULTS
Splitter                       best  random
Criterion      Max_Features                
absolute_error log2         -0.0575  -0.561
               sqrt         -0.0575  -0.561
friedman_mse   log2          0.7282  -0.951
               sqrt          0.7282  -0.951
squared_error  log2          0.7282  -0.951
               sqrt          0.7282  -0.951
Best Decision Tree: {'Criterion': 'squared_error', 'Max_Features': 'sqrt', 'Splitter': 'best', 'R²': 0.728249856755945}

4️⃣ RANDOM FOREST RESULTS
N_Estimators