In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder

# --- Configuration -----------------------------------------------------------
# Everything a reader is likely to tune lives here.
DATA_PATH = 'project2/penguins.csv'   # raw input CSV; edit to match your file location
CLEAN_PATH = 'penguins_clean.csv'     # where the NA-free copy is written
TARGET = 'body_mass_g'                # response variable of the regression
EXCLUDED_FEATURES = ['island', 'sex']  # columns deliberately left out of the model
TEST_SIZE = 0.3                       # 30% test / 70% train
RANDOM_STATE = 0                      # fixed seed so the split is reproducible

# --- Load and clean ----------------------------------------------------------
data = pd.read_csv(DATA_PATH)

# Drop every row that has a missing value in any column.
# NOTE(review): this also discards rows whose only missing value is in a column
# that is excluded from the model below -- confirm that is intended.
penguins_clean = data.dropna()

# index=False keeps the DataFrame row index out of the file. Without it the
# index is written as an unnamed first column, which shows up as a spurious
# numeric "Unnamed: 0" column when the CSV is read back.
penguins_clean.to_csv(CLEAN_PATH, index=False)

# --- Training prep -----------------------------------------------------------
# Predictors: every remaining column except the target and the excluded ones.
X = penguins_clean.drop(columns=[TARGET, *EXCLUDED_FEATURES])
y = penguins_clean[TARGET]

# One-hot encode the non-numeric predictor columns.
# NOTE(review): every level of a categorical column gets its own indicator, so
# together with the fitted intercept the indicators are perfectly collinear.
# The individual indicator coefficients printed below are therefore not
# uniquely determined; consider drop_first=True if they are to be interpreted.
X = pd.get_dummies(X)

# Hold out part of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# --- Training ----------------------------------------------------------------
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# --- Fitted parameters -------------------------------------------------------
intercept = regressor.intercept_
coefficients = regressor.coef_

print(f"Intercept (beta_0): {intercept}")
print("Coefficients (beta_1, beta_2, ..., beta_p):")
# coef_ follows the column order of the training matrix, which is X's order.
for feature, coef in zip(X.columns, coefficients):
    print(f"{feature}: {coef}")

# --- Evaluation on the held-out set ------------------------------------------
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MSE: {mse}")
print(f"R-squared (R2): {r2}")


Intercept (beta_0): -4019.226873849506
Coefficients (beta_1, beta_2, ..., beta_p):
bill_length_mm: 44.71586096597852
bill_depth_mm: 128.80762092658472
flipper_length_mm: 19.682746969347136
species_Adelie: -137.64532528285824
species_Chinstrap: -649.164486080333
species_Gentoo: 786.8098113631904
MSE: 80219.8670848893
R-squared (R2): 0.8814019970610809
