In [None]:
### Step 0: Setup ----

# Uncomment and adjust to install libraries not available on Google Colab
# !pip install packagename

# Load any libraries used for this project
import pandas as pd # data frame to store the data

from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score # for evaluation metrics

import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting pairs plot

import numpy as np # many mathematical operations and more, used for sqrt()

from mlxtend.feature_selection import SequentialFeatureSelector as SFS

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
# -------------------------------------------------------------------
# Helper function for VIF
# -------------------------------------------------------------------

def vif_table(model):
    """Build a table of variance inflation factors (VIFs) for a fitted model.

    Parameters
    ----------
    model : fitted model object
        Must expose ``model.model.exog`` (the design matrix) and
        ``model.model.exog_names`` (the names of its columns).

    Returns
    -------
    pandas.DataFrame
        One row per design-matrix column, with the columns
        ``variable`` and ``VIF``.
    """
    design_matrix = model.model.exog
    column_names = model.model.exog_names

    # Compute the VIF of each design-matrix column in turn
    vif_values = []
    for column_index in range(design_matrix.shape[1]):
        vif_values.append(variance_inflation_factor(design_matrix, column_index))

    return pd.DataFrame({"variable": column_names, "VIF": vif_values})


In [None]:
# specific Google Colab code to load csv data
# The google.colab package only exists on Google Colab, so the import is
# guarded: on any other Jupyter kernel the upload step is skipped instead of
# stopping the notebook with a ModuleNotFoundError.
try:
    from google.colab import files  # so we can upload a CSV to our notebook
except ImportError:
    # Not on Colab: nothing to upload; the next cell reads the CSV from the
    # current working directory
    uploaded = {}
    print("Not running on Google Colab: skipping upload. "
          "Place PalmerPenguins.csv in the working directory.")
else:
    uploaded = files.upload() # opens a file chooser to select our csv file


In [None]:
# Read the project's CSV file into a pandas data frame
penguins = pd.read_csv(filepath_or_buffer='PalmerPenguins.csv')

# Ending the cell with the data frame displays a preview of it
penguins


In [None]:
### Step 1: Pre-processing ----

# MODIFY THE CODE BELOW TO CREATE CATEGORICAL VARIABLES FOR SPECIES AND ISLAND
# KEEP THE CATEGORICAL VARIABLE FOR SEX

# Columns to treat as categorical and turn into dummy variables
categorical_columns = ['sex']  # ADD COLUMN NAMES HERE...

# Only encode columns that are still present. After the first run 'sex' has
# been replaced by 'sex_male', so this keeps the cell safe to re-run instead
# of failing with a KeyError.
columns_to_encode = [col for col in categorical_columns if col in penguins.columns]

# Drop rows with missing values for simplicity (we will learn to impute later)
# dropna() returns a new data frame rather than modifying the loaded one in place
penguins = penguins.dropna()

if columns_to_encode:
    # Create categorical variable(s)
    penguins = penguins.astype({col: 'category' for col in columns_to_encode})

    # Create dummy variables
    # pd.get_dummies() creates boolean variables by default (e.g. sex_male);
    # we need them to be numeric, so we ask for integer (0/1) dummies directly
    penguins = pd.get_dummies(
        penguins,
        columns=columns_to_encode,
        drop_first=True,
        dtype=int,
    )


penguins


In [None]:
### Step 2: Split data into train and test sets ----
# Hold out 30% of the observations as the test set; the fixed random seed
# makes the split reproducible from run to run
train_data, test_data = train_test_split(penguins, test_size=0.3, random_state=90210)

In [None]:
### Step 3: Exploratory data analysis ----

# calculate basic summary statistics
# note that this only describes numeric variables
# display() is required here: a notebook cell only shows the value of its
# LAST expression, so a bare train_data.describe() in the middle of the cell
# would be computed and then silently discarded.
# (display is provided automatically by the Jupyter / Colab kernel)
display(train_data.describe())

# Visualizations
sns.pairplot(train_data, diag_kind='kde')
plt.show()



In [None]:
### Step 4: Feature engineering ----

# ADD YOUR CREATED CATEGORICAL VARIABLES TO THE ARRAY OF PREDICTORS BELOW

predictors = [
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'sex_male',
    # ADD VARIABLE NAMES HERE...
    ]

# separate X and Y variables
# .copy() gives each piece its own data, so later changes to it neither
# touch train_data / test_data nor trigger pandas chained-assignment warnings
X_train = train_data[predictors].copy()
y_train = train_data['body_mass_g'].copy()

X_test = test_data[predictors].copy()
y_test = test_data['body_mass_g'].copy()  # copied like the other three pieces

In [None]:
### Step 5: Feature and Model selection ----

# FIT A LINEAR REGRESSION MODEL REGRESSING body_mass_g ON YOUR PREDICTORS





In [None]:
# -------------------------------------------------------------------
# Diagnostic plots
# -------------------------------------------------------------------

# CHECK FOR:
# MULTICOLLINEARITY
# HETEROSKEDASTICITY
# NON-NORMALITY OF THE ERROR TERM



# For linear regression, if you are doing explanation, check:
# multicollinearity using VIF
# heteroskedasticity with plot of residuals vs. fitted values and Breusch-Pagan test
# normality of error term with QQ plot
# (non-normality can indicate non-linear relationship or other pathologies)
# outliers/influential observations





In [None]:
# Note on high VIF: If your goal is explanation, you need to address
# multicollinearity by dropping collinear variables.
# (See pairs plot above for hints on correlation)
# If your goal is prediction, then you do not need to worry about
# multicollinearity.

In [None]:
### Step 6: Predictions and final model evaluation ----

# Predict on your test set to accurately measure out-of-sample performance

# CALCULATE THE ROOT MEAN SQUARE ERROR FOR YOUR TEST PREDICTIONS
# CREATE A SCATTER PLOT OF THE PREDICTED VALUES VS. ACTUAL VALUES IN THE TEST SET