# Linear Models
- Simple Linear Regression
- Multiple Linear Regression
- Ridge Regression
- Lasso Regression

## Load Packages and Data

In [1]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.model_selection import GridSearchCV, RepeatedKFold, validation_curve
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm import tqdm


# this extension properly formats a cell after it is run
# !{sys.executable} -m pip install nb_black # UNCOMMENT TO INSTALL
%load_ext nb_black
%matplotlib inline

# Set the maximum number of rows to 200
pd.set_option("display.max_rows", 200)

# Set the maximum number of columns to 200
pd.set_option("display.max_columns", 200)

np.random.seed(42)

<IPython.core.display.Javascript object>

In [2]:
# Load the Ames, Iowa housing dataset
data = pd.read_csv("../data/housing_corr.csv")
data.shape

(2558, 65)

<IPython.core.display.Javascript object>

## Simple Linear Regression

#### Using TotalSF as Predictor

In [9]:
X = data["TotalSF"].values.reshape(-1, 1)
y = data["SalePrice"].values.reshape(-1, 1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Define the linear regression model
lm = LinearRegression()

# Fit Model
lm.fit(X_train, y_train)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

# Calculate the R-squared score using cross-validation
scores = cross_val_score(lm, X_test, y_test, cv=cv, scoring="r2")

# Print the average R-squared score across all cross-validation folds
print(f"Average R-squared score: {scores.mean()}")

Average R-squared score: 0.6409440488428209


<IPython.core.display.Javascript object>

#### Using OverallQual as Predictor

In [10]:
X = data["OverallQual"].values.reshape(-1, 1)
y = data["SalePrice"].values.reshape(-1, 1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Define the linear regression model
lm = LinearRegression()

# Fit Model
lm.fit(X_train, y_train)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

# Calculate the R-squared score using cross-validation
scores = cross_val_score(lm, X_test, y_test, cv=cv, scoring="r2")

# Print the average R-squared score across all cross-validation folds
print(f"Average R-squared score: {scores.mean()}")

Average R-squared score: 0.6004552005801451


<IPython.core.display.Javascript object>

## Multiple Linear Regression using OverallQual & TotalSF

In [None]:
TESTING

asdf

testing testing testing Testinga

In [None]:
X = data['OverallQual', 'TotalSF'].values.reshape(-1, 1)
y = data["SalePrice"].values.reshape(-1, 1)


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Define the linear regression model
lm = LinearRegression()

# Fit Model
lm.fit(X_train, y_train)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=42)

# Calculate the R-squared score using cross-validation
scores = cross_val_score(lm, X_test, y_test, cv=cv, scoring="r2")

# Print the average R-squared score across all cross-validation folds
print(f"Average R-squared score: {scores.mean()}")

In [None]:
# Extract the independent and dependent variables from both dataframes
X_train = train_data[['OverallQual', 'TotalSF']]
y_train = train_data['SalePrice'].values

X_test = test_data[['OverallQual', 'TotalSF']]
y_test = test_data['SalePrice'].values

# Create a LinearRegression model and fit it on the training data
model = LinearRegression()
model.fit(X_train, y_train)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Perform cross-validation and calculate the R-squared score on the test data
scores = cross_val_score(model, X_test, y_test, scoring='r2', cv=cv)

# Print the mean R-squared score across all cross-validation folds
print('Mean R-squared score:', scores.mean())

## Dummify categorical Values

In [None]:
# Identify non-numeric features
non_numeric_columns = data.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Apply one-hot encoding to each non-numeric feature
data = pd.get_dummies(data, columns=non_numeric_columns)

# Preview the transformed dataset
data.head()

## Ridge Regression with all 749 Columns...

In [None]:
# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define the predictor and response variables
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data['SalePrice'].values


X_test = test_data.drop('SalePrice', axis=1)
y_test = test_data['SalePrice'].values

# Define the linear regression model
ridge = Ridge(alpha=1.0)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Calculate the R-squared score using cross-validation
scores = cross_val_score(ridge, X_test, y_test, cv=cv, scoring='r2')

# Print the average R-squared score across all cross-validation folds
print(f'Average R-squared score: {scores.mean()}')

## Lasso Regression with all 749 Columns...

In [None]:
from sklearn.linear_model import Lasso

# Split the data into training and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define the predictor and response variables
X_train = train_data.drop('SalePrice', axis=1)
y_train = train_data['SalePrice'].values


X_test = test_data.drop('SalePrice', axis=1)
y_test = test_data['SalePrice'].values

# Define the linear regression model
lasso = Lasso(alpha=1.0)

# Define the cross-validation scheme
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=42)

# Calculate the R-squared score using cross-validation
scores = cross_val_score(lasso, X_test, y_test, cv=cv, scoring='r2')

# Print the average R-squared score across all cross-validation folds
print(f'Average R-squared score: {scores.mean()}')