# Imports

In [None]:
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets, linear_model
from scipy.linalg import svd
import sklearn.cluster as cluster
import plotly.express as px
import plotly.graph_objects as go

np.random.seed(1234)

In [None]:
# Silence every warning category; this repeats the filter already installed in
# the imports cell, so it only matters if that cell was skipped.
warnings.filterwarnings("ignore")
# None removes pandas' cap on how many columns are rendered for a DataFrame.
pd.set_option("display.max_columns", None)
# Disabled: would turn on grid lines for matplotlib axes. NOTE(review): `mpl`
# is not imported in the visible cells -- import matplotlib as mpl first.
# mpl.rcParams.update({"axes.grid": True})

# 7.1 Linear Regression

Regression - requires a real-valued outcome

Simple example - model someone's weight given their age
More complex - use their age, height, and country of origin

What isn't regression?
Predicting whether a person is male or female given data about them? That is not regression, because the output is categorical rather than real-valued.

Simple Regression
- Just one predictor variable, e.g. given gross square footage, what is sale price of house?
- Do a scatter plot of sale price vs. living area

Simple Linear Regression
- Model assumes outcome vs. feature is linear

What is lowess?
- Looks like piecewise-linear
- More difficult to explain though

What is a model?
- Idealized representation of a system
- *All models are wrong, but some are useful*

Linear models
- Models are linear when predictions are linear combinations of variables

### Data Load - California Housing Data Set

In [None]:
# Load the housing table from a local CSV. The scatter plot in the next
# section reads its "GrLivArea" and "SalePrice" columns, so the file must
# provide both.
# The two commented-out lines are an alternative loader for scikit-learn's
# California housing data. NOTE(review): that loader is not expected to supply
# the two columns named above -- confirm before re-enabling it.
# housing = datasets.fetch_california_housing()
# df = pd.DataFrame(data=housing.data, columns=housing.feature_names)
df = pd.read_csv("./data/housing.csv")
display(df)

### Scatter with Lowess Trendline

In [None]:
# Sale price against living area, with a LOWESS smoother (a locally fitted
# curve rather than one straight line) drawn in black over the points.
px.scatter(
    df,
    x="GrLivArea",
    y="SalePrice",
    trendline="lowess",
    trendline_color_override="black",
)

# 7.2 Tips Dataset

Each row is a table that was served by a waiter. Sex is the sex of whoever paid, not the sex of the waiter.  
Going to concentrate on total bill

In [None]:
# Load seaborn's "tips" example dataset; cache=False tells seaborn not to go
# through its local dataset cache for this load.
data = sns.load_dataset("tips", cache=False)
# Echo the first rows as the cell output.
data.head()

In [None]:
# Tip against total bill, with an ordinary-least-squares trendline in black.
px.scatter(
    data,
    x="total_bill",
    y="tip",
    trendline="ols",
    trendline_color_override="black",
)

# 7.3 Creating a Plotly and Scikit-Learn Model

Scikit Process
- Instantiate linear model object
- Fit model
- Use the object to make predictions

### Create Model

In [None]:
# Double brackets keep the single predictor as a 2-D DataFrame (one column),
# which is the feature-matrix shape passed to fit(); the target is a 1-D Series.
features = data[["total_bill"]]
target = data["tip"]
# fit_intercept=False: no intercept term is estimated, so the fitted line is
# forced through the origin (tip = theta * total_bill).
f = linear_model.LinearRegression(fit_intercept=False)
f.fit(features, target)

### Predictions

In [None]:
# Store the one-feature model's output as a new "prediction" column on the
# same frame, so later cells can compare it with "tip" row by row.
data["prediction"] = f.predict(features)
data.head()

### Plot

In [None]:
# Observed tips drawn as points; model output drawn as a line over the same
# total_bill values.
actual_trace = go.Scatter(
    x=data["total_bill"],
    y=data["tip"],
    mode="markers",
    name="actual",
)
prediction_trace = go.Scatter(
    x=data["total_bill"],
    y=data["prediction"],
    mode="lines",
    name="prediction",
)

fig = go.Figure()
for trace in (actual_trace, prediction_trace):
    fig.add_trace(trace)
# Last expression of the cell: update_layout returns the figure, which the
# notebook then renders.
fig.update_layout(font_size=20)

### Model Params


In [None]:
# Fitted slope array and intercept of the one-feature model. The model was
# created with fit_intercept=False, so scikit-learn reports the intercept as 0.0.
display([f.coef_, f.intercept_])

### Regression with Plotly

In [139]:
# Let Plotly fit and draw the regression line itself (in red).
# NOTE(review): Plotly's "ols" trendline includes an intercept by default, so
# this line is not expected to coincide exactly with the through-origin
# scikit-learn fit from the previous section -- confirm against the Plotly docs.
px.scatter(
    data,
    x="total_bill",
    y="tip",
    trendline="ols",
    trendline_color_override="red",
)

# 7.4 Defining Loss Functions

Loss Function
- Numerically computes the badness of a model
- Tells us how wrong it is
- Most common is L2 (squared loss), as in L2 norm

Definition of L2 Loss Function: $$L(y,\hat{y}) = (y - \hat{y})^2$$

Example:
- y = 5.15
- yhat = 1.04
- loss = (5.15 - 1.04)^2 = 16.89

Mean Square Error (MSE)

Compute the average of the L2 loss over the entire dataset: the mean squared error is just the average of all instances of L(y, yhat)

In [None]:
(5.15 - 1.04) ** 2

# 7.5 Computing L2


In [None]:
# Row-wise squared error between the observed tip and the stored prediction;
# its mean is the MSE of the one-feature model.
residual = data["tip"] - data["prediction"]
l2_loss = residual**2
l2_loss.mean()

In [None]:
from sklearn.metrics import mean_squared_error

# Same quantity as the manual computation in the previous cell, obtained from
# scikit-learn: the mean of the squared differences between the two columns.
mean_squared_error(data["tip"], data["prediction"])

# 7.6 Optimizing L2

Different choice of slope variable theta --> different predictions --> different L2

In [None]:
# MSE of a hand-picked slope: predict every tip as 0.2 times the total bill.
mean_squared_error(data["tip"], data["total_bill"] * 0.2)

In [None]:
def mse_given_theta(theta: float):
    """Return the MSE of the through-origin model ``tip = theta * total_bill``.

    Reads the notebook-level ``data`` frame, so the result reflects whatever
    ``data`` holds at call time.
    """
    scaled_bill = data["total_bill"] * theta
    return mean_squared_error(data["tip"], scaled_bill)

In [None]:
mse_given_theta(0.2)

## Demo of Range of Objective Variable Evaluations vs. MSE

In [None]:
# Evaluate the MSE at 100 evenly spaced slopes between 0.1 and 0.2 and plot
# the resulting curve.
thetas = np.linspace(0.1, 0.2, 100)
mses = list(map(mse_given_theta, thetas))
fig = px.line(x=thetas, y=mses)
fig.update_layout(
    xaxis_title="theta",
    yaxis_title="MSE",
    font_size=20,
)

# 7.7 Optimize L2 with Scipy

In [None]:
import scipy.optimize

In [None]:
def g(x):
    """Evaluate the cubic ``x**3 + x**2 - 3*x + 2`` at ``x``.

    Only arithmetic operators are applied to ``x``, so a NumPy array is
    evaluated element-wise just like a scalar.
    """
    cubic_term = x**3
    square_term = x**2
    linear_term = 3 * x
    return cubic_term + square_term - linear_term + 2

In [None]:
# Numerically minimize g starting from x = 10.
# NOTE(review): g has a local minimum where 3x**2 + 2x - 3 = 0 on the right
# branch (x ~ 0.72); the search is expected to settle there -- confirm from the
# returned result.
scipy.optimize.minimize(g, x0=10)

In [None]:
# Plot g at 100 evenly spaced points on [-3, 2]; g is applied to the whole
# array at once.
x = np.linspace(-3, 2, 100)
px.line(x=x, y=g(x))

In [None]:
# Minimize the MSE over the slope numerically, starting from theta = 0.2;
# `.x` picks the minimizing argument out of the optimizer's result object.
scipy.optimize.minimize(mse_given_theta, x0=0.2).x

In [None]:
# Minimizer can fail!
# g is a cubic with a positive leading coefficient, so it has no global
# minimum: g(x) -> -infinity as x -> -infinity. At the start point x0 = -3 the
# slope g'(x) = 3x**2 + 2x - 3 equals 18 (positive), so moving downhill means
# moving further left, away from the local minimum on the right.
scipy.optimize.minimize(g, x0=-3)

# 7.8 Absolute and Huber Loss

L1 loss = |y - yhat|

MAE = mean absolute error = mean of L1 loss over data set

Choice of loss function matters! For example if you minimize L2 you don't necessarily minimize L1, and vice versa

L2 penalizes outliers more heavily than L1. Because the L2 loss is more affected by outliers, the solution that minimizes it is also influenced more heavily by outliers

In [None]:
4.11**2

Showing plot of MAE vs. MSE over the tips data set

![alt text](./data/MAE_vs_MSE.png "MAE vs. MSE")

Note that MAE is piecewise linear; one consequence is that its derivative is discontinuous, whereas the derivative of MSE is continuous. That means MAE is not as well suited to numerical methods that rely on derivatives

## Huber Loss

A third loss function mentioned is the Huber loss function.  This is notable for its resistance to extreme values and is defined as a piecewise function:


$${\displaystyle L_{\delta }(y,f(x))={\begin{cases}{\frac {1}{2}}(y-f(x))^{2}&{\textrm {for}}|y-f(x)|\leq \delta ,\\\delta \,(|y-f(x)|-{\frac {1}{2}}\delta ),&{\textrm {otherwise.}}\end{cases}}}$$



# 7.9 Multiple Linear Regression



## Define a 2D fitting object

In [None]:
# Two predictors now: bill amount and party size. This rebinds `features`,
# which earlier held only the "total_bill" column.
features = data[["total_bill", "size"]]
tip = data["tip"]
# As in the one-feature model, no intercept term is fitted.
f2 = linear_model.LinearRegression(fit_intercept=False)
f2.fit(features, tip)

## Look at its Coefs

In [None]:
f2.coef_

## Make a Prediction

In [None]:
# Predict the tip for one table: total bill 10, party size 3.
# f2 was fitted on a DataFrame, so the query is passed with the same column
# names. A bare nested list relies on positional order only and makes
# scikit-learn emit a feature-name warning (hidden here by the global filter).
query = pd.DataFrame({"total_bill": [10], "size": [3]})
f2.predict(query)

## Evaluate Coefs on All Features

In [None]:
# Store the two-feature model's output for every row in a new
# "prediction_2d" column, alongside the observed tips.
data["prediction_2d"] = f2.predict(features)

In [None]:
data

## Compare MSE of 1 and 2D Fits

In [None]:
# [MSE of the one-feature fit, MSE of the two-feature fit], both measured
# against the observed "tip" column.
[
    mean_squared_error(data["tip"], data["prediction"]),
    mean_squared_error(data["tip"], data["prediction_2d"]),
]

## 3d Scatter Plot

In [None]:
px.scatter_3d(data, x="total_bill", y="size", z="tip")

## 3d Scatter Plot with Plane from Model Fit

In [None]:
# Build the (bill, size) grid from the data itself so the fitted plane spans
# every observation instead of stopping at hard-coded limits.
# int() truncates the largest bill, so "+ 2" makes the last grid line land at
# or beyond it; party sizes are whole numbers, so "+ 1" is enough there.
bill_axis = range(int(data["total_bill"].max()) + 2)
size_axis = range(int(data["size"].max()) + 1)
(table_bills, table_sizes) = np.meshgrid(bill_axis, size_axis)
# Include the intercept so the surface stays correct if the model is ever
# refitted with fit_intercept=True (it is 0.0 for a fit made without one).
predictions_mesh = (
    f2.intercept_ + f2.coef_[0] * table_bills + f2.coef_[1] * table_sizes
)

In [None]:
# Observations as a 3-D point cloud: bill and party size on the floor axes,
# tip on the vertical axis.
observed_points = go.Scatter3d(
    x=data["total_bill"],
    y=data["size"],
    z=data["tip"],
    mode="markers",
    name="actual",
)
# Model output evaluated on the precomputed (bill, size) mesh, drawn as a
# surface.
fitted_plane = go.Surface(
    x=table_bills,
    y=table_sizes,
    z=predictions_mesh,
    name="predicted",
)

fig = go.Figure()
fig.add_trace(observed_points)
fig.add_trace(fitted_plane)
fig.show()

# 7.10 Non-Numeric Features

Example: suppose a feature is days of the week - how to handle this?
- Naive approach is to map each day to an integer, such as Monday = 0, Tuesday = 1, etc..
    - This is a bad idea because it implies a distance between say Monday and Thursday (3) that does not exist
    - The days are all equally "far apart" from each other - not in time, but in merit - so no numeric spacing between them is meaningful
- Better approach - one-hot encoding
    - For K unique values, create K new features
    - Called *dummies* in the pandas lingo

## 1-Hot Example Image

![alt text](./data/One_Hot_Encoding.png "1-Hot Encoding")

## Implementing 1-Hot with Pandas

In [125]:
# Rebind `data` to a newly loaded tips table. Columns added to the earlier
# frame ("prediction", "prediction_2d") are not part of this new object.
data = sns.load_dataset("tips")
data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [126]:
# Keep only the columns used in this example: the two numeric features
# ("total_bill", "size") plus the categorical "day" column that will be
# one-hot encoded.
three_features = ["total_bill", "size", "day"]
three_features_data = data[three_features]
three_features_data

Unnamed: 0,total_bill,size,day
0,16.99,2,Sun
1,10.34,3,Sun
2,21.01,3,Sun
3,23.68,2,Sun
4,24.59,4,Sun
...,...,...,...
239,29.03,3,Sat
240,27.18,2,Sat
241,22.67,2,Sat
242,17.82,2,Sat


In [128]:
# One indicator column per distinct day, appended to the right of the
# existing columns; the original "day" column is dropped once its indicators
# are in place.
dummies = pd.get_dummies(three_features_data["day"])
with_indicators = pd.concat([three_features_data, dummies], axis=1)
data_w_dummies = with_indicators.drop(columns=["day"])
data_w_dummies

Unnamed: 0,total_bill,size,Thur,Fri,Sat,Sun
0,16.99,2,False,False,False,True
1,10.34,3,False,False,False,True
2,21.01,3,False,False,False,True
3,23.68,2,False,False,False,True
4,24.59,4,False,False,False,True
...,...,...,...,...,...,...
239,29.03,3,False,False,True,False
240,27.18,2,False,False,True,False
241,22.67,2,False,False,True,False
242,17.82,2,False,False,True,False


In [131]:
# Regress the tip on every column of data_w_dummies (bill, party size and the
# day indicators) without an intercept term.
tip = data["tip"]
f_with_day = linear_model.LinearRegression(fit_intercept=False)
f_with_day.fit(data_w_dummies, tip)
# One coefficient per column, in the column order of data_w_dummies.
f_with_day.coef_

array([0.09299361, 0.18713231, 0.66829361, 0.74578683, 0.62112858,
       0.73228865])

## Depiction of Model

![alt text](./data/six_dims_model.png "Six-Dimensional Model")

## Example Scenario Evaluation Using the 6-D Model

### Using the coefs by hand

In [132]:
# Example scenario: a party of 3 with a total bill of 50.0 on a Thursday.
party_size = 3
total_bill = 50.0
thurs = 1
# Coefficients copied from the fit above and rounded to three decimals, in
# column order: 0.093 -> total_bill, 0.187 -> size, 0.668 -> Thur indicator.
# The remaining day indicators are 0 for a Thursday table, so their terms
# are omitted.
tip_prediction = 0.093 * total_bill + 0.187 * party_size + 0.668 * thurs
tip_prediction

5.8790000000000004

### Using predict function from the fit

In [138]:
# Thursday scenario evaluated by the fitted model. The value is not identical
# to the hand calculation because the coefficients used there were rounded.
# NOTE(review): this result is neither assigned nor printed; when the cell is
# run in a notebook only the final expression below is echoed, so the
# Thursday value is never shown.
f_with_day.predict([[total_bill, party_size, thurs, 0, 0, 0]])[0]

# Check a Saturday: same bill and party size, with the Sat indicator (fifth
# position) set to 1 instead of the Thur indicator.
f_with_day.predict([[total_bill, party_size, 0, 0, 1, 0]])[0]

5.832206051380103

## Single Model per Day

Note: an alternative approach is to create a separate model for each day

### Demo it With Plotly

In [141]:
# color="day" splits the points into one group per day; combined with
# trendline="ols", Plotly draws a separately fitted line for each group.
px.scatter(
    data,
    x="total_bill",
    y="tip",
    trendline="ols",
    color="day",
)

# Mini Lesson 7.3 Categorical Data Types: Ordinal, Nominal 

Categorical data can be either nominal or ordinal
- Nominal
    - Labeled data with no inherent ordering, for example male/female  
- Ordinal
    - Has a clear ordering to it, for example customer satisfaction data ("very satisfied", "somewhat satisfied", "not satisfied")

In [143]:
import plotly.express as px
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings("ignore")