# Polynomial regression

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## Why Polynomial Regression?

In [None]:
# Motivation for polynomial regression: build a small synthetic dataset first.
# 20 sorted x values, and a cubic in x with Gaussian noise added on top.
np.random.seed(0)
x = np.sort(2 - 3 * np.random.normal(0, 1, 20))
noise = np.random.normal(-3, 3, 20)
y = x - 2 * (x ** 2) + 0.5 * (x ** 3) + noise

plt.scatter(x, y, s=10)
plt.show()

In [None]:
# Baseline: fit a plain straight line to the dataset.
from sklearn.linear_model import LinearRegression

# Give both arrays a second axis so each sample is a row with one column.
x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(x, y)
y_pred = model.predict(x)

# Data as points, fitted line in red.
plt.scatter(x, y, s=10)
plt.plot(x, y_pred, color='r')
plt.show()

In [None]:
# Pull the single slope and intercept out of the fitted model before printing.
slope = model.coef_[0][0]
intercept = model.intercept_[0]
print(f"Funkcija premice: \t y = {slope} x + {intercept}")

## What is Polynomial Regression?

Splošna enačba polinoma je: 
$$y = w_1 x + w_2 x^2 + w_3 x^3 + ... + w_{n-1} x^{n-1} + b$$
Tukaj lahko spreminjamo **stopnjo polinoma**, koeficiente `w` in `b`.

**Stopnja polinoma** nam pove, kolikšna je **najvišja potenca**, na katero potenciramo naš `x`. Z višanjem stopnje polinoma višamo kompleksnost modela.

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Toy input: the integers 0..4 arranged as a single-column matrix.
X = np.arange(5).reshape(-1, 1)

# Expand that column into polynomial terms up to degree 2.
poly_features = PolynomialFeatures(degree=2)
poly_features.fit(X)
X_POLY = poly_features.transform(X)

# Display the expanded matrix.
X_POLY

---

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion of x, with the bias column switched off.
polynomial_features = PolynomialFeatures(include_bias=False, degree=2)
polynomial_features.fit(x)
x_poly = polynomial_features.transform(x)

In [None]:
# First four rows of the expanded features. x_poly was built with
# include_bias=False; the default include_bias=True would add a feature
# that is constantly 1.
x_poly[:4, :]

---

In [None]:
# Linear regression on the expanded features; predictions are made on the training inputs.
model = LinearRegression().fit(x_poly, y)
y_poly_pred = model.predict(x_poly)

In [None]:
# Draw on the current axes: data as points, polynomial fit in magenta.
ax = plt.gca()
ax.scatter(x, y, s=10)
ax.plot(x, y_poly_pred, color='m')
plt.show()

In [None]:
# Two coefficients (x and x^2) plus the intercept are printed, so the label
# says "polynomial" instead of the "line" wording used for the straight-line fit.
print(f"Funkcija polinoma: \t y = {model.coef_[0][0]} x + {model.coef_[0][1]} x^2 + {model.intercept_[0]}")

---

In [None]:
from sklearn.preprocessing import PolynomialFeatures


def fit_polynomial(x, y, degree):
    """Fit a linear regression on polynomial features of ``x`` and predict on ``x``.

    Parameters
    ----------
    x : array-like
        Inputs handed to ``PolynomialFeatures.fit_transform``.
    y : array-like
        Targets handed to ``LinearRegression.fit``.
    degree : int
        Highest power generated by ``PolynomialFeatures``; the bias column is
        left out (``include_bias=False``).

    Returns
    -------
    tuple
        ``(fitted_model, predictions)`` where ``predictions`` are the model's
        outputs for the same ``x`` it was fitted on.
    """
    features = PolynomialFeatures(degree=degree, include_bias=False)
    x_expanded = features.fit_transform(x)
    fitted = LinearRegression().fit(x_expanded, y)
    return fitted, fitted.predict(x_expanded)


plt.scatter(x, y, s=10, label="Data")

# One curve per degree; after the loop `model` holds the degree-3 fit,
# just as it did when the three fits were written out one after another.
for degree, label in ((1, "1st deg"), (2, "2nd deg"), (3, "3rd deg")):
    model, y_deg_pred = fit_polynomial(x, y, degree)
    plt.plot(x, y_deg_pred, label=label)

plt.legend()
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Same pipeline with a very high degree (20) — presumably to show what an
# overly flexible model does on this small dataset.
poly_features = PolynomialFeatures(include_bias=False, degree=20)
x_poly = poly_features.fit_transform(x)
model = LinearRegression().fit(x_poly, y)
y_poly_pred = model.predict(x_poly)

# Data points and the fitted curve on the current axes, with a legend.
ax = plt.gca()
ax.scatter(x, y, s=10, label="Data")
ax.plot(x, y_poly_pred, label="20th deg")
ax.legend()
plt.show()

----

## Polynomial Regression with Multiple Features

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Two non-negative features (100 samples each, as column vectors) and a target
# that is quadratic in x_1, linear in x_2, plus Gaussian noise.
np.random.seed(1)
x_1 = np.abs(np.random.randn(100, 1) * 10)
x_2 = np.abs(np.random.randn(100, 1) * 30)
noise = np.random.randn(100, 1) * 20
y = 2 * x_1 ** 2 + 3 * x_2 + 2 + noise

In [None]:
# One panel per feature: scatter the feature against y and title the panel.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
panels = zip(axes, (x_1, x_2), ("x_1 plotted", "x_2 plotted"))
for ax, feature, title in panels:
    ax.scatter(feature, y)
    ax.set_title(title)
plt.show()

In [None]:
# Flatten each (100, 1) column vector to length 100 and collect all three in one table.
flat_columns = {
    name: values.reshape(100,)
    for name, values in (("x_1", x_1), ("x_2", x_2), ("y", y))
}
df = pd.DataFrame(flat_columns, index=range(0, 100))
df

In [None]:
# Separate the two input columns from the target column.
X = df[["x_1", "x_2"]]
y = df["y"]

# Degree-2 expansion of both features, without the bias column.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X)

# Show the expanded matrix with hand-written LaTeX labels for the five columns.
expanded_columns = [r"$x_1$", r"$x_2$", r"$x_1^2$", r"$x_1 x_2$", r"$x_2^2$"]
pd.DataFrame(poly_features, columns=expanded_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
split = train_test_split(
    poly_features,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split

In [None]:
# Fit on the training split, then predict the held-out rows.
poly_reg_model = LinearRegression().fit(X_train, y_train)
poly_reg_y_predicted = poly_reg_model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

# Root of the mean squared error between held-out targets and predictions.
poly_reg_mse = mean_squared_error(y_test, poly_reg_y_predicted)
poly_reg_rmse = np.sqrt(poly_reg_mse)
poly_reg_rmse

---

## Boston Housing

```
The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
 prices and the demand for clean air', J. Environ. Economics & Management,
 vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
 ...', Wiley, 1980.   N.B. Various transformations are used in the table on
 pages 244-261 of the latter.

 Variables in order:
 CRIM     per capita crime rate by town
 ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
 INDUS    proportion of non-retail business acres per town
 CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
 NOX      nitric oxides concentration (parts per 10 million)
 RM       average number of rooms per dwelling
 AGE      proportion of owner-occupied units built prior to 1940
 DIS      weighted distances to five Boston employment centres
 RAD      index of accessibility to radial highways
 TAX      full-value property-tax rate per $10,000
 PTRATIO  pupil-teacher ratio by town
 B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
 LSTAT    % lower status of the population
 MEDV     Median value of owner-occupied homes in $1000's
```


Napovedovali bomo **MEDV**.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# The separator is a regular expression, so it is written as a raw string:
# "\s" inside a normal string literal is an invalid escape sequence that
# Python warns about.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# The indexing treats every record as two consecutive rows of raw_df:
# all columns of the even rows plus the first two columns of the odd rows
# become the features, and the third column of the odd rows is the target
# (presumably MEDV, the last variable in the list — confirm against the file).
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Rescale data: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit the degree-2 expansion on the scaled training data, then apply the same
# transformer to both splits.
poly = PolynomialFeatures(degree=2)
poly.fit(X_train_scaled)
X_train_poly, X_test_poly = (
    poly.transform(part) for part in (X_train_scaled, X_test_scaled)
)

# Report the feature-matrix shape before and after the expansion.
print(
    f"X_train.shape: {X_train.shape}",
    f"X_train_poly.shape: {X_train_poly.shape}",
    sep="\n",
)

In [None]:
# Fit on the expanded training features, then predict the expanded test features.
poly_reg_model = LinearRegression().fit(X_train_poly, y_train)
poly_reg_y_predicted = poly_reg_model.predict(X_test_poly)

In [None]:
from sklearn.metrics import mean_squared_error

# Root of the mean squared error between test targets and predictions.
poly_reg_mse = mean_squared_error(y_test, poly_reg_y_predicted)
poly_reg_rmse = np.sqrt(poly_reg_mse)
poly_reg_rmse

## Disadvantages of polynomial regression