In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this notebook presumably needs an older scikit-learn release — confirm the
# pinned version or switch to another data source.
dataset = load_boston()

In [3]:
# Input data (features) and output data (targets) of the loaded dataset.
x, y = dataset.data, dataset.target

In [4]:
# Keep the dataset's description text and its column labels at hand.
DESCR = dataset.DESCR
feature_names = dataset.feature_names

In [5]:
# Wrap the feature array in a labelled frame and preview its first five rows.
df = pd.DataFrame(data=x, columns=feature_names)
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


#### Einfache Lineare Regression

#### Bedeutung
  
- $R^2$: Wie viel Streuung kann von dem Regressionsmodell erklärt werden
- coef: Steigung der Geraden
- intercept: y-Achsenabschnitt
  
#### Symbole

- $\bar{x}$: Mittelwert von $x$
- $\bar{y}$: Mittelwert von $y$
- $\hat{y}$: Vorhersage des Modells

#### Datensatz

- $m$: Anzahl an Samples
- $n$: Anzahl an Features
- $x$: Input-Daten (Features)
- $y$: Output-Daten (Targets)

#### Variablen

- $x \in \mathbb{R}^{m,n}$
- $y \in \mathbb{R}^{m}$
- coef, $\bar{x} \in \mathbb{R}^{n}$
- intercept, $\bar{y} \in \mathbb{R}$

#### Formeln

- $\beta = (X^TX)^{-1}X^Ty$, wobei $X \in \mathbb{R}^{m,n+1}$ die Matrix $x$ mit einer vorangestellten Spalte aus Einsen ist
- coef = $\beta$\[1:\]
- intercept = $\beta$\[0\]
- $\hat{y} = X\beta$
- $R^2 = 1 - \frac{\sum_{i=1}^m(y_i-\hat{y}_i)^2}{\sum_{i=1}^m(y_i-\bar{y})^2}$

In [9]:
class LinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit`` has been called:
        coef_: slope(s) of the fitted model, one entry per feature.
        intercept_: y-axis intercept of the fitted model.
    """

    def __init__(self):
        self.coef_ = None  # slope(s); set by fit()
        self.intercept_ = None  # y-axis intercept; set by fit()

    def _add_intercept(self, x):
        """Return ``x`` with a leading column of ones (the intercept column)."""
        x = np.asarray(x)
        intercepts = np.ones(shape=(x.shape[0]))
        # column_stack turns the 1-D ones vector into the first column of a 2-D array.
        return np.column_stack((intercepts, x))

    def fit(self, x: np.ndarray, y: np.ndarray):
        """Estimate intercept and coefficients by least squares.

        Args:
            x: Training inputs, one row per sample.
            y: Training targets, one entry (or row) per sample.

        The solution is obtained with ``np.linalg.lstsq`` rather than by
        explicitly inverting ``X^T X``: the explicit inverse raises
        ``LinAlgError`` for collinear or constant features and is numerically
        less stable, while ``lstsq`` returns the minimum-norm solution.
        """
        x = self._add_intercept(x)
        y = np.asarray(y, dtype=float)
        beta, *_ = np.linalg.lstsq(x, y, rcond=None)
        # Kept from the original implementation so existing notebook output is unchanged.
        print(f"Beta: {beta}")
        self.intercept_ = beta[0]
        self.coef_ = beta[1:]

    def predict(self, x: np.ndarray):
        """Return the model predictions ``x @ coef_ + intercept_``.

        Args:
            x: Inputs with one row per sample (2-D). Other ranks fall back to
                the original per-element evaluation so that existing callers
                passing a flat sequence of scalars get the same result shape.

        Returns:
            Array of predictions, one entry (or row) per sample.
        """
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            # Legacy path: evaluate y = m*x + b element by element.
            return np.array(
                [np.dot(self.coef_.T, xi) + self.intercept_ for xi in x]
            )
        return x @ self.coef_ + self.intercept_

    def score(self, x: np.ndarray, y: np.ndarray):
        """Return the coefficient of determination R^2 on ``(x, y)``.

        R^2 = 1 - sum((y - y_pred)^2) / sum((y - mean(y))^2)
        """
        y = np.asarray(y, dtype=float)
        # Align shapes so the subtraction below never broadcasts (m,) against (m, 1).
        y_pred = np.asarray(self.predict(x), dtype=float).reshape(y.shape)
        y_mean = np.mean(y, axis=0)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - y_mean) ** 2)
        return 1.0 - ss_res / ss_tot

### Data split

In [10]:
SEED = 42
np.random.seed(SEED)  # kept for any code that relies on NumPy's global RNG

print(dataset.data.shape)  # 506 rows with 13 columns (features)
# Column index 5 is the 6th feature ("RM"); the 5:6 slice keeps x two-dimensional.
x = dataset.data[:, 5:6]
print(x.shape)
y = dataset.target

# Split: 30% test, 70% train. Passing random_state makes the split independent
# of whatever has consumed the global RNG before this call.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=SEED
)
print(x_train.shape)
print(x_test.shape)

(506, 13)
(506, 1)
(354, 1)
(152, 1)


In [11]:
# Fit on the training split, then evaluate R^2 on the held-out test split.
regr = LinearRegression()
regr.fit(x_train, y_train)
r2_score = regr.score(x_test, y_test)

# Report the fitted parameters and the score, one per line.
for label, value in (
    ("Coef", regr.coef_),
    ("Intercept", regr.intercept_),
    ("R2-Score", r2_score),
):
    print(f"{label}: {value}")

Beta: [-34.66230744   9.1181022 ]
Coef: [9.1181022]
Intercept: -34.66230743840655
R2-Score: 0.45846499343030656


### Visualization

In [None]:
def plot_regressor(regr, x_test, y_test):
    """Plot the fitted regression line on top of the given samples.

    Intended for a model fitted on a single feature.

    Args:
        regr: Fitted regressor exposing ``predict``.
        x_test: Inputs to scatter, one row per sample with a single feature.
        y_test: Targets belonging to ``x_test``.
    """
    x_test = np.asarray(x_test)
    # Use the data passed in (not a notebook global) and extend the line one
    # unit beyond it on both sides.
    x1 = np.min(x_test) - 1
    x2 = np.max(x_test) + 1
    # One row per sample, one column for the single feature.
    line_x = np.array([[x1], [x2]])
    y1, y2 = np.ravel(regr.predict(line_x))
    plt.plot((x1, x2), (y1, y2), color="black")
    plt.scatter(x_test, y_test, color="red")
    plt.show()

### Test data

In [None]:
# Draw the fitted line against the held-out test samples.
plot_regressor(regr=regr, x_test=x_test, y_test=y_test)

In [None]:
def plot_residuals(regr, x_train, y_train, x_test, y_test):
    """Scatter the residuals (prediction minus target) against the predictions.

    Args:
        regr: Fitted regressor exposing ``predict``.
        x_train, y_train: Training inputs and targets (plotted in blue).
        x_test, y_test: Test inputs and targets (plotted in red).
    """
    y_pred_train = regr.predict(x_train)
    y_pred_test = regr.predict(x_test)

    # Horizontal extent of the zero line: smallest and largest prediction
    # across both splits.
    min_val = min(np.min(y_pred_train), np.min(y_pred_test))
    max_val = max(np.max(y_pred_train), np.max(y_pred_test))

    # Labels are attached to the artists so the legend cannot drift out of order.
    plt.scatter(y_pred_train, y_pred_train - y_train, color="blue", label="Train")
    plt.scatter(y_pred_test, y_pred_test - y_test, color="red", label="Test")
    plt.hlines(y=0, xmin=min_val, xmax=max_val)
    plt.xlabel("Predicted value")
    plt.ylabel("Residual (predicted - actual)")
    plt.legend()
    plt.show()

In [None]:
# Compare the residuals of the training and test splits.
plot_residuals(
    regr=regr,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)