<a href="https://colab.research.google.com/github/kingajutrzenka/scikit-learn/blob/main/regresja%20%3A%20metody%20oceny%20modelu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Regresja: metody oceny modelu**

# Import bibliotek

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Simulated ground-truth targets: 50 draws from a normal distribution with
# mean 100 and standard deviation 20. No seed is set in this cell, so the
# values differ on every run.
y_true = 20 * np.random.randn(50) + 100
y_true

array([102.20967206,  88.99988317,  64.86652099, 114.06086713,
        95.90426038, 111.30597971,  91.46502016,  93.78462728,
       103.78484618, 121.19575063, 123.26970528,  92.10443404,
        70.53306753,  80.89343244,  93.09201581, 104.99118672,
        80.84615928,  90.71711018,  89.40660269,  91.75581333,
       132.36246174, 120.93771238,  99.16257883, 118.36226059,
       102.44052896,  97.02088372,  70.94970867,  96.30405372,
       109.07042367,  98.88678735, 121.26724728,  62.29279613,
        80.79488505, 107.16451958,  77.61011397, 102.13158449,
       115.62211776, 107.02325852, 135.56873784, 113.15512118,
        95.77058795, 115.69318651,  91.35323059,  70.38200727,
       112.54110033,  80.4692914 ,  86.56937763,  62.05608391,
       119.57674495, 105.31367409])

In [3]:
# Simulated predictions: the true values plus Gaussian noise (std 10).
# The noise vector is sized from y_true rather than repeating the literal 50,
# so the two arrays always have the same length.
y_pred = y_true + 10 * np.random.randn(len(y_true))
y_pred

array([109.66209665,  95.5551822 ,  65.87544803, 114.50132102,
        98.66153429, 107.94636237,  88.75584322,  93.47319987,
       107.90635475, 124.44428899, 117.67710072,  81.74003166,
        65.42333134,  83.23187389, 107.4357318 , 116.22727413,
        75.21459807,  82.73765236, 100.76146787,  97.75904862,
       122.88565233, 135.26240047,  82.27862267, 119.64265746,
       102.5949971 ,  93.12544656,  78.19908331,  92.00498053,
       102.36830723, 106.42203796, 129.39870774,  59.20735655,
        68.06789733, 115.25566458,  91.0335169 ,  99.83254359,
       112.8755156 , 109.70505987, 128.76090376,  91.75498803,
        85.82119917, 102.52208979,  96.52034268,  56.51323917,
       113.20443125,  96.09015082,  73.81553624,  64.24013616,
       125.76169722, 113.5801272 ])

In [4]:
# Put both arrays side by side in one table, one row per observation.
results = pd.DataFrame.from_dict({'y_true': y_true, 'y_pred': y_pred})
results

Unnamed: 0,y_true,y_pred
0,102.209672,109.662097
1,88.999883,95.555182
2,64.866521,65.875448
3,114.060867,114.501321
4,95.90426,98.661534
5,111.30598,107.946362
6,91.46502,88.755843
7,93.784627,93.4732
8,103.784846,107.906355
9,121.195751,124.444289


In [5]:
# Residual per observation (true minus predicted): a positive value means the
# prediction was too low, a negative value means it was too high.
results['error'] = results['y_true'].sub(results['y_pred'])
results

Unnamed: 0,y_true,y_pred,error
0,102.209672,109.662097,-7.452425
1,88.999883,95.555182,-6.555299
2,64.866521,65.875448,-1.008927
3,114.060867,114.501321,-0.440454
4,95.90426,98.661534,-2.757274
5,111.30598,107.946362,3.359617
6,91.46502,88.755843,2.709177
7,93.784627,93.4732,0.311427
8,103.784846,107.906355,-4.121509
9,121.195751,124.444289,-3.248538


# Interpretacja graficzna

In [7]:
def plot_regression_results(y_true, y_pred):
    """Show an interactive scatter of predicted against true values.

    Every observation is drawn as a marker at (y_true, y_pred). A second trace
    joins (lo, lo) to (hi, hi), where lo/hi are the smallest and largest values
    found in either input; this is the y_pred == y_true diagonal, so a
    marker's vertical distance from it equals that observation's error.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, paired element-wise with ``y_true``.

    Returns:
        None. The figure is rendered through ``fig.show()``.
    """
    frame = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})

    # Common range over both columns so the reference line covers all points.
    lo = min(frame['y_true'].min(), frame['y_pred'].min())
    hi = max(frame['y_true'].max(), frame['y_pred'].max())

    points = go.Scatter(x=frame['y_true'], y=frame['y_pred'], mode='markers')
    diagonal = go.Scatter(x=[lo, hi], y=[lo, hi])
    layout = go.Layout(
        showlegend=False,
        xaxis_title='y_true',
        yaxis_title='y_pred',
        title='Regresja liniowa',
    )

    fig = go.Figure(data=[points, diagonal], layout=layout)
    fig.show()
# Draw the plot for the y_true / y_pred arrays currently held in the kernel.
plot_regression_results(y_true, y_pred)

# dzięki temu wykresowi widzimy błędy

In [9]:
# Larger sample (1000 points) used for the error histogram.
# NOTE: this rebinds y_true, y_pred and results, so the cells below work on
# these 1000-point versions rather than the 50-point ones created earlier.
y_true = 100 + 20 * np.random.randn(1000)
# Size the noise from y_true instead of repeating the literal 1000, so the
# two arrays cannot end up with different lengths.
y_pred = y_true + 10 * np.random.randn(len(y_true))
results = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
results['error'] = results['y_true'] - results['y_pred']
# Distribution of the residuals (true minus predicted).
px.histogram(results, x='error')

# Średni błąd bezwzględny (MAE)

In [11]:
def mean_absolute_error(y_true, y_pred):
    """Return the mean absolute error (MAE) between targets and predictions.

    Computed as ``sum(|y_true[i] - y_pred[i]|) / len(y_true)``.

    Args:
        y_true: Array-like of observed values (list, tuple, ndarray, Series).
        y_pred: Array-like of predicted values with the same shape as
            ``y_true``. Values are compared positionally.

    Returns:
        The average absolute difference as a NumPy scalar.

    Raises:
        ValueError: If the inputs differ in shape or contain no samples.
    """
    # np.asarray lets plain lists and tuples through; the bare expression
    # ``y_true - y_pred`` only works for objects that already support
    # element-wise subtraction.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    # Without this check NumPy would broadcast e.g. (n, 1) against (n,) into
    # an (n, n) matrix and return a meaningless number without any error.
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_arr.shape} and {pred_arr.shape}"
        )
    # An empty input would otherwise divide by zero and yield NaN.
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")

    return np.abs(true_arr - pred_arr).sum() / len(true_arr)
# Evaluate the hand-written MAE on the y_true / y_pred arrays currently in scope.
mean_absolute_error(y_true, y_pred)

np.float64(7.798765973799477)

In [12]:
# NOTE: this import rebinds the name `mean_absolute_error`, so from here on the
# hand-written function defined earlier in the notebook is replaced by the
# scikit-learn implementation.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true, y_pred)

7.798765973799477

**Wynik jest taki sam, ale lepiej korzystać z gotowej implementacji ze scikit-learn.**

# Błąd średniokwadratowy (średnia kwadratów różnic)

In [15]:
# `mean_squared_error` is already imported in the first cell; the repeated
# import is harmless.
from sklearn.metrics import mean_squared_error
# The MSE is stored in `m` (nothing is displayed by this cell); the next cell
# takes its square root.
m = mean_squared_error(y_true, y_pred)

# Pierwiastek błędu średniokwadratowego (RMSE)

In [16]:
# RMSE: square root of the MSE stored in `m` by the previous cell
# (equivalent to np.sqrt(mean_squared_error(y_true, y_pred))).
# A cell displays only its last expression, so one expression is enough;
# evaluating both forms computed the same value twice and discarded the first.
np.sqrt(m)

np.float64(9.975367849789261)

# Błąd maksymalny/minimalny

In [18]:
def max_error(y_true, y_pred):
    """Return the largest absolute difference between targets and predictions.

    Args:
        y_true: Array-like of observed values (list, tuple, ndarray, Series).
        y_pred: Array-like of predicted values with the same shape as
            ``y_true``. Values are compared positionally.

    Returns:
        ``max(|y_true[i] - y_pred[i]|)`` as a NumPy scalar.

    Raises:
        ValueError: If the inputs differ in shape or contain no samples.
    """
    # np.asarray lets plain lists and tuples through; ``list - list`` would
    # raise a TypeError.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    # Reject shapes that NumPy would otherwise broadcast silently,
    # e.g. (n, 1) against (n,).
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_arr.shape} and {pred_arr.shape}"
        )
    # A maximum over zero elements is undefined; fail with a clear message.
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")

    return np.abs(true_arr - pred_arr).max()
# Largest absolute error over the y_true / y_pred arrays currently in scope.
max_error(y_true, y_pred)

np.float64(40.7475291867986)

In [20]:
def min_error(y_true, y_pred):
    """Return the smallest absolute difference between targets and predictions.

    Args:
        y_true: Array-like of observed values (list, tuple, ndarray, Series).
        y_pred: Array-like of predicted values with the same shape as
            ``y_true``. Values are compared positionally.

    Returns:
        ``min(|y_true[i] - y_pred[i]|)`` as a NumPy scalar.

    Raises:
        ValueError: If the inputs differ in shape or contain no samples.
    """
    # np.asarray lets plain lists and tuples through; ``list - list`` would
    # raise a TypeError.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)

    # Reject shapes that NumPy would otherwise broadcast silently,
    # e.g. (n, 1) against (n,).
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_arr.shape} and {pred_arr.shape}"
        )
    # A minimum over zero elements is undefined; fail with a clear message.
    if true_arr.size == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")

    return np.abs(true_arr - pred_arr).min()
# Smallest absolute error over the y_true / y_pred arrays currently in scope.
min_error(y_true, y_pred)

np.float64(0.002828244154258641)

# Współczynnik determinacji

In [21]:
# `r2_score` is already imported in the first cell; the repeated import is
# harmless.
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of y_pred with respect to y_true.
r2_score(y_true, y_pred)

0.7673761773195155