## Regresja liniowa

In [None]:
# On Unix
!wget https://raw.githubusercontent.com/jgrynczewski/datascience_demo/main/weights_heights.csv

In [None]:
# On Windows
# !pip install wget
# !python -m wget https://raw.githubusercontent.com/jgrynczewski/datascience_demo/main/weights_heights.csv

# 1. Popatrzmy na dane

In [None]:
# Import potrzebnych bibliotek

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt

In [None]:
# Load the dataset; the CSV's 'Index' column becomes the DataFrame index.
data = pd.read_csv('weights_heights.csv', index_col='Index')
# Preview the first 10 rows.
data.head(10)

In [None]:
# Summary statistics of the loaded data.
data.describe()

In [None]:
# Convert height to metres.
# NOTE(review): 0.0254 is metres per inch, so this presumes the CSV stores
# height in inches — confirm against the data source.
# The column is overwritten in place, so re-running this cell scales it again.
data["Height"] = data["Height"] * 0.0254

In [None]:
# Convert weight to kilograms.
# NOTE(review): 0.4536 is kilograms per pound, so this presumes the CSV stores
# weight in pounds — confirm against the data source.
# The column is overwritten in place, so re-running this cell scales it again.
data["Weight"] = data["Weight"] * 0.4536

In [None]:
# Display the first 10 rows of `data` once more.
data.head(10)

In [None]:
# Correlation matrix of the Height and Weight columns.
data[["Height", "Weight"]].corr()

In [None]:
# Plot the Height column in red; no `kind` is passed, so pandas' default plot type is used.
data.plot(y="Height", color="red", title="Height distribution")

In [None]:
# Plot the Weight column in green; no `kind` is passed, so pandas' default plot type is used.
data.plot(y="Weight", color="green", title="Weight distribution")

In [None]:
# Histogram of the Height column.
data.plot(y="Height", kind="hist", color="red", title="Height distribution")

In [None]:
# Histogram of the Weight column.
data.plot(y="Weight", kind='hist', color="green", title="Weight distribution")

In [None]:
def calc_bmi(height, weight):
    """Return the body-mass index: ``weight`` divided by ``height`` squared.

    The units of the result follow from the units of the arguments
    (metres and kilograms give kg/m^2).
    """
    height_squared = height ** 2
    return weight / height_squared

In [None]:
# Scatter plot with Weight on the x-axis and Height on the y-axis.
data.plot(y='Height', x='Weight', kind='scatter', title='Height vs Weight')

# 2. Minimalizacja funkcji błędu

In [None]:
# Prepare the data for minimisation.
# Matrix multiplication generalises the dot product:
#   y = w1*x + w0  <=>  Y = X @ w,  where w = [w1, w0] and X = [(x0, 1), (x1, 1), ...]
# so X carries a column of ones next to the weights; that column multiplies
# the intercept w0.

# Build the design matrix with vectorised NumPy calls instead of a per-row
# Python comprehension over the Series.
X = np.column_stack((data["Weight"].to_numpy(), np.ones(len(data))))
Y = data['Height']
print(X)
print(Y)

In [None]:
# funkcja błędu

def error(w, X, Y):
    """Return the sum of squared residuals of the linear model ``X·w`` against ``Y``.

    Args:
        w: parameter vector, one entry per column of ``X``.
        X: design matrix, one row per sample.
        Y: observed targets, one per row of ``X``.

    Returns:
        The scalar ``(Y - X·w) · (Y - X·w)``, i.e. the least-squares cost.
    """
    predicted = np.matmul(X, w)
    residuals = Y - predicted
    return residuals.dot(residuals)

In [None]:
# Minimisation

# Explicit submodule import: on older SciPy releases a bare `import scipy`
# does not load `scipy.optimize`, and `scipy.optimize.minimize` would then
# raise AttributeError.
import scipy.optimize

# Initial guess for w = [w1, w0] (float dtype, since the optimiser works in floats).
x0 = np.zeros(2)

# scipy.optimize.minimize - minimization of a scalar function of one or more variables
#
# parameters:
# fun: callable, the objective function to be minimized
# x0: ndarray, shape (n,) - initial guess; this is how `minimize` deduces
#                           the dimension of the first parameter of `fun`.
# args: tuple (optional) - extra arguments passed to the objective function
#                          and its derivatives
# method: str or callable - type of solver. If not given, chosen to be one
#                           of BFGS, L-BFGS-B, SLSQP, depending on whether
#                           or not the problem has constraints or bounds.
# bounds: sequence or Bounds (optional) - one (min, max) pair per entry of x0
#
# returns:
# res: OptimizeResult
# The optimization result represented as an OptimizeResult object. Important
# attributes are: `x`, the solution array, and `success`, a Boolean flag
# indicating whether the optimizer exited successfully.
result = scipy.optimize.minimize(
    error,
    x0=x0,
    args=(X, Y),
    method='L-BFGS-B',
    bounds=((-100, 100), (-10, 10)),  # (w1 range, w0 range)
)

# Do not silently carry on with parameters from a failed run.
if not result.success:
    print("Optimisation did not converge:", result.message)

In [None]:
# Unpack the fitted parameters in the order of w = [w1, w0]
# and print each one on its own line.
w1, w0 = result.x
print(w1, w0, sep="\n")

In [None]:
# Hypothesis: the relationship between Height and Weight can be described by
# a linear function (linear regression), y = w0 + w1 * x.
# Series.min()/.max() are vectorised and skip NaN, unlike the Python built-ins,
# which iterate element by element and give order-dependent results with NaN.
x = np.linspace(data['Weight'].min(), data['Weight'].max(), 100)
y = w0 + x * w1

# Keep the Axes returned by pandas so the fitted line is drawn on the same
# plot explicitly, rather than relying on pyplot's implicit "current axes".
ax = data.plot(
    y='Height',
    x='Weight',
    kind='scatter'
)

# Overlay the fitted line in red.
ax.plot(x, y, '-r')
plt.show()

In [None]:
# Prediction: evaluate the fitted line at a weight of 80
# (in the units of the Weight column).
x = 80
y = w1 * x + w0
print(y)