In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Regression

## Linear Regression

In [None]:
def generate_data_reg(rule, size=100, noise_std=1):
    """
    Generate a noisy 1-D regression dataset.

    rule: function applied to the array of x values to get the noiseless y values.
    size: number of points to draw.
    noise_std: standard deviation of the zero-mean Gaussian noise added to y.

    Returns (xx, yy), where xx is drawn uniformly between 0 and 100.
    """
    features = np.random.uniform(0, 100, size)
    jitter = np.random.normal(0, noise_std, size)
    targets = rule(features) + jitter
    return features, targets

In [None]:
# Sample noisy points around the line y = 10 + 0.5*x (noise std 10) and take a first look at them.
xx, yy = generate_data_reg(rule=lambda x: 10 + 0.5*x, noise_std=10)
plt.scatter(xx, yy)

In [None]:
def plot_data(xx, yy, **kwargs):
    """
    Scatter-plot yy against xx on the current figure and label the axes "X" and "Y".

    Any extra keyword arguments (color, label, s, ...) are passed straight to plt.scatter.
    """
    plt.scatter(xx, yy, **kwargs)
    plt.xlabel("X")
    plt.ylabel("Y")


In [None]:
plot_data(xx, yy)

In [None]:
xx, yy = generate_data_reg(rule=lambda x: 10 + x**2, noise_std=1000)
plot_data(xx, yy)

In [None]:
xx, yy = generate_data_reg(rule=lambda x: 10 + 0.5*x, noise_std=10)

### General workflow

Make prediction -> Compare with ground truth -> Make a slightly better prediction <br>

Let's make a prediction!

In [None]:
# coefs is [intercept, slope]: the guessed line is y = 10 - 1*x.
coefs = [10, -1] # Random prediction
# Red points: what the guessed line predicts for every x.
plot_data(xx, coefs[0] + xx*coefs[1], color="red")
# Blue points: the actual data.
plot_data(xx, yy, color="blue")

How do we evaluate this prediction? We need to construct a cost function.

In [None]:
#TOREMOVE
def cost_reg(truth, prediction):
    """
    Squared error between the true values and the predicted ones.

    Works element-wise on numpy arrays (no summation is done here).
    """
    residual = truth - prediction
    return residual ** 2

In [None]:
# def cost_reg(truth, prediction):
    ### What would be a reasonable cost function?

In [None]:
## Let's check that this cost function makes sense

In [None]:
def score_prediction(xx, yy_true, coefs):
    """
    Total cost of the regression line described by `coefs` on the data.

    coefs holds two values - [intercept, slope]. The line's prediction for
    every x is compared with yy_true through cost_reg and the per-point
    costs are summed into a single number (lower is better).
    """
    #TOREMOVE
    intercept, slope = coefs[0], coefs[1]
    yy_predicted = intercept + xx*slope
    per_point_cost = cost_reg(yy_true, yy_predicted)
    return per_point_cost.sum()

Okay, we have a cost function that will allow us to evaluate any given prediction. Now we need some way to make a better prediction using this cost function. 

Our strategy will be the following: <br>
- Try changing each of the coefficients a little.
- As soon as we stumble upon a beneficial change (that makes the regression line better), we accept the new regression line.
This is not the most efficient strategy (and definitely not the one used in real applications), but it's simple and it works for this simple task!

In [None]:
def update_coefs_linreg(xx, yy_true, coefs, step_size=0.01):
    """
    Perform one greedy optimization step for the regression coefficients.

    The coefficients are visited in random order. For each one we try adding
    and then subtracting `step_size`; the first change that lowers
    score_prediction is accepted and the step ends.

    xx, yy_true: the data.
    coefs: current coefficients, [intercept, slope] (list or numpy array).
    step_size: how much a single coefficient is changed in one step.

    Returns a new numpy array with the updated coefficients. If no change
    improves the score, the returned values equal the input ones. The input
    `coefs` is never modified.
    """
    # Work on a float array of our own. `+=` on a Python list would *extend*
    # the list with delta instead of adding element-wise, and slicing a numpy
    # array gives a view, so in-place updates would leak into the caller's data.
    coefs = np.array(coefs, dtype=float)

    # The score of the current line does not change during the step.
    current_score = score_prediction(xx, yy_true, coefs)

    # Visit the coefficients in a random order so none of them is always favoured.
    ii = np.arange(len(coefs))
    np.random.shuffle(ii)

    for i in ii: # Try changing each of the coefficients a little
        delta = np.zeros(len(coefs))
        delta[i] = step_size # Changing ith coefficient by 'step_size'

        # If adding delta to i-th coefficient makes the line better, accept it
        if score_prediction(xx, yy_true, coefs + delta) < current_score:
            return coefs + delta
        # If subtracting delta from i-th coefficient makes the line better, accept it
        if score_prediction(xx, yy_true, coefs - delta) < current_score:
            return coefs - delta

    # No single change helped: the coefficients stay as they are.
    return coefs

In [None]:
coefs = [10, -1]
optimized_one_step_coefs = update_coefs_linreg(xx, yy, coefs)
print(optimized_one_step_coefs)

In [None]:
plt.figure(figsize=(10, 5))
plot_data(xx, coefs[0] + xx * coefs[1], color="red", label="old prediction", s=10)
plot_data(xx, optimized_one_step_coefs[0] + xx * optimized_one_step_coefs[1], 
          color="green", label="new prediction", s=10)
plot_data(xx, yy, color="blue")
plt.legend()

Now we can do our optimization!

In [None]:
def linear_regression(xx, yy, step_size=0.01):
    """
    Fit a line to the data by repeating greedy update steps.

    Starts from [intercept, slope] = [1.0, 1.0] and keeps calling
    update_coefs_linreg until a step leaves the coefficients unchanged.
    The coefficients and their score are printed after every step.

    Returns the final coefficients.
    """
    coefs = np.array([1.0, 1.0])
    converged = False
    while not converged:
        previous_coefs = coefs.copy()  # remember where we were before the step
        coefs = update_coefs_linreg(xx, yy, coefs, step_size=step_size)
        print(coefs, score_prediction(xx, yy, coefs))
        # An unchanged result means no single change improves the line any more.
        converged = all(previous_coefs == coefs)
    return coefs

In [None]:
# We will skip this
def normalize_reg(xx, yy):
    """
    Standardize xx and yy: subtract the mean and divide by the standard deviation.

    Returns (xx_normalized, yy_normalized, m_x, m_y, s_x, s_y) so that the
    transformation can be undone later.
    """
    m_x = xx.mean()
    m_y = yy.mean()
    s_x = xx.std()
    s_y = yy.std()
    xx_normalized = (xx - m_x)/s_x
    yy_normalized = (yy - m_y)/s_y
    return xx_normalized, yy_normalized, m_x, m_y, s_x, s_y

def linear_regression_normalized(xx, yy, step_size=0.01):
    """
    Same greedy fit as linear_regression, but run on standardized data.

    The data are normalized with normalize_reg, the line is fitted in the
    normalized space, and the coefficients are then converted back so that
    they describe a line in the original units.

    Returns a list [intercept, slope].
    """
    xx, yy, m_x, m_y, s_x, s_y = normalize_reg(xx, yy)

    coefs = np.array([1.0, 1.0])
    converged = False
    while not converged:
        previous_coefs = coefs.copy()
        coefs = update_coefs_linreg(xx, yy, coefs, step_size=step_size)
        converged = all(previous_coefs == coefs)

    # Undo the normalization:
    #   (y - m_y)/s_y = c0 + c1*(x - m_x)/s_x
    #   =>  y = (m_y + s_y*c0 - s_y/s_x*c1*m_x) + (s_y/s_x*c1) * x
    k = s_y/s_x * coefs[1]
    b = m_y + s_y*coefs[0] - k * m_x
    return [b, k]

In [None]:
coefs = linear_regression(xx, yy, step_size=0.01)

In [None]:
plot_data(xx, yy)
plot_data(xx, coefs[0] + coefs[1]*xx)

# Examples with real data

In [None]:
import seaborn as sns

iris = sns.load_dataset('iris')
iris

In [None]:
xx = iris.sepal_length
yy = iris.sepal_width

In [None]:
plt.scatter(xx, yy)

In [None]:
# Keep only the rows whose species is "setosa".
setosas = iris.loc[iris.species == "setosa"]
# Predictor: sepal length; target: sepal width.
xx = setosas.sepal_length
yy = setosas.sepal_width
plt.scatter(xx, yy)
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")

In [None]:
coefs = linear_regression(xx, yy)

In [None]:
plt.scatter(xx, yy, color="blue")
plt.scatter(xx, coefs[0] + xx*coefs[1], color="green")
plt.xlabel("Sepal length")
plt.ylabel("Sepal width")

In [None]:
def predict_sepal_width_from_sepal_length(sepal_length, coefs):
    """
    Predict the sepal width for a given sepal length with a fitted line.

    coefs is [intercept, slope], as returned by linear_regression.
    """
    #TOREMOVE
    intercept, slope = coefs[0], coefs[1]
    return intercept + slope*sepal_length

# Classification

In [None]:
def generate_data_cls(rule, n_variables=2, size=100):
    """
    Generate a classification dataset.

    rule: function that takes the features of one observation as separate
          arguments and returns its class (True/False or 1/0).
    n_variables: number of features per observation.
    size: number of observations.

    Returns (xx, yy): xx has shape (size, n_variables) with values drawn
    uniformly between 0 and 100, yy is a float array with one label per row.
    """
    xx = np.random.uniform(low=0, high=100, size=(size, n_variables))
    # Apply the rule to every observation (row), unpacking its features as arguments
    labels = [rule(*observation) for observation in xx]
    yy = np.array(labels).astype(float)
    return xx, yy

In [None]:
xx, yy = generate_data_cls(rule=lambda x1, x2: x1 < x2)
plt.scatter(xx[:, 0], xx[:, 1], c=["red" if y == 0 else "green" for y in yy])

In [None]:
def plot_data(xx, yy):
    """
    Scatter-plot two-feature observations, coloured by class.

    xx: 2-D array, first column on the horizontal axis, second on the vertical one.
    yy: one label per observation; truthy labels are drawn green, the rest red.

    Note: this replaces the regression version of plot_data defined earlier
    in the notebook (same name, different arguments).
    """
    point_colors = ["green" if label else "red" for label in yy]
    x1, x2 = xx[:, 0], xx[:, 1]
    plt.scatter(x1, x2, color=point_colors)
    plt.xlabel("X1")
    plt.ylabel("X2")

In [None]:
xx, yy = generate_data_cls(rule=lambda x1, x2: (x1-50)*(x2-50) > 10, size=500)
plot_data(xx, yy)

In [None]:
xx, yy = generate_data_cls(rule=lambda x1, x2: (x1-50)**2 + (x2-50)**2 < 25**2, size=500)
plot_data(xx, yy)

In [None]:
xx, yy = generate_data_cls(rule=lambda x1, x2: x2 > 100-x1, size=500)
plot_data(xx, yy)

## Logistic Regression

General principle: we want to assign (?somehow?) a value to each point. If this value is, say, large, then the point is green, if it is small, then it is red. <br>
Two questions: <br>
- How to assign this value?
- How to formalize the "if large, green, if small, red" principle?
  
Let's start from the end.

### If large, green; if small, red.

This principle can be formalized with the logistic function, which maps any real number (from $-\infty$ to $+\infty$) into the $(0, 1)$ range.

In [None]:
def logistic_function(x):
    """
    Logistic (sigmoid) function 1 / (1 + exp(-x)), computed without overflow.

    x: a number or an array-like of numbers.

    Returns values between 0 and 1: a numpy scalar for scalar input,
    a numpy array of the same shape for array input.
    """
    x = np.asarray(x, dtype=float)
    # exp is only ever taken of a non-positive number, so it cannot overflow.
    # The plain formula computes exp(-x), which overflows (RuntimeWarning)
    # when x is a large negative number.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)) - the same function.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    return result[()]  # turns a 0-d array into a scalar, leaves real arrays as they are

sample_x = np.arange(-10, 10, 0.1)
sample_y = logistic_function(sample_x)
plt.plot(sample_x, sample_y)
plt.show()

So, whatever number our algorithm outputs for a certain point, we apply logistic function to it and the closer it is to 1 the more sure we are that the point should be green and the closer it is to 0, the more sure we are that the point should be red. <br>
Notice that we are only able to do it because of the limited range. We can tell exactly how close 0.99 is to 1.0 but we cannot tell how close 10 is to $\infty$

### Assigning value to each point

There are many ways of doing that. And different ways correspond to different machine learning algorithms for classification. We are going to use the simplest one - logistic regression. In logistic regression each point is assigned a value equal to a linear combination of its coordinates.

In [None]:
sample_point = [1, 1] # x1 = 1, x2 = 1
b, w1, w2 = 10, 20, 30 
assigned_value = b + sample_point[0] * w1 + sample_point[1] * w2 # b + x1*w1 + x2*w2
print(assigned_value, logistic_function(assigned_value))

So, what we need to do is to learn the coefficients $b$, $w_1$ and $w_2$ that will make the best predictions on our dataset (predict something close to 0 when the point is red and close to 1.0 when the point is green). <br>

In [None]:
# Hand-picked coefficients: b = -1000, w1 = 10, w2 = 20.
predictions = logistic_function(-1000 + xx @ np.array([10, 20]))
# Colour every point by its logistic output using the red-yellow-green colormap.
plt.scatter(xx[:, 0], xx[:, 1], c=predictions, cmap="RdYlGn")

The separation line is the one where the weighted sum of the variables is equal to 0, i.e. where the logistic function of that sum is equal to 0.5 <br><br>
$b + w_1x_1 + w_2x_2 = 0$ <br> 
$w_2x_2 = -b - w_1x_1$ <br>
$x_2 = -\frac{b}{w_2} - \frac{w_1}{w_2}x_1$  <br><br>
This is the equation of the separation line. You can see that its intercept (the value of $x_2$ at $x_1 = 0$) is equal to $-\frac{b}{w_2}$ and its slope is $-\frac{w_1}{w_2}$.

### Cost function

Cost: <br>
$C = -\left(y \log(p) + (1-y) \log(1-p)\right)$

|yy |prediction|C|
|---|----------|-------------|
|0|->0|?|
|0|->1|?|
|1|->0|?|
|1|->1|?|

In [None]:
def cost_cls(truth, prediction, eps=1e-15):
    """
    Cross-entropy (log-loss) cost of predicted probabilities.

    truth: true labels (0 or 1).
    prediction: predicted probabilities of class 1.
    eps: predictions are clipped to [eps, 1 - eps] so the logarithm is never
         taken of 0.

    Returns the cost of every point (no summation is done here).
    """
    clipped = np.clip(prediction, eps, 1 - eps)
    cost_if_one = truth * np.log(clipped)            # counts when truth is 1
    cost_if_zero = (1-truth) * np.log(1-clipped)     # counts when truth is 0
    return -(cost_if_one + cost_if_zero)

In [None]:
def score_prediction_cls(xx, yy_true, coefs):
    """
    Total classification cost of the coefficients `coefs` on the data.

    xx: 2-D array of features, one row per observation.
    yy_true: true labels (0 or 1), one per observation.
    coefs: [b, w1, w2, ...] - the intercept followed by one weight per feature.

    The predicted probability of every point is logistic_function(b + w1*x1 + ...),
    and the points are scored with the cross-entropy cost cost_cls
    (not the squared error used for regression). Lower is better.
    """
    # A leading column of ones lets the intercept b be handled like any other weight.
    ones = np.ones((len(yy_true), 1))
    xx_with_ones = np.hstack([ones, xx])
    predictions = logistic_function(xx_with_ones @ coefs)
    return cost_cls(yy_true, predictions).sum()

Notice that here we do not have a separate $b$ coefficient. That is because it is included in coefs: our coefs will be of the form <br>
$[b, w_1, w_2]$ <br>
And inside the function we add a column of ones to xx so that it has the following shape: <br>
$[1, x_{11}, x_{12}]$<br>
$[1, x_{21}, x_{22}]$ <br>
...<br>
$[1, x_{m1}, x_{m2}]$ <br>


With matrix by vector multiplication it becomes $1*b + x_1*w_1 + x_2*w_2$ 

## Updating coefficients

So, the only thing we have to do now is to come up with a way to propose a prediction that is better than the current one. <br>
Then we will be able to start with a random separation line and by iterative improvement get the best prediction possible! 

We will employ the same algorithm as with linear regression.

In [None]:
def update_coefs_logreg(xx, yy_true, coefs, step_size=0.001):
    """
    Perform one greedy optimization step for the logistic regression coefficients.

    The coefficients are visited in order. For each one we try adding and
    then subtracting `step_size`; the first change that lowers
    score_prediction_cls is accepted and the step ends.

    xx, yy_true: the data.
    coefs: current coefficients, [b, w1, w2, ...] (list or numpy array).
    step_size: how much a single coefficient is changed in one step.

    Returns a new numpy array with the updated coefficients. If no change
    improves the score, the returned values equal the input ones. The input
    `coefs` is never modified.
    """
    # Work on a float array of our own. Slicing a numpy array gives a view,
    # so in-place updates would silently change the caller's coefficients
    # (and `+=` on a Python list would extend the list instead of adding).
    coefs = np.array(coefs, dtype=float)

    # The score of the current coefficients does not change during the step.
    current_score = score_prediction_cls(xx, yy_true, coefs)

    for i in range(len(coefs)): # Try changing each of the coefficients a little
        delta = np.zeros(len(coefs))
        delta[i] = step_size # Changing ith coefficient by 'step_size'

        # If adding delta to i-th coefficient makes the prediction better, accept it
        if score_prediction_cls(xx, yy_true, coefs + delta) < current_score:
            return coefs + delta
        # If subtracting delta from i-th coefficient makes the prediction better, accept it
        if score_prediction_cls(xx, yy_true, coefs - delta) < current_score:
            return coefs - delta

    # No single change helped: the coefficients stay as they are.
    return coefs

In [None]:
coefs = np.array([1.0, 1.0, 1.0])
update_coefs_logreg(xx, yy, coefs)

In [None]:
# Predictions of the starting coefficients b = 1, w1 = 1, w2 = 1.
predictions = logistic_function(1 + xx @ np.array([1, 1]))
# Colour every point by its logistic output using the red-yellow-green colormap.
plt.scatter(xx[:, 0], xx[:, 1], c=predictions, cmap="RdYlGn")

And we do the optimization as before

In [None]:
def logistic_regression(xx, yy, step_size=0.01):
    """
    Fit logistic regression coefficients by repeating greedy update steps.

    xx: 2-D array of features, one row per observation.
    yy: labels (0 or 1), one per observation.
    step_size: how much a single coefficient is changed in one step
               (in the normalized feature space).

    The features are standardized, the coefficients are optimized starting
    from all ones until a step leaves them unchanged, and the result is
    converted back to the original feature space. The number of steps taken
    is printed.

    Returns a numpy array [b, w1, w2, ...].
    """
    # Normalization helps, if you want to know why, we can discuss
    means = xx.mean(axis=0)
    sds = xx.std(axis=0)
    xx_normalized = (xx - means)/sds

    coefs = np.array([1.0]*(xx.shape[1] + 1)) # [1.0, 1.0, 1.0]
    n_steps = 0
    while True:
        previous_coefs = coefs.copy()
        coefs = update_coefs_logreg(xx_normalized, yy, coefs, step_size=step_size)
        # Exact comparison on purpose: an accepted step changes exactly one
        # coefficient by step_size. A tolerance-based check (np.allclose) uses
        # a relative tolerance and would mistake such a step for "no change"
        # once that coefficient is large, stopping the optimization too early.
        if np.array_equal(previous_coefs, coefs):
            break
        n_steps += 1
    print(f"finished in {n_steps} steps")

    # We got the coefs, but they only work for the "normalized space". We have to translate them to the initial space:
    #   b' + sum(w'_j * (x_j - mean_j)/sd_j) = (b' - sum(w'_j*mean_j/sd_j)) + sum((w'_j/sd_j) * x_j)
    b = coefs[0] - (coefs[1:]*means/sds).sum()
    ws = list(coefs[1:]/sds)
    return np.array([b] + ws)

In [None]:
coefs = logistic_regression(xx, yy, step_size=0.1)
b, w1, w2 = coefs
# The line is drawn over the x1 range of the data, so both ends come from column 0.
xx_for_line = np.linspace(xx[:, 0].min(), xx[:, 0].max(), 10)
# Separation line: b + w1*x1 + w2*x2 = 0  =>  x2 = -b/w2 - w1/w2 * x1
yy_for_line = -b/w2 - w1/w2 * xx_for_line
plt.plot(xx_for_line, yy_for_line)
plot_data(xx, yy)

In [None]:
def plot_splitting_line(coefs, xx, clip_to_data=False):
    """
    Draw the separation line b + w1*x1 + w2*x2 = 0 on the current figure.

    coefs: [b, w1, w2].
    xx: 2-D array of the data; the line is drawn over the range of its first column.
    clip_to_data: if True, limit the vertical axis to the range of the second
                  column (plus a 10% margin) so that a steep line does not
                  squash the data points. Off by default.
    """
    b, w1, w2 = coefs
    if w2 == 0:
        # x2 drops out of the equation: the line is vertical at x1 = -b/w1
        # (and there is no line at all if w1 is 0 as well).
        if w1 != 0:
            plt.axvline(-b/w1)
    else:
        xx_for_line = np.array([xx[:, 0].min(), xx[:, 0].max()])
        yy_for_line = -b/w2 - w1/w2 * xx_for_line
        plt.plot(xx_for_line, yy_for_line)

    # Making the plot prettier, don't worry about it
    if clip_to_data:
        mx, mn = xx[:, 1].max(), xx[:, 1].min()
        rng = mx-mn
        plt.ylim(mn-0.1*rng, mx+0.1*rng)

In [None]:
plot_data(xx, yy)
plot_splitting_line(coefs, xx)

In [None]:
# Labels follow a linear rule: a point is positive when x1 - 3*x2 + 1 > 0.
xx, yy = generate_data_cls(n_variables=2, size=300, rule=lambda x1, x2: x1 - 3*x2 + 1 > 0)
plot_data(xx, yy)
# Fit the coefficients and draw the resulting separation line over the data.
coefs = logistic_regression(xx, yy, step_size=0.2)
plot_splitting_line(coefs, xx)

In [None]:
xx, yy = generate_data_cls(n_variables=2, size=200, rule=lambda x1, x2: (x1-10)**2+(x2-10)**2 < 25**2)
plot_data(xx, yy)

In [None]:
coefs = logistic_regression(xx, yy, step_size=0.01)
print(coefs)

In [None]:
plot_data(xx, yy)
plot_splitting_line(coefs, xx)

# Examples with real data

In [None]:
import seaborn as sns

iris = sns.load_dataset('iris')
iris

In [None]:
# Features: the first two columns of the iris table (presumably sepal length and sepal width - check iris.columns).
xx = iris.iloc[:, :2].to_numpy()
# Label: 1.0 for setosa, 0.0 for every other species.
yy = (iris.species == "setosa").astype(float).to_numpy()
plot_data(xx, yy)
coefs = logistic_regression(xx, yy, step_size=0.01)
plot_splitting_line(coefs, xx)

In [None]:
def predict(val, coefs):
    """
    Classify a single point with fitted logistic regression coefficients.

    val: the features of the point, e.g. [x1, x2].
    coefs: [b, w1, w2, ...].

    Returns "green" if the logistic output is above 0.5, otherwise "red".
    """
    #TOREMOVE
    # A leading 1 pairs up with the intercept b in the dot product.
    point_with_one = np.array([1, *val])
    probability = logistic_function(point_with_one @ coefs)
    if probability > 0.5:
        return "green"
    return "red"

In [None]:
print(predict([3.5, 3.0], coefs))
print(predict([6.5, 3.0], coefs))