# Linear Classifiers

## Fitting a line to data

In [None]:
import numpy as np
import matplotlib.pyplot as plt

Suppose we are measuring how a mouse's weight (X) can be used to predict its size (Y). The values are the following:

In [None]:
# Mouse weights (predictor), one entry per observation.
X = np.array([
    0.60316714, 5.13981077, 0.57754654, 3.35880456,
    5.28171939, 9.41578636, 2.43742198, 5.99075038,
    2.49605785, 6.83781763, 0.16296473, 9.29969598,
])
# Mouse sizes (response), aligned index-by-index with X.
Y = np.array([
    15.15613261, 23.89223832, 15.72151754, 16.35859565,
    22.06175073, 27.36346235, 20.4802553, 24.54353801,
    21.22924112, 21.77229456, 14.94636364, 30.70479942,
])

# Scatter the raw observations on a square figure with labelled axes.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, label='Random Points', color='blue')
ax.set_xlabel('Mouse weight')
ax.set_ylabel('Mouse size')
plt.show()

We try to find a good line that fits the data.

In [None]:
def draw_line(line, color, ax, values):
    """Plot `line` on `ax` across the range spanned by `values`.

    Args:
        line: Callable applied to an array of x-values that returns the
            matching y-values (the notebook passes `np.poly1d` objects).
        color: Colour forwarded to `ax.plot`.
        ax: Object exposing a `plot(x, y, color=...)` method.
        values: Data whose minimum and maximum bound the drawn segment.
    """
    lower, upper = np.min(values), np.max(values)
    # 100 evenly spaced samples between the two extremes of the data.
    xs = np.linspace(lower, upper, 100)
    ax.plot(xs, line(xs), color=color)

In [None]:
# Hand-picked first guess: slope 1.2, intercept 16.
line = np.poly1d([1.2, 16])

# Observations in blue with the guessed line drawn over them in red.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, label='Random Points', color='blue')
draw_line(line, 'red', ax, X)
plt.show()

Now comes the question: is it a good one? Is it the best possible line to fit the data?

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, color='blue', label='Random Points')
# (coefficients, colour) of each hand-picked candidate, drawn in this order.
for coefficients, shade in (
    ([1.2, 16], 'red'),
    ([1.4, 14], 'gray'),
    ([0.8, 16], 'cyan'),
    ([0, 22], 'green'),
):
    draw_line(np.poly1d(coefficients), shade, ax, X)
plt.show()

We can now measure how well the line fits the data by seeing how close it is to the data points.

In [None]:
# Absolute vertical gap between the current `line` and each observation.
distances = [abs(gap) for gap in (line(x) - y for x, y in zip(X, Y))]
print(distances)
# Total absolute error over all observations.
print(sum(distances))

Since we want to penalize larger divergences, we square the terms (additionally, the absolute value has some inconvenient mathematical properties: it is not differentiable at zero).

In [None]:
# Squared residual of the current `line` at each observation.
distances = list(map(lambda x, y: (line(x) - y) ** 2, X, Y))

If we add all the squared distances, the result is called the **sum of squared residuals (SSR)**, because the **residuals** are the differences between the real and estimated values.

In [None]:
# Bare expression: adds up every value currently stored in `distances`.
sum(distances)

Now let's create a function to perform the evaluation.

In [None]:
def sum_sq_res(line, X, Y):
    """Return the sum of squared residuals (SSR) of `line` over paired data.

    Args:
        line: Callable mapping a single x-value to its predicted y-value.
        X: Iterable of x-values.
        Y: Iterable of observed y-values, paired positionally with `X`;
            pairing stops at the shorter of the two iterables.

    Returns:
        The accumulated `(line(x) - y) ** 2` over all pairs, or 0 when
        there are no pairs.
    """
    total = 0
    for x_value, y_value in zip(X, Y):
        residual = line(x_value) - y_value
        total = total + residual ** 2
    return total

# Bare expression: SSR of the current `line` over the X/Y observations.
sum_sq_res(line, X, Y)

Let's evaluate the lines we used before.

In [None]:
# Pair every candidate polynomial with the colour used to draw it.
all_lines = [
    (np.poly1d(coefficients), shade)
    for coefficients, shade in (
        ([1.2, 16], 'red'),
        ([1.4, 14], 'gray'),
        ([0.8, 16], 'cyan'),
        ([0, 22], 'green'),
    )
]

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, label='Random Points', color='blue')
# Draw each candidate and report its SSR next to its colour name.
for candidate, color in all_lines:
    draw_line(candidate, color, ax, X)
    print(color, sum_sq_res(candidate, X, Y))
plt.show()

As you can see, the better the line fits the data, the smaller its SSR value.

Let's try to find the line with the minimal value. This method is called **Least Squares**. We need to find two values:
- The slope of the line, which controls its angle with respect to the horizontal axis.
- The intercept of the line, which is the point where it crosses the vertical axis.

Consider the horizontal line at the average _y_ value. It is not a good fit, but since it is based on the data, it will be our starting point.



In [None]:
# Baseline model: a horizontal line (slope 0) at the average of Y.
b = np.average(Y)
print(b)
line = np.poly1d([0, b])

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, color='blue', label='Random Points')
draw_line(line, 'red', ax, X)
# Render explicitly, as the other plotting cells do, so the figure also
# appears when the code runs outside an inline notebook backend.
plt.show()

Now we explore the influence of the slope on the SSR.

In [None]:
# Try slopes from -4 up to (but excluding) 4 in steps of 0.1, keeping the
# intercept fixed at b, and print the SSR obtained for each one.
slope_candidates = np.arange(-4, 4, 0.1)
for slope in slope_candidates:
    line = np.poly1d((slope, b))
    ssr = sum_sq_res(line, X, Y)
    print(slope, ssr)

In [None]:
# Same sweep as a comprehension: one (slope, SSR) pair per candidate slope,
# with the intercept held at b.
slope_grid = np.arange(-5, 5, 0.1)
points = [(m, sum_sq_res(np.poly1d([m, b]), X, Y)) for m in slope_grid]
ssrs = np.array(points)

# Column 0 holds the slopes, column 1 the matching SSR values.
slope_column = ssrs[:, 0]
ssr_column = ssrs[:, 1]
plt.plot(slope_column, ssr_column)
plt.show()


The SSR function has a derivative that is easy to calculate, allowing us to directly locate the minimum by finding the point where the derivative equals zero.

For this example, since we have a computer, we can directly pick the slope with the lowest SSR value among those we tried:

In [None]:
# Row whose SSR (column 1) is smallest; its column 0 is the winning slope.
min_row_index = ssrs[:, 1].argmin()
min_slope = ssrs[min_row_index][0]
min_slope

In [None]:
# Line using the best slope found by the sweep, with the intercept still
# fixed at the average of Y.
b = np.average(Y)
print(b)
line = np.poly1d([min_slope, b])

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, color='blue', label='Random Points')
draw_line(line, 'red', ax, X)
# Render explicitly, as the other plotting cells do, so the figure also
# appears when the code runs outside an inline notebook backend.
plt.show()

We can do a similar search over both parameters simultaneously to find the best line.

In [None]:
# Exhaustive grid search: every slope in [-5, 5) paired with every intercept
# in [10, 20), both in steps of 0.1. Each entry is (slope, intercept, SSR).
slope_grid = np.arange(-5, 5, 0.1)
intercept_grid = np.arange(10, 20, 0.1)
points = [
    (m, k, sum_sq_res(np.poly1d([m, k]), X, Y))
    for m in slope_grid
    for k in intercept_grid
]

In [None]:
# Convert the collected tuples into a NumPy array so the following cells can
# slice it by column (points[:, 0], points[:, 1], points[:, 2]).
points = np.array(points)
points

Let's plot the SSR values with respect to the slope and the intercept c.

In [None]:
# Not referenced by name below. NOTE(review): presumably kept so that the
# '3d' projection is registered on older Matplotlib releases; confirm before
# removing.
from mpl_toolkits.mplot3d import Axes3D

# Create a 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot the grid-search results: slope and intercept on the horizontal axes,
# SSR on the vertical axis, with the marker colour also encoding the SSR.
ax.scatter(points[:, 0], points[:, 1], points[:, 2], c=points[:, 2])

# Set labels and title
ax.set_xlabel('slope')
ax.set_ylabel('c')
# The plotted quantity is the sum of squared residuals, so label it 'SSR'
# (it previously read 'SSV').
ax.set_zlabel('SSR')
ax.set_title('3D Scatter Plot')

# Show the plot
plt.show()


And get the lowest value

In [None]:
# Locate the grid point with the smallest SSR (column 2) and unpack its
# slope, intercept and SSR.
min_row_index = points[:, 2].argmin()
best_row = points[min_row_index]
min_slope, min_c, ssr = best_row
(min_slope, min_c, ssr)

In [None]:
# Best line found by the two-parameter grid search.
line = np.poly1d([min_slope, min_c])

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X, Y, color='blue', label='Random Points')
draw_line(line, 'red', ax, X)
# Render explicitly, as the other plotting cells do, so the figure also
# appears when the code runs outside an inline notebook backend.
plt.show()

Summarizing:
- To fit the model to the available data, we need to minimize the sum of squared residuals between the model and the data.
- To do this, if we know a good candidate parameter interval, we can explore it.
    - We can also analytically calculate the partial derivatives and find the point where they are zero.
- The point where the derivative is zero is the one that minimizes the SSR, so it must be used for the final model.

## Linear classifiers in Scikit-learn

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

data = pd.read_csv('data/heart.csv')

# Replace each listed column with the integer codes LabelEncoder assigns.
# One encoder is refitted per column, so afterwards it only holds the
# classes of the last column processed.
label_encoder = LabelEncoder()
for column_name in ("Sex", "ChestPainType", "RestingECG", "ExerciseAngina", "ST_Slope"):
    data[column_name] = label_encoder.fit_transform(data[column_name])
data.head()

### Simple linear regression models
Let's try to predict the person's Age, based on the other parameters.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np

# The first column is the regression target; every remaining column is a
# feature.
y = data.iloc[:, 0]
X = data.iloc[:, 1:]

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit ordinary least squares on the training portion only.
regressor = LinearRegression().fit(X_train, y_train)

# Report the fitted parameters.
print("Coefficients:", regressor.coef_)
print("Intercept:", regressor.intercept_)


And evaluate it.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

# Score the model on the held-out split only. Predicting on the full X would
# include the rows the regressor was trained on and make the error look
# better than it is.
y_pred = regressor.predict(X_test)

# Sum of squared errors (SSE): mean squared error times the number of
# scored samples.
sse = mean_squared_error(y_test, y_pred) * len(y_test)

# Total sum of squares (SST) around the mean of the same held-out targets.
sst = np.sum((y_test - np.mean(y_test))**2)

# R-squared: fraction of the held-out variance explained by the model.
r_squared = 1 - (sse / sst)

# p-values of the coefficients via statsmodels. The OLS model is fitted on
# the training split so that it describes the same fit as `regressor`.
X_t = sm.add_constant(X_train)  # Add constant column for intercept
model = sm.OLS(y_train, X_t)
results = model.fit()
p_values = results.pvalues

# Print results
print("Sum of Squared Errors (SSE):", sse)
print("R-squared:", r_squared)
print("P-values:")
for k, p in p_values.items():
    print(k, str(round(p, 2)))