In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import binary_crossentropy

### Goals of this notebook

In this notebook, I compare my own implementation of logistic regression with the one provided by scikit-learn. Before doing that, I need some data to work with. I will start with a simple linearly separable example and then move on to a more complex one.

### Creating our dataset

I am not sure what the optimal approach to generating a classification dataset is, so why not try a naive but simple approach?

#### IMPLEMENTATION 1:
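For each class, generate a random weight vector of length (number of features + 1); the extra entry acts as the bias. Then generate random data points of shape (num samples, num features) and append a constant column of ones for the bias, giving shape (num samples, num features + 1). Take the dot product of every data point with each class's weight vector and add a small amount of Gaussian noise; the class whose weight vector yields the highest value becomes the sample's class.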

In [None]:
def generate_dataset_for_classification(num_samples, num_classes, num_features, seed=42):
    """Build a synthetic multi-class dataset labelled by random linear scorers.

    Features are drawn uniformly from [-10, 10). Every class gets a random
    weight vector (one weight per feature plus a bias weight); a sample is
    assigned to the class whose noisy linear score is largest.

    Parameters
    ----------
    num_samples : int
        Number of rows to generate.
    num_classes : int
        Number of classes (columns of the weight matrix).
    num_features : int
        Number of random features per sample, not counting the bias column.
    seed : int, optional
        Value passed to ``np.random.seed``; note that this reseeds NumPy's
        global random state.

    Returns
    -------
    X : ndarray of shape (num_samples, num_features + 1)
        The random features followed by a trailing column of ones (bias).
    y : ndarray of shape (num_samples,)
        Integer class index of each sample.
    """
    np.random.seed(seed)

    # The draw order (features, then weights, then noise) is what makes a
    # given seed map to one specific dataset, so it must not be rearranged.
    raw_features = np.random.uniform(low=-10, high=10, size=(num_samples, num_features))
    class_weights = np.random.randn(num_features + 1, num_classes)

    # Append the constant bias column so the last weight row acts as an intercept.
    bias_column = np.ones((raw_features.shape[0], 1))
    X = np.concatenate([raw_features, bias_column], axis=1)

    # Small Gaussian jitter on the scores blurs the class boundaries slightly.
    score_noise = np.random.normal(loc=0, scale=0.1, size=(num_samples, num_classes))
    class_scores = X.dot(class_weights) + score_noise

    y = class_scores.argmax(axis=1)
    return X, y

# Two classes over two random features (X also carries a trailing bias column).
X, y = generate_dataset_for_classification(num_samples=100, num_classes=2, num_features=2)

# Plot the two real features against each other, coloured by class label;
# the constant bias column (index 2) is deliberately not plotted.
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def generate_linearly_separable_data(num_samples, dim, distance_threshold, seed=None):
    """Sample two Gaussian blobs and keep only points a hyperplane can separate.

    Class 0 is drawn around the origin and class 1 around the all-ones point,
    both with covariance ``0.5 * I``. The separating hyperplane is the
    perpendicular bisector of the segment joining the two means. A sample is
    kept only when it lies on its own class's side of that hyperplane and
    strictly farther than ``distance_threshold`` from it, so the returned
    classes are linearly separable with a margin of at least
    ``2 * distance_threshold``.

    Parameters
    ----------
    num_samples : int
        Number of points drawn per class *before* filtering; fewer points are
        usually returned.
    dim : int
        Number of features; must be at least 1.
    distance_threshold : float
        Minimum distance a kept point must have from the separating
        hyperplane; must be non-negative.
    seed : int or None, optional
        When given, samples come from a private ``np.random.RandomState(seed)``
        so the call is reproducible and NumPy's global random state is left
        untouched. When ``None`` (default) the global ``np.random`` state is
        used.

    Returns
    -------
    X : ndarray of shape (n_kept, dim)
        The surviving samples.
    y : ndarray of shape (n_kept,)
        Labels aligned row-for-row with ``X``: ``0.0`` for the blob at the
        origin, ``1.0`` for the blob at the all-ones point.

    Raises
    ------
    ValueError
        If ``dim < 1`` or ``distance_threshold < 0``.
    """
    if dim < 1:
        raise ValueError("dim must be at least 1")
    if distance_threshold < 0:
        raise ValueError("distance_threshold must be non-negative")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # Mean of the two distributions
    mean1 = np.zeros(dim)
    mean2 = np.ones(dim)

    # Shared covariance: scaled identity matrix
    cov = np.eye(dim) * 0.5

    # Generate samples
    X1 = rng.multivariate_normal(mean1, cov, num_samples)
    X2 = rng.multivariate_normal(mean2, cov, num_samples)

    # Combine the samples
    X = np.vstack((X1, X2))
    y = np.hstack((np.zeros(num_samples), np.ones(num_samples)))

    # Midpoint between the means: a point on the separating hyperplane
    midpoint = (mean1 + mean2) / 2

    # Unit normal of the hyperplane, pointing from class 0 towards class 1
    v_hat = (mean2 - mean1) / np.linalg.norm(mean2 - mean1)

    # Signed distance of every sample to the hyperplane (positive on the
    # class-1 side). Separability depends on this quantity, not on how far a
    # point is from the line joining the means.
    signed_distance = (X - midpoint) @ v_hat

    # Flip the sign for class 0 so "own side, beyond the margin" is one test.
    own_side_distance = np.where(y == 0, -signed_distance, signed_distance)
    keep = own_side_distance > distance_threshold

    # Apply the same mask to both arrays so labels stay aligned with samples.
    return X[keep], y[keep]

# Draw 100 candidate points per class in 2-D, using a distance threshold of 0.5
X, y = generate_linearly_separable_data(num_samples=100, dim=2, distance_threshold=0.5)

# One scatter series per class so each gets its own colour and legend entry
for class_value, class_colour, class_label in ((0, 'red', 'Class 0'), (1, 'blue', 'Class 1')):
    class_points = X[y == class_value]
    plt.scatter(class_points[:, 0], class_points[:, 1], color=class_colour, label=class_label)

plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.title('Linearly Separable Data')
plt.legend()
plt.show()


In [None]:
# Scratch check of the projection arithmetic on four hand-picked 2-D points.
points = np.array([[1, 1], [1, 2], [3, 2], [3, 3]])

# Centroid of the points as a column vector, shape (2, 1).
mid = np.mean(points, axis=0).reshape(-1, 1)
# Unit-length row vector, shape (1, 2), pointing from the origin towards the
# centroid. Despite its name it is parallel to `mid`, not orthogonal to it.
ortho_mid = mid.T / np.linalg.norm(mid)

print(ortho_mid)

# NOTE(review): `dif_mid` was plotted below without ever being defined, which
# raises NameError on a fresh kernel. It is assumed here to mean the points
# re-centred on the centroid - confirm this is the intended quantity.
dif_mid = points - mid.T

# Orthogonal projection of each point onto the line through the origin along
# `ortho_mid`: (p . u) u, computed for all rows at once -> shape (4, 2).
projects = np.dot(points, ortho_mid.T).dot(ortho_mid)

# Labels are attached to each series so the legend cannot drift out of sync
# with what is actually plotted.
plt.scatter(points[:, 0], points[:, 1], label='points')
plt.scatter(mid[0], mid[1], label='mid')
plt.scatter(dif_mid[:, 0], dif_mid[:, 1], label='dif_mid')
plt.scatter(projects[:, 0], projects[:, 1], label='projects')
plt.legend()
plt.show()