In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
# Example to load real data: iris = load_iris(); X = iris.data; y = iris.target

In [2]:
# Supervised: Linear Regression (from scratch)
# Predict a number using a straight line
class LinearRegressionEasy:
    """Linear regression trained with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``. Each training step moves
    ``weights`` and ``bias`` against the gradient of half the mean squared
    error over the whole training set.

    Parameters
    ----------
    learn_rate : float, default=0.01
        Step size applied to every gradient update.
    steps : int, default=1000
        Number of gradient updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        One coefficient per feature; ``None`` until ``fit`` has run.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has run.
    """

    def __init__(self, learn_rate=0.01, steps=1000):
        self.learn_rate = learn_rate  # How fast to learn
        self.steps = steps  # How many times to update
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Lists are accepted and converted to float arrays.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Target values. A column vector is flattened so that the residual
            stays one-dimensional instead of broadcasting to (n, n).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            target per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} targets"
            )

        self.weights = np.zeros(n_features)  # Start with zero weights
        self.bias = 0.0  # Start with zero bias

        # Update weights to reduce error
        for _ in range(self.steps):
            y_pred = np.dot(X, self.weights) + self.bias  # Predict
            error = y_pred - y
            # X.T @ error / n is the gradient with respect to the weights,
            # mean(error) is the gradient with respect to the bias.
            self.weights -= self.learn_rate * np.dot(X.T, error) / n_samples
            self.bias -= self.learn_rate * np.sum(error) / n_samples

    def predict(self, X):
        """Return ``X @ weights + bias`` for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
        """
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias  # Predict new data

# Test it
# Toy data lying exactly on the line y = 2 * x, so the ideal answer for x = 6 is 12.
X = np.array([[1], [2], [3], [4], [5]])  # House sizes
y = np.array([2, 4, 6, 8, 10])  # House prices
model = LinearRegressionEasy()  # defaults: learn_rate=0.01, steps=1000
model.fit(X, y)
# Training stops after a fixed number of steps, so the printed value is close to 12 rather than exact.
print("Prediction for size 6:", model.predict(np.array([[6]])))  # Should be ~12

Prediction for size 6: [11.93728249]


In [3]:
# Supervised: Linear Regression (library imported):
from sklearn.linear_model import LinearRegression

# Same data as the from-scratch cell (y = 2 * x), so both results can be compared directly.
X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 4, 6, 8, 10])
model = LinearRegression()
model.fit(X, y)
# The recorded output for this cell is 12., versus ~11.94 from the gradient-descent version above.
print("Prediction for size 6:", model.predict(np.array([[6]])))

Prediction for size 6: [12.]


In [4]:
# Supervised: Logistic Regression
# From Scratch

# Predict yes/no (0 or 1)
class LogisticRegressionEasy:
    """Binary logistic regression trained with full-batch gradient descent.

    The model computes ``p = sigmoid(X @ weights + bias)`` and each training
    step moves the parameters against the gradient of the mean log-loss.

    Parameters
    ----------
    learn_rate : float, default=0.01
        Step size applied to every gradient update.
    steps : int, default=1000
        Number of gradient updates performed by ``fit``.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        One coefficient per feature; ``None`` until ``fit`` has run.
    bias : float or None
        Intercept term; ``None`` until ``fit`` has run.
    """

    def __init__(self, learn_rate=0.01, steps=1000):
        self.learn_rate = learn_rate
        self.steps = steps
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        """Map scores to probabilities in [0, 1] without overflowing.

        The textbook form ``1 / (1 + exp(-z))`` overflows inside ``exp`` for
        large negative ``z``. Here ``exp`` is only ever evaluated on a
        non-positive argument, so its result stays in [0, 1].
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in [0, 1]
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- same function, safe branch.
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Learn ``weights`` and ``bias`` from labelled data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. Lists are accepted and converted to float arrays.
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Binary labels (0 or 1). A column vector is flattened.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            label per row of ``X``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        # Update weights to predict better
        for _ in range(self.steps):
            score = np.dot(X, self.weights) + self.bias
            y_pred = self.sigmoid(score)
            error = y_pred - y  # predicted probability minus true label
            self.weights -= self.learn_rate * np.dot(X.T, error) / n_samples
            self.bias -= self.learn_rate * np.sum(error) / n_samples

    def predict(self, X):
        """Return hard 0/1 labels for each row of ``X``.

        A row is labelled 1 when its predicted probability is strictly greater
        than 0.5, otherwise 0.
        """
        score = np.dot(np.asarray(X, dtype=float), self.weights) + self.bias
        return np.where(self.sigmoid(score) > 0.5, 1, 0)  # 1 if prob > 0.5, else 0

# Test it
# One feature, four samples: the two smaller values are class 0, the two larger are class 1.
X = np.array([[0.5], [1.5], [2.5], [3.5]])  # Feature (e.g., email length)
y = np.array([0, 0, 1, 1])  # Labels (0=not spam, 1=spam)
model = LogisticRegressionEasy()  # defaults: learn_rate=0.01, steps=1000
model.fit(X, y)
# The query value 2 lies midway between the largest class-0 sample (1.5) and the smallest class-1 sample (2.5).
print("Prediction for length 2:", model.predict(np.array([[2]])))  # 0 or 1

Prediction for length 2: [1]


In [5]:
# Logistic regression using the scikit-learn library:

from sklearn.linear_model import LogisticRegression

# Same data and query point as the from-scratch cell, so the two outputs can be compared.
X = np.array([[0.5], [1.5], [2.5], [3.5]])
y = np.array([0, 0, 1, 1])
model = LogisticRegression()
model.fit(X, y)
print("Prediction for length 2:", model.predict(np.array([[2]])))

Prediction for length 2: [1]


In [6]:
#Supervised: K-Nearest Neighbors (KNN)
#From Scratch

# Predict by looking at closest points
class KNNEasy:
    """k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` only stores the training data. ``predict`` labels each query row
    with the most frequent label among its ``k`` closest training rows.

    Parameters
    ----------
    k : int, default=3
        Number of neighbours that vote on each prediction.

    Attributes
    ----------
    X_train : numpy.ndarray of shape (n_samples, n_features)
        Stored training features (set by ``fit``).
    y_train : numpy.ndarray of shape (n_samples,)
        Stored training labels (set by ``fit``).
    """

    def __init__(self, k=3):
        self.k = k  # Number of neighbors

    @staticmethod
    def _as_rows(X):
        """Convert ``X`` to an array; a 1-D input becomes one feature per row."""
        X = np.asarray(X)
        return X.reshape(-1, 1) if X.ndim == 1 else X

    def fit(self, X, y):
        """Store the training set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Labels of any type that ``numpy.unique`` can sort.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = self._as_rows(X)
        y = np.asarray(y).ravel()
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )
        self.X_train = X  # Store data
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Ties are resolved deterministically: equally distant neighbours are
        taken in training order (stable sort), and when several labels share
        the top vote count the smallest label in sorted order wins.

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
        """
        predictions = []
        for x in self._as_rows(X):
            # Distance from this query to every training row in one vectorised step
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            # Get K closest points
            closest = np.argsort(distances, kind="stable")[:self.k]
            # Pick most common label; np.unique returns labels sorted, and
            # argmax returns the first maximum, so ties go to the smallest label.
            labels, votes = np.unique(self.y_train[closest], return_counts=True)
            predictions.append(labels[np.argmax(votes)])
        return np.array(predictions)

# Test it
# Four points on the diagonal: the two nearest the origin are class 0, the other two class 1.
X = np.array([[1,1], [2,2], [3,3], [4,4]])  # Features (size, weight)
y = np.array([0, 0, 1, 1])  # Labels (0=apple, 1=orange)
model = KNNEasy(k=2)
model.fit(X, y)
# The query is equidistant from [2,2] (label 0) and [3,3] (label 1), so with k=2 the vote is a 1-1 tie.
print("Prediction for [2.5,2.5]:", model.predict(np.array([[2.5,2.5]])))  # 0 or 1

Prediction for [2.5,2.5]: [0]


In [7]:
# K-nearest neighbours using the scikit-learn library:
from sklearn.neighbors import KNeighborsClassifier

# Same data, k and query point as the from-scratch cell, so the two outputs can be compared.
X = np.array([[1,1], [2,2], [3,3], [4,4]])
y = np.array([0, 0, 1, 1])
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X, y)
print("Prediction for [2.5,2.5]:", model.predict(np.array([[2.5,2.5]])))

Prediction for [2.5,2.5]: [0]


In [8]:
# Supervised: Decision Trees
# From Scratch

# Note: Simplified! Full version needs complex split logic
class DecisionTreeEasy:
    """Deliberately simplified decision-tree classifier.

    Every internal node splits on feature 0 at the mean of that feature over
    the samples reaching the node; no split-quality criterion is evaluated.
    Leaves hold the most common label of the samples that reached them.

    Parameters
    ----------
    max_depth : int, default=2
        Maximum number of splits along any root-to-leaf path.

    Attributes
    ----------
    tree : tuple or label, or None
        Either a leaf label, or a nested ``(feature, threshold, left, right)``
        tuple; ``None`` until ``fit`` has run.
    """

    def __init__(self, max_depth=2):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Non-negative integer class labels (required by ``numpy.bincount``).

        Raises
        ------
        ValueError
            If the training set is empty or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("Cannot fit a tree on an empty training set")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )
        self.tree = self._build_tree(X, y, depth=0)

    @staticmethod
    def _majority(y):
        """Return the most common label in ``y`` (smallest label on ties)."""
        return np.bincount(y).argmax()

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``."""
        # Stop if all labels same or max depth reached
        if len(np.unique(y)) == 1 or depth >= self.max_depth:
            return self._majority(y)  # Most common label

        # Dummy split (first feature, average value)
        feature = 0
        threshold = np.mean(X[:, feature])
        left_idx = X[:, feature] < threshold
        right_idx = ~left_idx

        # When every sample falls on one side (e.g. all feature values are
        # equal) the split separates nothing; recursing would hand an empty
        # label array to bincount().argmax() and raise. Make a leaf instead.
        if not left_idx.any() or not right_idx.any():
            return self._majority(y)

        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)
        return (feature, threshold, left, right)

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        return [self._predict(x, self.tree) for x in X]

    def _predict(self, x, tree):
        """Walk ``tree`` for one sample ``x`` and return the leaf label reached."""
        if not isinstance(tree, tuple):
            return tree
        feature, threshold, left, right = tree
        # Strict '<' goes left, so a value equal to the threshold goes right.
        return self._predict(x, left if x[feature] < threshold else right)

# Test it
# Same four diagonal points as the KNN example.
X = np.array([[1,1], [2,2], [3,3], [4,4]])
y = np.array([0, 0, 1, 1])
model = DecisionTreeEasy(max_depth=2)
model.fit(X, y)
# The root threshold is the mean of feature 0, i.e. 2.5 -- exactly the query value.
# The tree sends x < threshold left, so 2.5 goes right and is labelled 1.
# predict returns a plain list, which is why the recorded output shows [np.int64(1)].
print("Prediction for [2.5,2.5]:", model.predict(np.array([[2.5,2.5]])))

Prediction for [2.5,2.5]: [np.int64(1)]


In [9]:
# Decision tree using the scikit-learn library
from sklearn.tree import DecisionTreeClassifier

# Same data, depth limit and query point as the from-scratch cell.
X = np.array([[1,1], [2,2], [3,3], [4,4]])
y = np.array([0, 0, 1, 1])
model = DecisionTreeClassifier(max_depth=2)
model.fit(X, y)
# The recorded output here is [0], whereas the from-scratch DecisionTreeEasy cell printed 1 for the same point.
print("Prediction for [2.5,2.5]:", model.predict(np.array([[2.5,2.5]])))

Prediction for [2.5,2.5]: [0]


In [10]:
# Unsupervised: K-Means Clustering
# From Scratch

# Group data into clusters
class KMeansEasy:
    """K-means clustering with random initial centroids (Lloyd's algorithm).

    Parameters
    ----------
    n_clusters : int, default=2
        Number of clusters to form.
    max_steps : int, default=100
        Upper bound on assign/update iterations.
    random_state : int or None, default=None
        Seed for choosing the initial centroids. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state; any other value
        makes ``fit`` reproducible.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (n_clusters, n_features) or None
        Cluster centres; ``None`` until ``fit`` has run.
    """

    def __init__(self, n_clusters=2, max_steps=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_steps = max_steps
        self.random_state = random_state
        self.centroids = None

    def _assign(self, X):
        """Return, for each row of ``X``, the index of the nearest centroid."""
        # Broadcast to (n_samples, n_clusters, n_features). Squared distance is
        # enough because the square root does not change which one is smallest.
        diffs = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        return np.argmin(np.sum(diffs ** 2, axis=2), axis=1)

    def fit(self, X):
        """Cluster ``X`` and return the cluster index of every sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Cluster index of each sample, computed from the final centroids.
            Cluster numbering is arbitrary.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``n_clusters`` is not between 1 and the
            number of samples.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features), got shape {X.shape}"
            )
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                f"n_clusters={self.n_clusters} must be between 1 and n_samples={n_samples}"
            )

        # Start with random centroids: distinct rows of X chosen without replacement.
        rng = np.random if self.random_state is None else np.random.default_rng(self.random_state)
        self.centroids = X[rng.choice(n_samples, self.n_clusters, replace=False)]

        for _ in range(self.max_steps):
            # Assign points to nearest centroid
            labels = self._assign(X)

            # Update centroids; a cluster that lost all its points keeps its old centre
            new_centroids = np.array([
                X[labels == i].mean(axis=0) if np.any(labels == i) else self.centroids[i]
                for i in range(self.n_clusters)
            ])

            # Stop if centroids don't move
            if np.array_equal(self.centroids, new_centroids):
                break
            self.centroids = new_centroids

        # Recompute from the final centroids so the returned labels always match
        # self.centroids, including when max_steps is 0 or is exhausted.
        return self._assign(X)

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``."""
        return self._assign(np.asarray(X, dtype=float))

# Test it
# Two well-separated groups: points near (1.5, 1.5) and points near (10.5, 10.5).
X = np.array([[1,1], [2,2], [10,10], [11,11]])  # Data points
model = KMeansEasy(n_clusters=2)
labels = model.fit(X)  # fit returns the cluster index of every sample
# Initial centroids are picked at random, so which group is called 0 and which 1 can vary between runs.
print("Cluster labels:", labels)  # e.g., [0, 0, 1, 1]

Cluster labels: [1 1 0 0]


In [11]:
# K-means using the scikit-learn library
from sklearn.cluster import KMeans

# Same data as the from-scratch cell.
X = np.array([[1,1], [2,2], [10,10], [11,11]])
# NOTE(review): no random_state is passed, so the cluster numbering is presumably not
# guaranteed to be the same on every run -- confirm against the KMeans documentation.
model = KMeans(n_clusters=2, n_init=10)
labels = model.fit_predict(X)
print("Cluster labels:", labels)

Cluster labels: [0 0 1 1]


In [12]:
# Unsupervised: Principal Component Analysis (PCA)
# From Scratch

# Reduce data dimensions
class PCAEasy:
    """Principal component analysis via eigen-decomposition of the covariance.

    Parameters
    ----------
    n_components : int, default=2
        Number of leading principal directions to keep.

    Attributes
    ----------
    components : numpy.ndarray of shape (n_features, n_components) or None
        Principal directions stored as columns, ordered by decreasing
        variance; ``None`` until ``fit`` has run.
    mean : numpy.ndarray of shape (n_features,) or None
        Per-feature mean of the training data; ``None`` until ``fit`` has run.
    """

    def __init__(self, n_components=2):
        self.n_components = n_components
        self.components = None
        self.mean = None

    def fit(self, X):
        """Compute the mean and the top principal directions of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        """
        X = np.asarray(X, dtype=float)
        # Center data
        self.mean = np.mean(X, axis=0)
        X_centered = X - self.mean
        # Calculate covariance (divides by n; the divisor scales the
        # eigenvalues only, not the directions)
        cov = np.dot(X_centered.T, X_centered) / X.shape[0]
        # Get top directions. The covariance matrix is symmetric, so use eigh:
        # it returns real eigenvalues and orthonormal eigenvectors, whereas the
        # general-purpose eig can return complex values from rounding error.
        eigenvalues, eigenvectors = np.linalg.eigh(cov)
        idx = np.argsort(eigenvalues)[::-1]  # largest variance first
        components = eigenvectors[:, idx[:self.n_components]]

        # An eigenvector's sign is arbitrary. Fix it so that the entry with the
        # largest magnitude in each column is positive, making results repeatable.
        pivot_rows = np.argmax(np.abs(components), axis=0)
        signs = np.sign(components[pivot_rows, np.arange(components.shape[1])])
        signs[signs == 0] = 1.0
        self.components = components * signs

    def transform(self, X):
        """Project ``X`` onto the fitted principal directions.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_components)
        """
        X_centered = np.asarray(X, dtype=float) - self.mean
        return np.dot(X_centered, self.components)  # Project to new space

# Test it
# Each row is the previous row plus [3, 3, 3], so after centering all rows lie on a single line.
X = np.array([[1,2,3], [4,5,6], [7,8,9]])  # 3D data
pca = PCAEasy(n_components=2)
pca.fit(X)
reduced = pca.transform(X)
# Because the data is effectively one-dimensional, the second column is zero up to
# floating-point rounding (values around 1e-16 in the recorded output).
print("Reduced data:", reduced)

Reduced data: [[-5.19615242e+00 -3.33066907e-16]
 [ 0.00000000e+00  0.00000000e+00]
 [ 5.19615242e+00  3.33066907e-16]]


In [13]:
# PCA using the scikit-learn library

from sklearn.decomposition import PCA

# Same data and number of components as the from-scratch cell.
X = np.array([[1,2,3], [4,5,6], [7,8,9]])
pca = PCA(n_components=2)
reduced = pca.fit_transform(X)  # fit and project in one call
# The first column of the recorded output matches the from-scratch result;
# the second column is rounding noise around 1e-16 in both.
print("Reduced data:", reduced)

Reduced data: [[-5.19615242e+00 -2.56395025e-16]
 [ 0.00000000e+00 -0.00000000e+00]
 [ 5.19615242e+00 -2.56395025e-16]]
