In [10]:
'''
K-Nearest Neighbors (KNN)

K-Means Clustering

Linear Regression

Logistic Regression


'''

class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X, y):
        self.X_train = X  # list of lists
        self.y_train = y  # list

    def euclidean_distance(self, a, b):
        total = 0
        for i in range(len(a)):
            total += (a[i] - b[i]) ** 2
        return total ** 0.5

    def predict(self, X):
        predictions = []
        for x in X:
            distances = []
            for i in range(len(self.X_train)):
                dist = self.euclidean_distance(x, self.X_train[i])
                distances.append((dist, self.y_train[i]))
            distances.sort()
            k_nearest = [label for _, label in distances[:self.k]]

            count = {}
            for label in k_nearest:
                if label not in count:
                    count[label] = 0
                count[label] += 1

            best_label = max(count, key=count.get)
            predictions.append(best_label)
        return predictions


In [6]:
l1 = [[2, 2]]
l2 = [0]
knn = KNN()
print(knn.fit(l1,l2))

None


In [7]:
import random

class KMeans:
    def __init__(self, k=2, max_iters=100):
        self.k = k
        self.max_iters = max_iters

    def fit(self, X):
        self.centroids = random.sample(X, self.k)

        for _ in range(self.max_iters):
            clusters = [[] for _ in range(self.k)]
            for point in X:
                distances = []
                for centroid in self.centroids:
                    dist = sum((point[i] - centroid[i]) ** 2 for i in range(len(point))) ** 0.5
                    distances.append(dist)
                cluster_index = distances.index(min(distances))
                clusters[cluster_index].append(point)

            new_centroids = []
            for cluster in clusters:
                if not cluster:
                    new_centroids.append(random.choice(X))
                    continue
                mean = []
                for i in range(len(cluster[0])):
                    col_sum = sum(point[i] for point in cluster)
                    mean.append(col_sum / len(cluster))
                new_centroids.append(mean)

            if new_centroids == self.centroids:
                break
            self.centroids = new_centroids

    def predict(self, X):
        predictions = []
        for point in X:
            distances = []
            for centroid in self.centroids:
                dist = sum((point[i] - centroid[i]) ** 2 for i in range(len(point))) ** 0.5
                distances.append(dist)
            predictions.append(distances.index(min(distances)))
        return predictions


In [8]:
class LinearRegression:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def fit(self, X, y):
        self.n = len(X[0])
        self.w = [0.0] * self.n
        self.b = 0.0

        for _ in range(self.epochs):
            dw = [0.0] * self.n
            db = 0.0
            for i in range(len(X)):
                y_pred = sum(self.w[j] * X[i][j] for j in range(self.n)) + self.b
                error = y_pred - y[i]
                for j in range(self.n):
                    dw[j] += error * X[i][j]
                db += error

            for j in range(self.n):
                self.w[j] -= self.lr * dw[j] / len(X)
            self.b -= self.lr * db / len(X)

    def predict(self, X):
        predictions = []
        for x in X:
            pred = sum(self.w[j] * x[j] for j in range(self.n)) + self.b
            predictions.append(pred)
        return predictions


In [9]:
import math

class LogisticRegression:
    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs

    def sigmoid(self, z):
        return 1 / (1 + math.exp(-z))

    def fit(self, X, y):
        self.n = len(X[0])
        self.w = [0.0] * self.n
        self.b = 0.0

        for _ in range(self.epochs):
            dw = [0.0] * self.n
            db = 0.0
            for i in range(len(X)):
                z = sum(self.w[j] * X[i][j] for j in range(self.n)) + self.b
                pred = self.sigmoid(z)
                error = pred - y[i]
                for j in range(self.n):
                    dw[j] += error * X[i][j]
                db += error

            for j in range(self.n):
                self.w[j] -= self.lr * dw[j] / len(X)
            self.b -= self.lr * db / len(X)

    def predict(self, X):
        predictions = []
        for x in X:
            z = sum(self.w[j] * x[j] for j in range(self.n)) + self.b
            pred = self.sigmoid(z)
            predictions.append(1 if pred >= 0.5 else 0)
        return predictions


In [11]:
###############################

In [12]:
# Decision Tree
class DecisionTreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value


class DecisionTree:
    def __init__(self, max_depth=3):
        self.max_depth = max_depth

    def gini(self, groups, classes):
        n_instances = sum(len(group) for group in groups)
        gini = 0.0
        for group in groups:
            size = len(group)
            if size == 0:
                continue
            score = 0.0
            labels = [row[-1] for row in group]
            for class_val in classes:
                proportion = labels.count(class_val) / size
                score += proportion ** 2
            gini += (1 - score) * (size / n_instances)
        return gini

    def split(self, index, value, dataset):
        left, right = [], []
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def best_split(self, dataset):
        class_values = list(set(row[-1] for row in dataset))
        best_index, best_value, best_score, best_groups = 999, 999, 999, None
        for index in range(len(dataset[0]) - 1):
            for row in dataset:
                groups = self.split(index, row[index], dataset)
                gini = self.gini(groups, class_values)
                if gini < best_score:
                    best_index, best_value, best_score, best_groups = index, row[index], gini, groups
        return {'index': best_index, 'value': best_value, 'groups': best_groups}

    def to_terminal(self, group):
        outcomes = [row[-1] for row in group]
        return max(set(outcomes), key=outcomes.count)

    def build_tree(self, dataset, depth):
        split = self.best_split(dataset)
        left, right = split['groups']
        node = DecisionTreeNode(feature=split['index'], threshold=split['value'])
        if depth >= self.max_depth or not left or not right:
            node.value = self.to_terminal(left + right)
            return node
        node.left = self.build_tree(left, depth + 1)
        node.right = self.build_tree(right, depth + 1)
        return node

    def fit(self, X, y):
        dataset = [X[i] + [y[i]] for i in range(len(X))]
        self.root = self.build_tree(dataset, 1)

    def _predict(self, node, row):
        if node.value is not None:
            return node.value
        if row[node.feature] < node.threshold:
            return self._predict(node.left, row)
        else:
            return self._predict(node.right, row)

    def predict(self, X):
        return [self._predict(self.root, row) for row in X]


# Naive Bayes for Gaussian distributions
import math

class NaiveBayes:
    def fit(self, X, y):
        self.classes = list(set(y))
        self.summaries = {}
        for c in self.classes:
            features = [X[i] for i in range(len(X)) if y[i] == c]
            summary = []
            for i in range(len(X[0])):
                col = [x[i] for x in features]
                mean = sum(col) / len(col)
                variance = sum((x - mean) ** 2 for x in col) / len(col)
                summary.append((mean, variance))
            self.summaries[c] = summary

    def gaussian(self, x, mean, var):
        if var == 0:
            return 1.0 if x == mean else 0.0
        exponent = math.exp(-(x - mean) ** 2 / (2 * var))
        return (1 / math.sqrt(2 * math.pi * var)) * exponent

    def predict(self, X):
        predictions = []
        for row in X:
            probs = {}
            for c in self.classes:
                prob = 1
                for i in range(len(row)):
                    mean, var = self.summaries[c][i]
                    prob *= self.gaussian(row[i], mean, var)
                probs[c] = prob
            predictions.append(max(probs, key=probs.get))
        return predictions


# Run test cases for Decision Tree and Naive Bayes

# Decision Tree test (binary feature)
X_tree = [[2.7], [1.0], [3.0], [1.5]]
y_tree = [0, 0, 1, 1]
model_tree = DecisionTree(max_depth=2)
model_tree.fit(X_tree, y_tree)
tree_result = model_tree.predict([[2.0], [3.1]])

# Naive Bayes test (Gaussian)
X_nb = [[1.0, 20.0], [2.0, 21.0], [3.0, 22.0], [4.0, 23.0]]
y_nb = [0, 0, 1, 1]
model_nb = NaiveBayes()
model_nb.fit(X_nb, y_nb)
nb_result = model_nb.predict([[1.5, 20.5], [3.5, 22.5]])

(tree_result, nb_result)

([0, 1], [0, 1])

In [13]:
# SVM with basic hard-margin using gradient descent for 2D data
class SimpleSVM:
    def __init__(self, lr=0.01, epochs=1000, lambda_param=0.01):
        self.lr = lr
        self.epochs = epochs
        self.lambda_param = lambda_param

    def fit(self, X, y):
        n_features = len(X[0])
        self.w = [0.0] * n_features
        self.b = 0.0

        for _ in range(self.epochs):
            for i in range(len(X)):
                x_i = X[i]
                y_i = 1 if y[i] == 1 else -1
                condition = y_i * (sum(self.w[j] * x_i[j] for j in range(n_features)) + self.b) >= 1
                if condition:
                    for j in range(n_features):
                        self.w[j] -= self.lr * (2 * self.lambda_param * self.w[j])
                else:
                    for j in range(n_features):
                        self.w[j] -= self.lr * (2 * self.lambda_param * self.w[j] - y_i * x_i[j])
                    self.b += self.lr * y_i

    def predict(self, X):
        return [1 if sum(self.w[j] * x[j] for j in range(len(x))) + self.b >= 0 else 0 for x in X]


# PCA using eigen decomposition (manual matrix operations for small data)
def transpose(matrix):
    return list(map(list, zip(*matrix)))

def mean_vector(X):
    return [sum(col) / len(col) for col in zip(*X)]

def subtract_mean(X, mean_vec):
    return [[x[i] - mean_vec[i] for i in range(len(x))] for x in X]

def cov_matrix(X):
    n = len(X)
    X_T = transpose(X)
    return [[sum(X_T[i][k] * X_T[j][k] for k in range(n)) / (n - 1)
             for j in range(len(X_T))] for i in range(len(X_T))]

def eigen_decomp_2x2(matrix):
    # Only for 2x2 matrices
    a, b = matrix[0][0], matrix[0][1]
    c, d = matrix[1][0], matrix[1][1]
    trace = a + d
    det = a * d - b * c
    lambda1 = trace / 2 + ((trace**2 - 4 * det) ** 0.5) / 2
    lambda2 = trace / 2 - ((trace**2 - 4 * det) ** 0.5) / 2
    # eigenvectors (not normalized)
    v1 = [b, lambda1 - a] if b != 0 else [1, 0]
    v2 = [b, lambda2 - a] if b != 0 else [0, 1]
    return [(lambda1, v1), (lambda2, v2)]

class SimplePCA:
    def __init__(self, n_components=1):
        self.n_components = n_components

    def fit_transform(self, X):
        self.mean_vec = mean_vector(X)
        centered = subtract_mean(X, self.mean_vec)
        cov = cov_matrix(centered)
        eigs = eigen_decomp_2x2(cov)
        eigs.sort(key=lambda x: -x[0])  # Sort by eigenvalue descending
        self.components = [vec for val, vec in eigs[:self.n_components]]
        return [[sum(row[j] * self.components[0][j] for j in range(len(row)))] for row in centered]


# Run test cases
# SVM test
X_svm = [[1, 2], [2, 3], [3, 3], [2, 1], [3, 2]]
y_svm = [0, 0, 0, 1, 1]
model_svm = SimpleSVM()
model_svm.fit(X_svm, y_svm)
svm_result = model_svm.predict([[2, 2], [1, 1]])

# PCA test
X_pca = [[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0]]
model_pca = SimplePCA(n_components=1)
pca_result = model_pca.fit_transform(X_pca)
pca_result_flat = [round(val[0], 2) for val in pca_result]  # flatten and round for readability

(svm_result, pca_result_flat)

([0, 1], [0.51, -2.52, 0.66, -0.15, 1.5])

In [14]:
class SelfAttention:
    def __init__(self, embed_dim):
        self.embed_dim = embed_dim
        self.W_q = [[0.1 for _ in range(embed_dim)] for _ in range(embed_dim)]
        self.W_k = [[0.1 for _ in range(embed_dim)] for _ in range(embed_dim)]
        self.W_v = [[0.1 for _ in range(embed_dim)] for _ in range(embed_dim)]

    def dot(self, a, b):
        return sum(x * y for x, y in zip(a, b))

    def matvec(self, mat, vec):
        return [self.dot(row, vec) for row in mat]

    def softmax(self, x):
        max_x = max(x)
        exps = [pow(2.718, i - max_x) for i in x]
        sum_exps = sum(exps)
        return [j / sum_exps for j in exps]

    def attention(self, q, k, v):
        scores = [self.dot(q, ki) for ki in k]
        weights = self.softmax(scores)
        output = [0.0 for _ in v[0]]
        for i in range(len(v)):
            for j in range(len(v[0])):
                output[j] += weights[i] * v[i][j]
        return output

    def forward(self, inputs):
        Q = [self.matvec(self.W_q, x) for x in inputs]
        K = [self.matvec(self.W_k, x) for x in inputs]
        V = [self.matvec(self.W_v, x) for x in inputs]
        return [self.attention(q, K, V) for q in Q]


# Multi-head Attention (2 heads for simplicity)
class MultiHeadAttention:
    def __init__(self, embed_dim, num_heads):
        self.heads = [SelfAttention(embed_dim) for _ in range(num_heads)]
        self.num_heads = num_heads
        self.embed_dim = embed_dim

    def forward(self, inputs):
        head_outputs = [head.forward(inputs) for head in self.heads]
        combined = []
        for i in range(len(inputs)):
            merged = []
            for h in head_outputs:
                merged += h[i]
            combined.append(merged[:self.embed_dim])  # truncate for simplicity
        return combined


# Simplified Transformer Encoder block (Self-Attention + Add & Norm)
class TransformerEncoder:
    def __init__(self, embed_dim, num_heads):
        self.mha = MultiHeadAttention(embed_dim, num_heads)

    def add_and_norm(self, x, sublayer_out):
        return [[x[i][j] + sublayer_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]

    def forward(self, inputs):
        attention_out = self.mha.forward(inputs)
        return self.add_and_norm(inputs, attention_out)


# Input test data
input_data = [[1.0, 0.0, 1.0], [0.0, 2.0, 0.0], [1.0, 1.0, 1.0]]
sa = SelfAttention(embed_dim=3)
self_attn_result = sa.forward(input_data)

mha = MultiHeadAttention(embed_dim=3, num_heads=2)
mha_result = mha.forward(input_data)

encoder = TransformerEncoder(embed_dim=3, num_heads=2)
encoder_result = encoder.forward(input_data)

self_attn_result, mha_result, encoder_result

([[0.23467958578183762, 0.23467958578183762, 0.23467958578183762],
  [0.23467958578183762, 0.23467958578183762, 0.23467958578183762],
  [0.2353621866011892, 0.2353621866011892, 0.2353621866011892]],
 [[0.23467958578183762, 0.23467958578183762, 0.23467958578183762],
  [0.23467958578183762, 0.23467958578183762, 0.23467958578183762],
  [0.2353621866011892, 0.2353621866011892, 0.2353621866011892]],
 [[1.2346795857818376, 0.23467958578183762, 1.2346795857818376],
  [0.23467958578183762, 2.2346795857818376, 0.23467958578183762],
  [1.2353621866011892, 1.2353621866011892, 1.2353621866011892]])

In [15]:
class SelfAttention:
    def __init__(self, embed_dim):
        self.embed_dim = embed_dim
        self.W_q = [[0.1]*embed_dim for _ in range(embed_dim)]
        self.W_k = [[0.2]*embed_dim for _ in range(embed_dim)]
        self.W_v = [[0.3]*embed_dim for _ in range(embed_dim)]

    def dot(self, a, b):
        return sum(x * y for x, y in zip(a, b))

    def matvec(self, mat, vec):
        return [self.dot(row, vec) for row in mat]

    def softmax(self, x):
        max_x = max(x)
        exp = [pow(2.718, i - max_x) for i in x]
        total = sum(exp)
        return [i / total for i in exp]

    def attention(self, q, K, V):
        scores = [self.dot(q, k) for k in K]
        weights = self.softmax(scores)
        out = [0.0] * len(V[0])
        for i in range(len(weights)):
            for j in range(len(V[0])):
                out[j] += weights[i] * V[i][j]
        return out

    def forward(self, inputs):
        Q = [self.matvec(self.W_q, x) for x in inputs]
        K = [self.matvec(self.W_k, x) for x in inputs]
        V = [self.matvec(self.W_v, x) for x in inputs]
        return [self.attention(q, K, V) for q in Q]


In [16]:
class MultiHeadAttention:
    def __init__(self, embed_dim, num_heads):
        self.heads = [SelfAttention(embed_dim) for _ in range(num_heads)]

    def forward(self, inputs):
        head_outputs = [head.forward(inputs) for head in self.heads]
        combined = []
        for i in range(len(inputs)):
            concat = []
            for h in head_outputs:
                concat += h[i]
            combined.append(concat[:len(inputs[0])])  # truncate for simplicity
        return combined


In [17]:
class TransformerEncoder:
    def __init__(self, embed_dim, num_heads):
        self.mha = MultiHeadAttention(embed_dim, num_heads)

    def add_and_norm(self, x, sublayer_out):
        return [[x[i][j] + sublayer_out[i][j] for j in range(len(x[0]))] for i in range(len(x))]

    def forward(self, inputs):
        attn_output = self.mha.forward(inputs)
        return self.add_and_norm(inputs, attn_output)


In [18]:
input_data = [[1, 0, 1], [0, 2, 0], [1, 1, 1]]

sa = SelfAttention(embed_dim=3)
print("Self Attention:", sa.forward(input_data))

mha = MultiHeadAttention(embed_dim=3, num_heads=2)
print("Multi-Head Attention:", mha.forward(input_data))

encoder = TransformerEncoder(embed_dim=3, num_heads=2)
print("Transformer Encoder Output:", encoder.forward(input_data))


Self Attention: [[0.7081524235578396, 0.7081524235578396, 0.7081524235578396], [0.7081524235578396, 0.7081524235578396, 0.7081524235578396], [0.7123355014792846, 0.7123355014792846, 0.7123355014792846]]
Multi-Head Attention: [[0.7081524235578396, 0.7081524235578396, 0.7081524235578396], [0.7081524235578396, 0.7081524235578396, 0.7081524235578396], [0.7123355014792846, 0.7123355014792846, 0.7123355014792846]]
Transformer Encoder Output: [[1.7081524235578396, 0.7081524235578396, 1.7081524235578396], [0.7081524235578396, 2.7081524235578396, 0.7081524235578396], [1.7123355014792847, 1.7123355014792847, 1.7123355014792847]]


In [20]:
class SimpleCNN:
    def __init__(self, kernel_size=3):
        self.kernel_size = kernel_size
        # Initialize a simple kernel with fixed weights
        self.kernel = [0.2] * kernel_size

    def conv1d(self, x):
        n = len(x)
        k = self.kernel_size
        output = []
        for i in range(n - k + 1):
            s = 0
            for j in range(k):
                s += x[i + j] * self.kernel[j]
            output.append(s)
        return output

    def relu(self, x):
        return [max(0, i) for i in x]

    def max_pool(self, x, pool_size=2, stride=2):
        pooled = []
        for i in range(0, len(x) - pool_size + 1, stride):
            pooled.append(max(x[i:i + pool_size]))
        return pooled

    def forward(self, x):
        conv_out = self.conv1d(x)
        relu_out = self.relu(conv_out)
        pooled = self.max_pool(relu_out)
        return pooled
cnn = SimpleCNN()
input_signal = [1, 2, 3, 4, 5, 6]
print("CNN output:", cnn.forward(input_signal))


CNN output: [1.8, 3.0]


In [21]:
class SimpleRNN:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim
        # Random-ish initialization of weights
        self.W_xh = [[0.1 for _ in range(input_dim)] for _ in range(hidden_dim)]
        self.W_hh = [[0.1 for _ in range(hidden_dim)] for _ in range(hidden_dim)]
        self.b_h = [0.0 for _ in range(hidden_dim)]

    def dot(self, a, b):
        return sum(x * y for x, y in zip(a, b))

    def matvec(self, mat, vec):
        return [self.dot(row, vec) for row in mat]

    def tanh(self, x):
        # Approximate tanh with math.tanh or simple version
        import math
        return [math.tanh(i) for i in x]

    def forward(self, inputs):
        h = [0.0] * self.hidden_dim
        outputs = []
        for x in inputs:
            x_vec = [x] if not isinstance(x, list) else x
            h_next = self.matvec(self.W_xh, x_vec)
            h_rec = self.matvec(self.W_hh, h)
            h = self.tanh([h_next[i] + h_rec[i] + self.b_h[i] for i in range(self.hidden_dim)])
            outputs.append(h)
        return outputs


In [22]:
rnn = SimpleRNN(input_dim=1, hidden_dim=2)
seq = [[1], [2], [3]]
print("RNN output:", rnn.forward(seq))


RNN output: [[0.09966799462495582, 0.09966799462495582], [0.21645477239531144, 0.21645477239531144], [0.33041224905058686, 0.33041224905058686]]


In [23]:
# Statistical Functions from Scratch

def mean(data):
    return sum(data) / len(data)

def variance(data):
    m = mean(data)
    return sum((x - m) ** 2 for x in data) / (len(data) - 1)  # sample variance

def std_dev(data):
    return variance(data) ** 0.5

def covariance(x, y):
    mean_x = mean(x)
    mean_y = mean(y)
    n = len(x)
    return sum((x[i] - mean_x) * (y[i] - mean_y) for i in range(n)) / (n - 1)

def correlation(x, y):
    return covariance(x, y) / (std_dev(x) * std_dev(y))

def skewness(data):
    m = mean(data)
    s = std_dev(data)
    n = len(data)
    return (sum((x - m) ** 3 for x in data) / n) / (s ** 3)

def kurtosis(data):
    m = mean(data)
    s = std_dev(data)
    n = len(data)
    return (sum((x - m) ** 4 for x in data) / n) / (s ** 4) - 3

def z_scores(data):
    m = mean(data)
    s = std_dev(data)
    return [(x - m) / s for x in data]

def covariance_matrix(matrix):
    n = len(matrix)
    dim = len(matrix[0])
    means = [mean([matrix[i][j] for i in range(n)]) for j in range(dim)]
    cov_mat = [[0] * dim for _ in range(dim)]
    for i in range(dim):
        for j in range(dim):
            cov_mat[i][j] = sum((matrix[k][i] - means[i]) * (matrix[k][j] - means[j]) for k in range(n)) / (n - 1)
    return cov_mat

def confusion_matrix(y_true, y_pred, labels=None):
    if labels is None:
        labels = list(set(y_true + y_pred))
    matrix = [[0]*len(labels) for _ in labels]
    label_index = {label: idx for idx, label in enumerate(labels)}
    for t, p in zip(y_true, y_pred):
        matrix[label_index[t]][label_index[p]] += 1
    return matrix

def accuracy(y_true, y_pred):
    correct = sum(t == p for t, p in zip(y_true, y_pred))
    return correct / len(y_true)

def precision(y_true, y_pred, positive_label):
    tp = sum((t == positive_label and p == positive_label) for t, p in zip(y_true, y_pred))
    fp = sum((t != positive_label and p == positive_label) for t, p in zip(y_true, y_pred))
    return tp / (tp + fp) if tp + fp > 0 else 0

def recall(y_true, y_pred, positive_label):
    tp = sum((t == positive_label and p == positive_label) for t, p in zip(y_true, y_pred))
    fn = sum((t == positive_label and p != positive_label) for t, p in zip(y_true, y_pred))
    return tp / (tp + fn) if tp + fn > 0 else 0

def f1_score(y_true, y_pred, positive_label):
    p = precision(y_true, y_pred, positive_label)
    r = recall(y_true, y_pred, positive_label)
    return 2 * p * r / (p + r) if p + r > 0 else 0

def bias_variance_decomposition(predictions, true_values):
    n = len(true_values)
    mean_pred = mean(predictions)
    bias_sq = (mean_pred - mean(true_values)) ** 2
    variance_pred = variance(predictions)
    noise = variance(true_values)  # Assuming noise = variance of true_values
    return bias_sq, variance_pred, noise

def cross_validation_split(data, k):
    n = len(data)
    fold_size = n // k
    return [data[i*fold_size:(i+1)*fold_size] for i in range(k)]

def mean_squared_error(y_true, y_pred):
    n = len(y_true)
    return sum((y_true[i] - y_pred[i]) ** 2 for i in range(n)) / n

def bootstrap_sampling(data, n_samples):
    import random
    samples = []
    for _ in range(n_samples):
        sample = [random.choice(data) for _ in range(len(data))]
        samples.append(sample)
    return samples

def hypothesis_test_t_stat(sample1, sample2):
    m1, m2 = mean(sample1), mean(sample2)
    v1, v2 = variance(sample1), variance(sample2)
    n1, n2 = len(sample1), len(sample2)
    se = ((v1/n1) + (v2/n2)) ** 0.5
    return (m1 - m2) / se

# Example Usage
if __name__ == "__main__":
    data1 = [2, 4, 6, 8, 10]
    data2 = [1, 3, 5, 7, 9]

    print("Mean:", mean(data1))
    print("Variance:", variance(data1))
    print("Standard Deviation:", std_dev(data1))
    print("Covariance:", covariance(data1, data2))
    print("Correlation:", correlation(data1, data2))
    print("Skewness:", skewness(data1))
    print("Kurtosis:", kurtosis(data1))
    print("Z-Scores:", z_scores(data1))

    matrix = [[2,3], [4,5], [6,7]]
    print("Covariance Matrix:", covariance_matrix(matrix))

    y_true = [1, 0, 1, 1, 0]
    y_pred = [1, 0, 0, 1, 0]
    print("Confusion Matrix:", confusion_matrix(y_true, y_pred))
    print("Accuracy:", accuracy(y_true, y_pred))
    print("Precision (1):", precision(y_true, y_pred, 1))
    print("Recall (1):", recall(y_true, y_pred, 1))
    print("F1 Score (1):", f1_score(y_true, y_pred, 1))

    preds = [3, 5, 7]
    true_vals = [2, 5, 8]
    print("Bias, Variance, Noise:", bias_variance_decomposition(preds, true_vals))

    print("Cross-validation folds:", cross_validation_split(data1, 3))

    print("Mean Squared Error:", mean_squared_error(true_vals, preds))

    samples = bootstrap_sampling(data1, 3)
    print("Bootstrap samples:", samples)

    t_stat = hypothesis_test_t_stat(data1, data2)
    print("T-test statistic:", t_stat)


Mean: 6.0
Variance: 10.0
Standard Deviation: 3.1622776601683795
Covariance: 10.0
Correlation: 0.9999999999999998
Skewness: 0.0
Kurtosis: -1.9120000000000004
Z-Scores: [-1.2649110640673518, -0.6324555320336759, 0.0, 0.6324555320336759, 1.2649110640673518]
Covariance Matrix: [[4.0, 4.0], [4.0, 4.0]]
Confusion Matrix: [[2, 0], [1, 2]]
Accuracy: 0.8
Precision (1): 1.0
Recall (1): 0.6666666666666666
F1 Score (1): 0.8
Bias, Variance, Noise: (0.0, 4.0, 9.0)
Cross-validation folds: [[2], [4], [6]]
Mean Squared Error: 0.6666666666666666
Bootstrap samples: [[10, 10, 4, 6, 8], [2, 10, 8, 2, 8], [4, 4, 2, 8, 4]]
T-test statistic: 0.5


In [24]:
import math

def dot_product(vec1, vec2):
    return sum(x * y for x, y in zip(vec1, vec2))

def matmul(A, B):
    """Multiply matrix A (list of lists) with matrix B (list of lists)"""
    result = []
    rows_A = len(A)
    cols_A = len(A[0])
    rows_B = len(B)
    cols_B = len(B[0])
    assert cols_A == rows_B, "Incompatible dimensions for multiplication"
    for i in range(rows_A):
        row_result = []
        for j in range(cols_B):
            s = 0
            for k in range(cols_A):
                s += A[i][k] * B[k][j]
            row_result.append(s)
        result.append(row_result)
    return result

def transpose(matrix):
    return list(map(list, zip(*matrix)))

def softmax(x):
    max_x = max(x)
    exps = [math.exp(i - max_x) for i in x]
    sum_exps = sum(exps)
    return [j / sum_exps for j in exps]

def scale_dot_product_attention(Q, K, V):
    """
    Q, K, V are lists of vectors:
    Q: query vectors (n_q x d)
    K: key vectors (n_k x d)
    V: value vectors (n_k x d_v)
    """
    d_k = len(K[0])
    # Calculate scores = Q x K^T
    K_T = transpose(K)
    scores = matmul(Q, K_T)  # shape: (n_q x n_k)

    # Scale scores
    scaled_scores = []
    scale_factor = math.sqrt(d_k)
    for row in scores:
        scaled_scores.append([x / scale_factor for x in row])

    # Apply softmax to each row (query)
    attention_weights = []
    for row in scaled_scores:
        attention_weights.append(softmax(row))

    # Multiply weights by V
    output = []
    for weights in attention_weights:
        out_vec = [0] * len(V[0])
        for w, v in zip(weights, V):
            for i in range(len(v)):
                out_vec[i] += w * v[i]
        output.append(out_vec)
    return output, attention_weights

class MultiHeadAttention:
    def __init__(self, embed_dim, num_heads):
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        # Initialize weights (Q,K,V) for each head, simple fixed weights for demo
        # For simplicity, we use identity-like weights, i.e. no change on vectors
        self.W_Q = [self.identity_matrix(self.head_dim) for _ in range(num_heads)]
        self.W_K = [self.identity_matrix(self.head_dim) for _ in range(num_heads)]
        self.W_V = [self.identity_matrix(self.head_dim) for _ in range(num_heads)]
        # Output linear projection weights (identity for simplicity)
        self.W_O = self.identity_matrix(embed_dim)

    def identity_matrix(self, dim):
        return [[1 if i == j else 0 for j in range(dim)] for i in range(dim)]

    def split_heads(self, x):
        # x shape: (seq_len x embed_dim)
        split = []
        for i in range(self.num_heads):
            head = []
            for vec in x:
                head.append(vec[i * self.head_dim:(i + 1) * self.head_dim])
            split.append(head)
        return split

    def combine_heads(self, heads):
        # heads: list of num_heads elements each (seq_len x head_dim)
        combined = []
        seq_len = len(heads[0])
        for i in range(seq_len):
            combined_vec = []
            for head in heads:
                combined_vec.extend(head[i])
            combined.append(combined_vec)
        return combined

    def linear(self, x, W):
        # x: list of vectors; W: square matrix (dim x dim)
        return [ [dot_product(W_row, vec) for W_row in transpose(W)] for vec in x]

    def forward(self, Q, K, V):
        # Split into heads
        Q_heads = self.split_heads(Q)
        K_heads = self.split_heads(K)
        V_heads = self.split_heads(V)

        attention_outputs = []
        for i in range(self.num_heads):
            # Linear projections (identity here for demo)
            q_proj = self.linear(Q_heads[i], self.W_Q[i])
            k_proj = self.linear(K_heads[i], self.W_K[i])
            v_proj = self.linear(V_heads[i], self.W_V[i])

            attn_out, _ = scale_dot_product_attention(q_proj, k_proj, v_proj)
            attention_outputs.append(attn_out)

        # Combine heads
        concat = self.combine_heads(attention_outputs)
        # Final linear projection (identity)
        output = self.linear(concat, self.W_O)
        return output

class TransformerEncoderBlock:
    def __init__(self, embed_dim, num_heads):
        self.mha = MultiHeadAttention(embed_dim, num_heads)

    def forward(self, x):
        # Self-attention
        attn_output = self.mha.forward(x, x, x)  # Q=K=V=x
        # Add & Norm skipped for simplicity
        return attn_output

# Example usage:
if __name__ == "__main__":
    # Input: sequence length=2, embedding dimension=4
    x = [
        [1, 0, 1, 0],
        [0, 2, 0, 2]
    ]
    encoder = TransformerEncoderBlock(embed_dim=4, num_heads=2)
    output = encoder.forward(x)
    print("Transformer Encoder output:", output)


Transformer Encoder output: [[0.6697615493266569, 0.6604769013466862, 0.6697615493266569, 0.6604769013466862], [0.055807219207169745, 1.8883855615856606, 0.055807219207169745, 1.8883855615856606]]


In [25]:
import math
import random
from collections import defaultdict
from math import log, exp

##########################
# 1. Naive Bayes (Categorical) from scratch
##########################
class NaiveBayes:
    def __init__(self):
        self.class_probs = {}
        self.feature_probs = {}
        self.classes = set()
        self.features = []

    def fit(self, X, y):
        self.classes = set(y)
        n = len(y)
        self.features = list(range(len(X[0])))

        # Prior probabilities P(c)
        self.class_probs = {c: sum(1 for label in y if label == c) / n for c in self.classes}

        # Conditional probabilities P(x_i | c)
        self.feature_probs = {c: defaultdict(lambda: defaultdict(int)) for c in self.classes}
        class_counts = defaultdict(int)

        for xi, label in zip(X, y):
            class_counts[label] += 1
            for i, val in enumerate(xi):
                self.feature_probs[label][i][val] += 1

        # Convert counts to probabilities with Laplace smoothing
        for c in self.classes:
            for i in self.features:
                total = class_counts[c] + len(self.feature_probs[c][i])
                for val in self.feature_probs[c][i]:
                    self.feature_probs[c][i][val] = (self.feature_probs[c][i][val] + 1) / total

    def predict(self, X):
        preds = []
        for xi in X:
            class_scores = {}
            for c in self.classes:
                score = math.log(self.class_probs[c])
                for i, val in enumerate(xi):
                    prob = self.feature_probs[c][i].get(val, 1e-6)
                    score += math.log(prob)
                class_scores[c] = score
            preds.append(max(class_scores, key=class_scores.get))
        return preds


##########################
# 2. K-Nearest Neighbors (KNN)
##########################
class KNN:
    def __init__(self, k=3):
        self.k = k
        self.X = []
        self.y = []

    def fit(self, X, y):
        self.X = X
        self.y = y

    def _distance(self, a, b):
        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

    def predict(self, X_test):
        preds = []
        for x in X_test:
            distances = [(self._distance(x, xi), yi) for xi, yi in zip(self.X, self.y)]
            distances.sort(key=lambda t: t[0])
            nearest = distances[:self.k]
            votes = defaultdict(int)
            for _, label in nearest:
                votes[label] += 1
            preds.append(max(votes, key=votes.get))
        return preds


##########################
# 3. K-Means Clustering
##########################
class KMeans:
    def __init__(self, k=2, max_iters=100):
        self.k = k
        self.max_iters = max_iters
        self.centroids = []

    def _distance(self, a, b):
        return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))

    def fit(self, X):
        self.centroids = random.sample(X, self.k)
        for _ in range(self.max_iters):
            clusters = [[] for _ in range(self.k)]
            for x in X:
                distances = [self._distance(x, c) for c in self.centroids]
                idx = distances.index(min(distances))
                clusters[idx].append(x)

            new_centroids = []
            for cluster in clusters:
                if cluster:
                    mean_centroid = [sum(dim) / len(cluster) for dim in zip(*cluster)]
                else:
                    mean_centroid = random.choice(X)
                new_centroids.append(mean_centroid)

            if new_centroids == self.centroids:
                break
            self.centroids = new_centroids

    def predict(self, X):
        preds = []
        for x in X:
            distances = [self._distance(x, c) for c in self.centroids]
            preds.append(distances.index(min(distances)))
        return preds


##########################
# 4. Torch-like MLP Basic (Pure Python, no torch)
##########################
class SimpleMLP:
    def __init__(self, input_dim, hidden_dim, output_dim):
        # Weights initialization
        self.W1 = [[random.uniform(-0.1, 0.1) for _ in range(hidden_dim)] for _ in range(input_dim)]
        self.b1 = [0.0] * hidden_dim
        self.W2 = [[random.uniform(-0.1, 0.1) for _ in range(output_dim)] for _ in range(hidden_dim)]
        self.b2 = [0.0] * output_dim

    def relu(self, x):
        return [max(0, i) for i in x]

    def matmul(self, x, W):
        return [sum(x[j] * W[j][i] for j in range(len(x))) for i in range(len(W[0]))]

    def forward(self, x):
        h = self.relu([a + b for a, b in zip(self.matmul(x, self.W1), self.b1)])
        out = [a + b for a, b in zip(self.matmul(h, self.W2), self.b2)]
        return out


##########################
# 5. Attention with KV Cache (simplified)
##########################
class AttentionKVCache:
    def __init__(self):
        self.keys_cache = []
        self.values_cache = []

    def attend(self, q, k, v):
        # q, k, v are lists of floats
        scores = [dot_product(q, ki) for ki in k]
        max_score = max(scores)
        exps = [math.exp(s - max_score) for s in scores]
        sum_exps = sum(exps)
        weights = [e / sum_exps for e in exps]
        out = [0] * len(v[0])
        for w, vi in zip(weights, v):
            for i in range(len(vi)):
                out[i] += w * vi[i]
        return out

    def forward(self, q, k, v):
        # Append new keys and values to cache
        self.keys_cache.extend(k)
        self.values_cache.extend(v)
        return self.attend(q, self.keys_cache, self.values_cache)


##########################
# 6. Logistic Regression (Binary)
##########################
class LogisticRegression:
    def __init__(self, lr=0.1, epochs=100):
        self.lr = lr
        self.epochs = epochs
        self.weights = None
        self.bias = 0

    def sigmoid(self, z):
        return 1 / (1 + math.exp(-z))

    def fit(self, X, y):
        n, d = len(X), len(X[0])
        self.weights = [0.0] * d
        for _ in range(self.epochs):
            for xi, yi in zip(X, y):
                z = sum(w * x for w, x in zip(self.weights, xi)) + self.bias
                pred = self.sigmoid(z)
                error = yi - pred
                for j in range(d):
                    self.weights[j] += self.lr * error * xi[j]
                self.bias += self.lr * error

    def predict(self, X):
        preds = []
        for xi in X:
            z = sum(w * x for w, x in zip(self.weights, xi)) + self.bias
            p = self.sigmoid(z)
            preds.append(1 if p > 0.5 else 0)
        return preds


##########################
# 7. TF-IDF from scratch
##########################
class TFIDF:
    def __init__(self):
        self.df = defaultdict(int)
        self.idf = {}
        self.vocab = set()
        self.N = 0

    def fit(self, docs):
        self.N = len(docs)
        for doc in docs:
            words = set(doc)
            for w in words:
                self.df[w] += 1
        for w, freq in self.df.items():
            self.idf[w] = math.log((self.N + 1) / (freq + 1)) + 1

    def transform(self, docs):
        tfidf_docs = []
        for doc in docs:
            tf = defaultdict(int)
            for w in doc:
                tf[w] += 1
            doc_len = len(doc)
            tfidf = {}
            for w in tf:
                tfidf[w] = (tf[w] / doc_len) * self.idf.get(w, 0)
            tfidf_docs.append(tfidf)
        return tfidf_docs


##########################
# 8. Metrics (Accuracy, Precision, Recall, F1) -- simple binary example
##########################
def accuracy_score(y_true, y_pred):
    return sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)

def precision_score(y_true, y_pred):
    tp = sum((t == 1 and p == 1) for t, p in zip(y_true, y_pred))
    fp = sum((t == 0 and p == 1) for t, p in zip(y_true, y_pred))
    return tp / (tp + fp) if tp + fp > 0 else 0

def recall_score(y_true, y_pred):
    tp = sum((t == 1 and p == 1) for t, p in zip(y_true, y_pred))
    fn = sum((t == 1 and p == 0) for t, p in zip(y_true, y_pred))
    return tp / (tp + fn) if tp + fn > 0 else 0

def f1_score(y_true, y_pred):
    p = precision_score(y_true, y_pred)
    r = recall_score(y_true, y_pred)
    return 2 * p * r / (p + r) if p + r > 0 else 0