In [26]:
# Task A: Matrix Standardization
# Here we take each feature, subtract the mean and divide by the variance of each feature.
# This results in 0 mean and unit variance for the dimensions

# This function accepts data matrix X (n x d) and outputs the standardized matrix

import numpy as np
def standardize(X):
    # input X is n x d numpy matrix
    array = np.array(X) # confirm that X is a numpy matrix
    for idx, column in enumerate(array.T):
        mean = np.mean(column)
        variance = np.var(column)
        print('This is the mean {} and this is the variance {} of column {}'.format(mean,variance,idx))
        for sidx, sample in enumerate(column):
            array[sidx,idx] = (sample - mean)/(variance)
    return array
        
data = [[1,0,3,5],[1,1,5,7],[0,0,4,10],[2,-1,5,11]]
standardize(data)
    

[[ 1  0  3  5]
 [ 1  1  5  7]
 [ 0  0  4 10]
 [ 2 -1  5 11]]
This is the mean 1.0 and this is the variance 0.5 of column 0
This is the mean 0.0 and this is the variance 0.5 of column 1
This is the mean 4.25 and this is the variance 0.6875 of column 2
This is the mean 8.25 and this is the variance 5.6875 of column 3
[[ 0  0 -1  0]
 [ 0  2  1  0]
 [-2  0  0  0]
 [ 2 -2  1  0]]


array([[ 0,  0, -1,  0],
       [ 0,  2,  1,  0],
       [-2,  0,  0,  0],
       [ 2, -2,  1,  0]])

In [36]:
# Task B: Pairwise distance in the plane
# Write a function that accepts two matrices (one is P, p x 2 and the other is Q, q x 2).
# Here each row contains the x,y coordinates
# The output has to be all pairwise distances between all points between the two matrices
# All output distances will be saved in matrix D, where Dij is the distance of the ith point in
# P to the jth point in Q

import numpy as np

def pairwise(P,Q):
    # check that p has 2 columns
    P = np.array(P)
    Q = np.array(Q)
    if P.shape[1] != 2:
        print('First matrix does not have two columns')
        return None
    if Q.shape[1] != 2:
        print('Second matrix does not have two columns')
        return None
    print(P.shape[0], Q.shape[0])
    distance_matrix = np.zeros([P.shape[0], Q.shape[0]])
    
    def euclid(x,y):
        return np.sqrt((x[1]-y[1])**2 + (x[0]-y[0])**2)
        
    for idxP, point in enumerate(P):
        for idxQ, coord in enumerate(Q):
            distance = euclid(point,coord)
            distance_matrix[idxP,idxQ] = distance
    
    return distance_matrix
            
p = [[0,1],[4,1]]
q = [[1,1],[6,3],[8,6]]

pairwise(p,q)

2 3


array([[1.        , 6.32455532, 9.43398113],
       [3.        , 2.82842712, 6.40312424]])

In [100]:
# Task C: Likelihood of a Data Sample
# Implementation for two model case. xn is in R-d space
# two sets of parameters, u1, sigma1, u2, sigma2
# probability of x given u and sigma is given by equation
# The function must return the most likely assignment (1, 2)
# Which means xn is assigned to one gausian distribution

import numpy as np

def evaluate(x, mu, cov):
    # Here the models mean
    # and the covariance are incorporated
    x = np.array(x)
    mu = np.array(mu)
    cov = np.array(cov)
    d = x.shape[1]
    det = np.linalg.det(cov)
    inv = np.linalg.inv(cov)
    probability = 100*np.exp(-0.5 * np.matmul(np.matmul((x - mu),inv),(x - mu).T))/(((2*np.pi)**(d/2) * det) ** 0.5) 
    return probability

def assignment(sample, m1, m2):
    # here m1 and m2 are tuples containing the model 
    # parameters in the order - mean and covariance
    prob1 = evaluate(sample, m1[0], m1[1])
    prob2 = evaluate(sample, m2[0], m1[1])
    if abs(prob1 - prob2) < 5:
        print('The models are basically the same with a percent probability margin of 5%')
    elif prob1 > prob2:
        print('The first model is the more likely one with a probability of {} (difference in percent was {}) and parameters {}'.format(prob1, (prob1-prob2),m1))
    else:
        print('The second model is the more likely one with probability of {} (difference in percent was {}) and parameters'.format(prob2, (prob2-prob1) ,m2))
    
X = [[0, 4]]
u1 = [0, 5]
s1 = [[1, 0],[0, 1]]
u2 = [0, 7]
s2 = [[4, 3], [3, 4]]
m1 = (u1, s1)
m2 = (u2, s2)

assignment(X, m1, m2)



The first model is the more likely one with a probability of [[24.19707245]] (difference in percent was [[23.75388761]]) and parameters ([0, 5], [[1, 0], [0, 1]])
