In [1]:
from utils import *
from sklearn import datasets
import numpy as np
from collections import defaultdict

In [2]:
data = datasets.load_iris()

features = data.data
targets = data.target

In [3]:
def create_target_feature_dict(features, targets):
    targets_data_dict = {}
    for i in range(len(targets)):
        if targets[i] not in targets_data_dict.keys():
            targets_data_dict[targets[i]] = []
            targets_data_dict[targets[i]].append(features[i])
        else:
            targets_data_dict[targets[i]].append(features[i])

    for i in targets_data_dict.keys():
        targets_data_dict[i] = np.array(targets_data_dict[i])
    return targets_data_dict

In [25]:
def create_target_mean_dict(_dict):
    
    avg_dict = {}
    for k in _dict.keys():
        avg_dict[k] = np.sum(_dict[k], axis=0)/len(_dict[k])
    return avg_dict

In [151]:
def create_within_covariance_matrix(_avg_dict, _tgt_features_dict):
    cov_matrix_dict = {}
    for k in _avg_dict.keys():
        s = 0
        for i in range(len(_tgt_features_dict[k])):
            s = s + ((_tgt_features_dict[k][i] - _avg_dict[k])*np.transpose([(_tgt_features_dict[k][i] - _avg_dict[k])]))/(len(_tgt_features_dict[k])-1)
        cov_matrix_dict[k] = s
    return cov_matrix_dict

In [163]:
def find_within_class_scatter_matrix(_cov_matrix_dict):
    
    within_class_mtx = 0
    for k in _cov_matrix_dict.keys():
        within_class_mtx = within_class_mtx + _cov_matrix_dict[k]
    return (len(_cov_matrix_dict.keys())-1)*within_class_mtx

In [114]:
def find_grand_mean_vec(_features):
    return np.sum(_features, axis=0)/len(_features)

In [123]:
def create_between_covariance_matrix(_mean_by_target_dict, _gran_mean_vec):
    
    between_cov_mtx = {}
    for k in _mean_by_target_dict.keys():
        for i in range(len(_mean_by_target_dict[k])):
            between_cov_mtx[k] = (_mean_by_target_dict[k] - _gran_mean_vec)*np.transpose([_mean_by_target_dict[k] - _gran_mean_vec])
    return between_cov_mtx

In [128]:
def find_between_class_scatter_matrix(_between_cov_mtx_dict):
    
    between_class_mtx = 0
    for k in _between_cov_mtx_dict.keys():
        between_class_mtx = between_class_mtx + _between_cov_mtx_dict[k]
        
    between_class_mtx = len(_between_cov_mtx_dict) * between_class_mtx
    return between_class_mtx

In [27]:
tgt_ftr_dict = create_target_feature_dict(features, targets)

In [28]:
mean_by_target = create_target_mean_dict(tgt_ftr_dict)

In [29]:
mean_by_target

{0: array([5.006, 3.428, 1.462, 0.246]),
 1: array([5.936, 2.77 , 4.26 , 1.326]),
 2: array([6.588, 2.974, 5.552, 2.026])}

In [159]:
cov_matrix_by_tgt = create_within_covariance_matrix(mean_by_target, tgt_ftr_dict)

In [160]:
cov_matrix_by_tgt

{0: array([[0.12424898, 0.09921633, 0.0163551 , 0.01033061],
        [0.09921633, 0.1436898 , 0.01169796, 0.00929796],
        [0.0163551 , 0.01169796, 0.03015918, 0.00606939],
        [0.01033061, 0.00929796, 0.00606939, 0.01110612]]),
 1: array([[0.26643265, 0.08518367, 0.18289796, 0.05577959],
        [0.08518367, 0.09846939, 0.08265306, 0.04120408],
        [0.18289796, 0.08265306, 0.22081633, 0.07310204],
        [0.05577959, 0.04120408, 0.07310204, 0.03910612]]),
 2: array([[0.40434286, 0.09376327, 0.3032898 , 0.04909388],
        [0.09376327, 0.10400408, 0.07137959, 0.04762857],
        [0.3032898 , 0.07137959, 0.30458776, 0.04882449],
        [0.04909388, 0.04762857, 0.04882449, 0.07543265]])}

In [164]:
within_class_matrix = find_within_class_scatter_matrix(cov_matrix_by_tgt)

In [165]:
within_class_matrix

array([[1.59004898, 0.55632653, 1.00508571, 0.23040816],
       [0.55632653, 0.69232653, 0.33146122, 0.19626122],
       [1.00508571, 0.33146122, 1.11112653, 0.25599184],
       [0.23040816, 0.19626122, 0.25599184, 0.2512898 ]])

In [185]:
gran_mean_vec = find_grand_mean_vec(features)

In [186]:
gran_mean_vec

array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [187]:
between_cov_mtx = create_between_covariance_matrix(mean_by_target, gran_mean_vec)

In [188]:
between_cov_mtx

{0: array([[ 0.70112711, -0.31037156,  1.92251733,  0.79825778],
        [-0.31037156,  0.13739378, -0.85105067, -0.35336889],
        [ 1.92251733, -0.85105067,  5.271616  ,  2.18885333],
        [ 0.79825778, -0.35336889,  2.18885333,  0.90884444]]),
 1: array([[ 0.00858711, -0.02662622,  0.04651867,  0.01173778],
        [-0.02662622,  0.08256044, -0.14424133, -0.03639556],
        [ 0.04651867, -0.14424133,  0.252004  ,  0.06358667],
        [ 0.01173778, -0.03639556,  0.06358667,  0.01604444]]),
 2: array([[ 0.55452844, -0.06205556,  1.335932  ,  0.61559111],
        [-0.06205556,  0.00694444, -0.1495    , -0.06888889],
        [ 1.335932  , -0.1495    ,  3.218436  ,  1.48304   ],
        [ 0.61559111, -0.06888889,  1.48304   ,  0.68337778]])}

In [189]:
between_class_matrix = find_between_class_scatter_matrix(between_cov_mtx)

In [190]:
between_class_matrix

array([[ 3.792728, -1.19716 ,  9.914904,  4.27676 ],
       [-1.19716 ,  0.680696, -3.434376, -1.37596 ],
       [ 9.914904, -3.434376, 26.226168, 11.20644 ],
       [ 4.27676 , -1.37596 , 11.20644 ,  4.8248  ]])

In [191]:
np.linalg.inv(within_class_matrix)

array([[ 1.80725312, -0.89619683, -1.49865365,  0.56956517],
       [-0.89619683,  2.37231318,  0.44496668, -1.48438125],
       [-1.49865365,  0.44496668,  2.46486729, -1.48439603],
       [ 0.56956517, -1.48438125, -1.48439603,  6.12872989]])

In [192]:
between_class_matrix

array([[ 3.792728, -1.19716 ,  9.914904,  4.27676 ],
       [-1.19716 ,  0.680696, -3.434376, -1.37596 ],
       [ 9.914904, -3.434376, 26.226168, 11.20644 ],
       [ 4.27676 , -1.37596 , 11.20644 ,  4.8248  ]])

In [196]:
sw_1_sb = np.matmul(np.linalg.inv(within_class_matrix),between_class_matrix)

In [200]:
eigs = np.linalg.eig(sw_1_sb)

In [201]:
eigs[0]

array([ 4.73221359e+01,  4.19524833e-01,  7.60492412e-15, -3.80710247e-15])

In [202]:
eigs[1]

array([[-0.20874182, -0.00653196,  0.74599644,  0.22410959],
       [-0.38620369, -0.58661055, -0.42659328,  0.25191508],
       [ 0.55401172,  0.25256154, -0.44531412,  0.32570073],
       [ 0.7073504 , -0.76945309,  0.2514017 , -0.88330783]])

In [166]:
t_features = np.array([[4,2],[2,4],[2,3],[3,6],[4,4],[9,10],[6,8],[9,5],[8,7],[10,8]])
t_targets = np.array([0,0,0,0,0,1,1,1,1,1])

In [167]:
t_tgt_ftr_dict = create_target_feature_dict(t_features, t_targets)

In [168]:
t_mean_by_target = create_target_mean_dict(t_tgt_ftr_dict)

In [169]:
t_mean_by_target

{0: array([3. , 3.8]), 1: array([8.4, 7.6])}

In [170]:
t_cov_matrix_by_tgt = create_within_covariance_matrix(t_mean_by_target, t_tgt_ftr_dict)

In [171]:
t_cov_matrix_by_tgt

{0: array([[ 1.  , -0.25],
        [-0.25,  2.2 ]]), 1: array([[ 2.3 , -0.05],
        [-0.05,  3.3 ]])}

In [172]:
t_within_class_matrix = find_within_class_scatter_matrix(t_cov_matrix_by_tgt)

In [173]:
t_within_class_matrix

array([[ 3.3, -0.3],
       [-0.3,  5.5]])

In [174]:
t_gran_mean_vec = find_grand_mean_vec(t_features)

In [176]:
t_gran_mean_vec

array([5.7, 5.7])

In [177]:
t_between_cov_mtx = create_between_covariance_matrix(t_mean_by_target, t_gran_mean_vec)

In [178]:
t_between_cov_mtx

{0: array([[7.29, 5.13],
        [5.13, 3.61]]), 1: array([[7.29, 5.13],
        [5.13, 3.61]])}

In [179]:
t_between_class_matrix = find_between_class_scatter_matrix(t_between_cov_mtx)

In [180]:
t_between_class_matrix

array([[29.16, 20.52],
       [20.52, 14.44]])

In [181]:
t_within_class_matrix

array([[ 3.3, -0.3],
       [-0.3,  5.5]])

In [182]:
np.linalg.inv(t_within_class_matrix)

array([[0.30454042, 0.0166113 ],
       [0.0166113 , 0.18272425]])

In [183]:
t_between_class_matrix

array([[29.16, 20.52],
       [20.52, 14.44]])

In [204]:
t_sw_1_sb = np.matmul(np.linalg.inv(t_within_class_matrix),t_between_class_matrix)

In [206]:
t_eings = np.linalg.eig(t_sw_1_sb)

In [207]:
t_eings[0]

array([ 1.22006645e+01, -8.88178420e-16])

In [208]:
t_eings[1]

array([[ 0.90878558, -0.57549341],
       [ 0.41726342,  0.81780642]])

In [12]:
x1_avg = np.sum(x1, axis=0)/len(x1)

In [13]:
x2_avg = np.sum(x2, axis=0)/len(x2)

In [81]:
s1 = 0
for i in range(len(x1)):
    s1 = s1 + (x1[i] - x1_avg)*np.transpose([(x1[i] - x1_avg)])

In [82]:
s1

array([[ 1.  , -1.8 ],
       [-1.8 ,  3.24]])

In [69]:
len(x2)

5

In [78]:
s2 = 0
for i in range(1):
    s2 = s2 + (x2[i] - x2_avg)*np.transpose([(x2[i] - x2_avg)])

In [79]:
s2

array([[0.36, 1.44],
       [1.44, 5.76]])

In [17]:
sw = (s1/4)+(s2/4)

In [18]:
x1_avg

array([3. , 3.8])

In [19]:
np.sum(np.concatenate((x1,x2), axis=0),axis=0)

array([57, 57])

In [20]:
x1

array([[4, 2],
       [2, 4],
       [2, 3],
       [3, 6],
       [4, 4]])

In [21]:
(np.sum(x1, axis=0)+np.sum(x2, axis=0))/np.concatenate()

TypeError: concatenate() missing 1 required positional argument: 'arrays'