In [126]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import copy

# Define Functions

Functions need to create:<br>
split_by_category()<br>
decentralization()<br>
calculate_S()<br>
invertible()<br>
calculate_projection_direction()<br>
calculate_projected_values()<br>
LDA()

## Split Data and Calculate Mean Values of Each Category

In [129]:
def split_by_category(data):
    dataset = [pd.DataFrame()]*len(data['label'].unique())
    for i in range(len(data['label'].unique())):
        dataset[i] = data[data['label'] == data['label'].unique()[i]]
    return dataset

## Decentralization 

In [133]:
def decentralization(dataset, mean = 'within_class'):
    #notice that copy only copies the value in the shallowest level. DF inside list is not copied
    decentralized_dataset = copy.deepcopy(dataset)
    if mean == 'total_mean':
        for data_cat in decentralized_dataset:
            for column in [column for column in data_cat.columns if column not in ['num', 'label']]:
                data_cat[column] = data_cat[column].apply(lambda x: x - pd.concat(decentralized_dataset).mean()[column])
    else:
        for data_cat in decentralized_dataset:
            for column in [column for column in data.columns if column not in ['num', 'label']]:
                data_cat[column] = data_cat[column].apply(lambda x: x - data_cat.mean()[column])  
    return decentralized_dataset

## Calculate Sw and Sb

In [14]:
def calculate_S(dataset):
    Sw_list = []
    features = [column for column in dataset[0].columns if column not in ['num', 'label']]
    for data in dataset:
        Sw_list.append(np.dot(data[features].T,data[features]))
    prob = [data.shape[0] for data in dataset]
    Sw = np.dot(Sw_list, prob)/sum(prob)
    return Sw

## Calculate det of Sw -- to Decide Invertible or Not

In [137]:
def invertible(Sw):
    if np.linalg.det(Sw) != 0:
        return True
    return False

## Calculate Eigenvalues -- Which is the Projection Direction

In [186]:
%%html
<img src='lda_eigenvalues.png', width=500, height=300>

In [158]:
def calculate_projection_direction(Sw,Sb):
    eigenvalues,eigenvectors = np.linalg.eig(np.dot(np.linalg.inv(Sw),Sb))
    print('Eigenvalues and Eigenvectors:', eigenvalues,eigenvectors)
    b = list(eigenvalues).index(max(eigenvalues))
    w = eigenvectors[:,b]
    print('Eigenvector with the largest eigenvalues:',W)
    return w

## Calculated Projected Values

In [183]:
def calculate_projected_values(w, x):
    return np.dot(w, x)

## LDA 

In [184]:
def LDA(data):
    dataset = split_by_category(data)
    Sw = calculate_S(decentralization(dataset, mean = 'within_class'))
    Sb = calculate_S(decentralization(dataset, mean = 'total_mean'))
    if invertible(Sw):
        w = calculate_projection_direction(Sw,Sb)
        features = [column for column in dataset[0].columns if column not in ['num', 'label']]
        data['train'] = data.apply(lambda row: np.array([row[column] for column in features]), axis=1)
        data['projected'] = data['train'].apply(lambda x: calculate_projected_values(w, x))
        data.drop(columns = ['train'], inplace = True)
    else:
        print('Sw not invertible')
    return data

# Sample

In [185]:
data = pd.read_csv('watermelon.csv')
LDA(data)

Eigenvalues and Eigenvectors: [3.65289951 1.53568198] [[ 0.55287782  0.04091411]
 [-0.83326234  0.99916267]]
Eigenvector with the largest eigenvalues: [ 0.55287782 -0.83326234]


Unnamed: 0,num,density,sugar,label,projected
0,1,0.697,0.46,1,0.002055
1,2,0.774,0.376,1,0.114621
2,3,0.634,0.264,1,0.130543
3,4,0.608,0.318,1,0.071172
4,5,0.556,0.215,1,0.128249
5,6,0.403,0.237,1,0.025327
6,7,0.481,0.149,1,0.141778
7,8,0.437,0.211,1,0.065789
8,9,0.666,0.091,0,0.29239
9,10,0.243,0.0267,0,0.112101
