$${\color{yellow}{\text{Applied Linear Algebra: Variance maximization using PyTorch}}}$$



---

Load essential libraries

---

In [2]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import sys
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

---

Mount Google Drive folder if running Google Colab

---

In [3]:
## Mount Google drive folder if running in Colab
if('google.colab' in sys.modules):
    from google.colab import drive
    drive.mount('/content/drive', force_remount = True)
    DIR = '/content/drive/MyDrive/Colab Notebooks/ALA-25/Class-folder/In-Class'
    DATA_DIR = DIR+'/Data/'
else:
    DATA_DIR = 'Data/'

Mounted at /content/drive


---

Load the food texture dataset

---

In [4]:
## Load the food texture dataset
FILE = DATA_DIR + 'food-texture.csv'
df_food = pd.read_csv(FILE, index_col = 0, header = 0)
df_food.head(5)

Unnamed: 0,Oil,Density,Crispy,Fracture,Hardness
B110,16.5,2955,10,23,97
B136,17.7,2660,14,9,139
B171,16.2,2870,12,17,143
B192,16.7,2920,10,31,95
B225,16.3,2975,11,26,143


---

Preprocess the dataset

---

In [9]:
## Create a list of continuous and categorical column names
continuous_cols = ['Oil', 'Density', 'Fracture', 'Hardness']
categorical_cols = ['Crispy']

# Typecasting columns to correct types
df_food[categorical_cols] = df_food[categorical_cols].astype('category')
df_food[continuous_cols] = df_food[continuous_cols].astype('float64')

## Print dataframe column types
df_food.dtypes
# print(df_food)

Unnamed: 0,0
Oil,float64
Density,float64
Crispy,category
Fracture,float64
Hardness,float64


---

Using PyTorch, calculate an optimal direction $\mathbf{v}$ (a vector with unit magnitude) such that the variance of the projected values $$\dfrac{1}{n}\sum_{i=1}^n\left(\underbrace{\mathbf{x}^{(i)}\cdot\mathbf{v}}_{\text{projection of }i\text{th sample}}-\underbrace{\pmb{\mu}\cdot\mathbf{v}}_{\text{average of projected samples = projection of average sample}}\right)^2$$ is maximized.

The direction vector that you will get as the answer from this cell should match with the answer from the next cell where we do PCA using the in-built sklearn library.

---

In [29]:
# Data matrix (select only continuous columns)
X = torch.tensor(df_food[continuous_cols].values, dtype = torch.float64)
# print(X)

# Mean sample
mu = torch.mean(X, axis = 0)
print(mu)

# X_centered = X - mu
# print(X_centered)

# Initial direction vector (has to be a unit vector)
w = torch.tensor(np.ones(X.shape[1]), dtype = torch.float64, requires_grad = True)
print("The orginal value",w)
with torch.no_grad():
  w.data = w.data / torch.norm(w.data)


# Define optimizer (try different optimizers if answers don't match)
optimizer = torch.optim.Adam([w], lr = 1e-2)
print(optimizer)

# Loss function
def loss_fn(w):
    X_centered = X - mu
    # X_centered_df = pd.DataFrame(X_centered.numpy(), columns=continuous_cols)
    # print("The mean centered X:\n", X_centered_df.iloc[:5, :4])  # 5 rows, 2 columns
    loss = -torch.mean(torch.square(torch.matmul(X_centered,w )))
    return loss

# Optimization loop
num_epochs = 1000
for epoch in range(num_epochs):
  # Zero out the gradients
  optimizer.zero_grad()

  # Loss calculation
  loss = loss_fn(w)
#   print("The loss after function",loss)

  # Backward propagation and optimization
  loss.backward()
  optimizer.step()
#   print(w)

  # Print the loss every 2 epochs
  if epoch%100 == 0:
    print(f'Epoch {epoch}, loss = {loss.item()}')

  # Constraint satisfaction (w should be unit vector)
  with torch.no_grad():
    w.data = w.data / torch.norm(w.data)

# Print the optimized direction vector
# w = torch.softmax(w)
print(w)

tensor([  17.2020, 2857.6000,   20.8600,  128.1800], dtype=torch.float64)
The orginal value tensor([1., 1., 1., 1.], dtype=torch.float64, requires_grad=True)
Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    decoupled_weight_decay: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0
)
Epoch 0, loss = -4330.057909
Epoch 100, loss = -5168.438772581795
Epoch 200, loss = -4593.682308322146
Epoch 300, loss = -4497.6966611865955
Epoch 400, loss = -4483.36372857365
Epoch 500, loss = -4481.374751049885
Epoch 600, loss = -4481.272642241255
Epoch 700, loss = -4481.575041520063
Epoch 800, loss = -4482.021563469777
Epoch 900, loss = -4482.5062362839235
tensor([-0.5002,  0.4999,  0.4999,  0.5001], dtype=torch.float64,
       requires_grad=True)


---

PCA using sklearn module (just run the cell to get the optimized direction vector)

---

In [30]:
from sklearn.decomposition import PCA

# Create and fit PCA object
pca = PCA(n_components = X.shape[1])
pca.fit_transform(X.detach().numpy())

# Print optimal direction vector
print(pca.components_[0])

[-0.00958687  0.99923202  0.0249962   0.02861215]
