In [1]:
import pickle
import numpy as np
import pandas as pd
from scipy import linalg, sparse
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
import torch

# Pick the compute device once: CUDA when PyTorch reports it as available,
# otherwise the CPU. The chosen device is echoed so it shows in the cell output.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [2]:
import matplotlib.pyplot as pp
from sklearn.decomposition import TruncatedSVD, PCA
from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler

## 02 - Analyze the data to determine the *k*-value to use for SVD and PCA

Determine the *k*-value (number of retained components) to use for the SVD and PCA dimensionality reductions

### 2.1) Import Preprocessed data from step 1

Import the likes data

In [3]:
# Load the preprocessed likes matrix written by step 1.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that this project produced itself.
with open("likesMAT.pkl", "rb") as likes_file:  # context manager closes the handle
    likesMAT = pickle.load(likes_file)
likesMAT.shape

(9500, 536204)

Import the ages data

In [4]:
# Load the preprocessed ages array written by step 1.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that this project produced itself.
with open("agesARR.pkl", "rb") as ages_file:  # context manager closes the handle
    agesARR = pickle.load(ages_file)
agesARR.shape

(9500, 4)

Do a train/test split

In [5]:
# Hold out 5% of the rows as a test set; the fixed seed makes the split repeatable.
(train_likesMAT, test_likesMAT,
 train_agesARR, test_agesARR) = train_test_split(
    likesMAT,
    agesARR,
    test_size=0.05,
    random_state=42,
)

### 2.2) Compute eigenvalues

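First, form the square matrix $A A^\top$ over the smaller dimension of the training matrix $A$ (its rows), so the eigenproblem is 9025 × 9025 rather than 536204 × 536204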

In [6]:
# Build the square matrix A @ A.T over the rows of the training matrix.
# The `@` operator dispatches to the matrix's own (sparse-aware) product.
# np.dot is not sparse-aware and only worked here because it fell back to
# the sparse matrix's `*` operator.
sq_likesMAT_1 = train_likesMAT @ train_likesMAT.T
sq_likesMAT_1.shape

(9025, 9025)

Convert the square matrix to a Torch tensor and move to GPU

In [7]:
# Densify the square matrix as a float32 ndarray, wrap it as a tensor, and move
# it to the device selected in the first cell. Using `device` instead of a
# hard-coded .cuda() keeps the notebook runnable on CPU-only machines.
# toarray() yields a plain ndarray rather than the np.matrix that todense() returns.
tensor_sq_likesMAT = torch.from_numpy(
    sq_likesMAT_1.toarray().astype(np.float32)
).to(device)

Compute the eigenvalues on the GPU, cast to reals only (*there should be no imaginary parts*), return to the CPU, and convert to a 1D numpy array

In [None]:
# The input is A @ A.T, hence symmetric, so use the symmetric solver:
# eigvalsh returns real eigenvalues (in ascending order) directly, which avoids
# the complex-valued result of torch.linalg.eig and the lossy complex -> float32
# cast that silently discarded any imaginary parts.
tensor_sq_likesMAT_evals = torch.linalg.eigvalsh(tensor_sq_likesMAT)

# Bring the eigenvalues back to host memory as a 1D float32 numpy array.
sq_likesMAT_evals = tensor_sq_likesMAT_evals.to(torch.float32).cpu().numpy()
print(sq_likesMAT_evals)

Clean-up GPU VRAM to preserve available space to work

In [None]:
# Drop the notebook's references to both GPU-resident objects, then ask PyTorch
# to hand its cached CUDA memory back so later cells have room to work.
del tensor_sq_likesMAT_evals, tensor_sq_likesMAT

torch.cuda.empty_cache()

Print the last hundred or so eigenvalues of the sorted array (i.e. the largest ones)

In [None]:
# Show the largest eigenvalues. Slicing relative to the end of the sorted array
# keeps the cell meaningful if the number of training rows changes.
# 125 == 9025 - 8900, i.e. the same tail the fixed start index 8900 selected.
n_largest = 125
print(np.sort(sq_likesMAT_evals)[-n_largest:])

Plot the eigenvalues in order of magnitude

In [None]:
# Plot the distinct eigenvalues in ascending order, skipping the first
# `ind_to_start` of them so the large values are not squashed by the long tail
# of small ones.
ind_to_start = 4000 #8050
# np.unique already returns its values sorted, so no extra np.sort is needed.
unique_evals_tail = np.unique(sq_likesMAT_evals)[ind_to_start:]
pp.plot(np.arange(unique_evals_tail.shape[0]), unique_evals_tail, 'x')
pp.xlabel(f'rank among unique eigenvalues (offset by {ind_to_start})')
pp.ylabel('eigenvalue')
pp.show()

Calculate the deltas between eigenvalues and print the last hundred or so

In [None]:
# Gaps between consecutive eigenvalues in ascending order:
# deltas[i] = sorted[i + 1] - sorted[i].
# Sorting once and using np.diff replaces a Python loop that re-sorted the
# whole array twice on every iteration.
sq_likesMAT_evals_deltas = np.diff(np.sort(sq_likesMAT_evals))
sq_likesMAT_evals_deltas.shape

In [None]:
# Shape of the array of distinct delta values (how many different gaps occur).
np.shape(np.unique(sq_likesMAT_evals_deltas))

Print out the largest deltas (the tail of the sorted deltas) and how many are shown

In [None]:
# Show the largest gaps between consecutive eigenvalues and how many are shown.
# The deltas are sorted once and sliced relative to the end.
# 124 == 9024 - 8900, i.e. the same tail the fixed start index 8900 selected.
n_largest = 124
largest_deltas = np.sort(sq_likesMAT_evals_deltas)[-n_largest:]
print(largest_deltas)
largest_deltas.shape

Plot the eigenvalue deltas in order of magnitude

In [None]:
# Plot the eigenvalue gaps in ascending order, skipping the smallest
# `ind_to_start` of them so the large gaps are visible.
ind_to_start = 2000
sorted_deltas_tail = np.sort(sq_likesMAT_evals_deltas)[ind_to_start:]
pp.plot(np.arange(sorted_deltas_tail.shape[0]), sorted_deltas_tail, 'x')
pp.xlabel(f'rank among sorted deltas (offset by {ind_to_start})')
pp.ylabel('gap between consecutive eigenvalues')
pp.show()

### 2.3) Compute PCA stuff

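PCA is sensitive to the scale of its inputs, so we standardize the features first. The scaler is fit on the training split only and then applied unchanged to the test split.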

In [None]:
# Fit the scaler on the training split only, then apply the same scaling to
# both splits. with_mean=False scales without centering -- presumably because
# the likes matrix is sparse and centering would densify it (TODO confirm).
scaler = StandardScaler(with_mean=False).fit(train_likesMAT)
train_likesMAT_std = scaler.transform(train_likesMAT)
test_likesMAT_std = scaler.transform(test_likesMAT)

Apply PCA

In [None]:
# Fit a truncated PCA on the standardized training data. A fixed random_state
# keeps any randomized/iterative solver reproducible between runs.
pca = PCA(n_components=4100, random_state=42)
train_likesMAT_pca = pca.fit_transform(train_likesMAT_std)

# Calculate the cumulative explained variance
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Determine the number of components to keep for 85% variance explained.
# np.argmax over an all-False mask returns 0, which would silently report
# "1 component" when the target is never reached, so that case is handled
# explicitly instead.
variance_target = 0.85
reaches_target = cumulative_variance_ratio >= variance_target
if reaches_target.any():
    n_components = np.argmax(reaches_target) + 1
else:
    n_components = None
    print(
        f"The {len(cumulative_variance_ratio)} fitted components explain only "
        f"{cumulative_variance_ratio[-1]:.4f} of the variance "
        f"(target {variance_target}); fit more components to find the cut-off."
    )

In [None]:
# Number of leading principal components selected for the 85% explained-variance target.
print(n_components)

In [None]:
# Running total of the explained-variance ratio, one entry per fitted component.
print(cumulative_variance_ratio)

In [None]:
# Number of entries in the cumulative curve (one per fitted component).
cumulative_variance_ratio.shape[0]

In [None]:
# Inspect the tail of the cumulative curve (entries from index 3500 onward)
# to see where it levels off.
print(cumulative_variance_ratio[3500:])