In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from framework.common.util import save_data_table

In [8]:
# Dataset location; the loading cell prepends its base directory to this path
dataset = "Pollen/processed/pollen.500g.txt"
# Gene count; in this cell it only names the output directory
n_genes = 500
# Dimensionality of the PCA projection
n_components = 10
# Output directory for the PCA representations
results_dir = "results/pollen/pca/pca_{}g".format(n_genes)

# Column layout of the input table: features occupy the half-open column span
# [features_start_col_idx, features_end_col_idx). When has_labels is True the
# columns from features_end_col_idx onward are kept as label information;
# when it is False the end index is ignored and features run to the last column.
has_labels = True
features_start_col_idx, features_end_col_idx = 0, -2

In [4]:
# Read the tab-separated table: the first row supplies the column names and
# the first column becomes the row index.
base_dir = "data/"
filepath = base_dir + dataset
df = pd.read_csv(filepath, sep="\t", header=0, index_col=0)

# Select the feature columns. With labels present, the trailing columns
# (from features_end_col_idx onward) are set aside as label_info; otherwise
# every column from features_start_col_idx on is a feature.
if not has_labels:
    feature_frame = df.iloc[:, features_start_col_idx:]
else:
    feature_frame = df.iloc[:, features_start_col_idx:features_end_col_idx]
    label_info = df.iloc[:, features_end_col_idx:].values
features = feature_frame.values.astype(dtype=np.float64)

# Standardize the feature matrix with sklearn's scale before PCA.
features_scaled = scale(features)

In [9]:
# Run PCA and save the reduced dimensional representations.
# A fixed seed keeps the projection reproducible in case scikit-learn's
# automatic solver selection picks a randomized SVD for this input.
pca_random_state = 0
pca_components = PCA(
    n_components=n_components,
    random_state=pca_random_state,
).fit_transform(features_scaled)

# Cast the row ids to object before stacking so that a numeric index is not
# promoted to float alongside the PCA coordinates.
cell_ids = np.expand_dims(df.index.values.astype(object), axis=1)
results = np.hstack((cell_ids, pca_components))
if has_labels:
    # Re-attach the label columns that were split off when loading the data.
    results = np.hstack((results, label_info))

# Header row: id column, one name per PCA component, then the label column names.
header = ["cell_ids"] + ["dim{}".format(i) for i in range(1, n_components + 1)]
if has_labels:
    header.extend(list(df.columns.values[features_end_col_idx:]))
header = np.array(header)

# The header becomes the first row of the table that gets written out.
results = np.vstack((header, results))

if not os.path.exists(results_dir):
    os.makedirs(results_dir)

save_data_table(
    results,
    results_dir + "/latent_representations.txt")