In [1]:
import os
import xlrd
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the metabolomics workbook and reshape it for the downstream cells:
# after the transpose the rows are the original sheet columns (index named
# "Sample Name") and the columns are the values of the sheet's original
# "Sample Name" column (relabelled "Metabolyte").
output_dir = "/data1/bio/projects/isalafutdinov"
table_file = os.path.join(output_dir, "Metabolomics results 14022020_4_claster_FINAL.xlsx")
table_df = pd.read_excel(table_file)

# Keep only headers that are non-blank and are not "Unnamed: N" placeholders.
# Labels are passed through str() first because read_excel may yield
# non-string header labels (e.g. numeric cells), on which .strip()/.lower()
# would raise AttributeError.
kept_columns = [
    column for column in table_df.columns
    if len(str(column).strip()) > 0 and "unnamed" not in str(column).lower()
]

# Drop the row whose "Sample Name" cell holds the literal "Sample Type",
# and restrict the frame to the retained columns.
table_df = table_df.loc[table_df["Sample Name"] != "Sample Type", kept_columns]

# The first-column labels become column headers once the frame is transposed.
table_df = table_df.rename(columns={"Sample Name": "Metabolyte"}).set_index("Metabolyte").transpose()
table_df.index.names = ["Sample Name"]

In [2]:
# Standardise every column of table_df with StandardScaler and wrap the
# resulting array back into a DataFrame carrying the same row and column labels.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(table_df)
scaled_df = pd.DataFrame(
    scaled_values,
    index=table_df.index,
    columns=table_df.columns,
)

In [3]:
# Fit a two-component PCA on the scaled table and keep the projected
# coordinates in a DataFrame indexed like scaled_df, with columns
# "PCA 1" and "PCA 2".
pca = PCA(n_components=2)
component_labels = ["PCA {}".format(number) for number in (1, 2)]
projected_values = pca.fit_transform(scaled_df)
pca_df = pd.DataFrame(
    projected_values,
    index=scaled_df.index,
    columns=component_labels,
)

In [4]:
# Draw the first two columns of pca_df as a scatter plot, label every point
# with its index label, and write the figure to "pca.png" inside output_dir.
plt.rcParams["figure.figsize"] = (28, 20)  # changes the default size globally
fig, ax = plt.subplots()
ax.set_title("2 component PCA", fontsize=20)
ax.set_xlabel("Principal Component 1", fontsize=15)
ax.set_ylabel("Principal Component 2", fontsize=15)

first_column, second_column = pca_df.columns[0], pca_df.columns[1]
pca_x = pca_df[first_column].values
pca_y = pca_df[second_column].values
ax.scatter(x=pca_x, y=pca_y)

# Annotate each point with the corresponding row label of pca_df.
for label, x_pos, y_pos in zip(pca_df.index, pca_x, pca_y):
    ax.annotate(label, (x_pos, y_pos), fontsize="xx-small")

plt.savefig(os.path.join(output_dir, "pca.png"), dpi=300)
# Clear and close the current figure once it has been written to disk.
plt.clf()
plt.close()