In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import decomposition

# Load the genotype table. pandas labels a header-less first column
# 'Unnamed: 0'; when that column is present it is renamed to 'Sample'.
csv_filename = 'data/genotype_matrix.csv'

df = pd.read_csv(csv_filename).rename(columns={'Unnamed: 0': 'Sample'})
df

In [None]:

non_snp_columns = ['Population code', 'Sample']
snp_columns = [col for col in df.columns if col not in non_snp_columns]
print(snp_columns)
# Drop the columns
df_only_snps = df.drop(non_snp_columns, axis=1)

matrix = df_only_snps.to_numpy()
matrix = matrix.T
print(matrix.shape)
matrix

In [None]:
pca = decomposition.PCA(n_components=2)
pca.fit(matrix)

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
transformed_to_plot = pca.transform(matrix)
print(transformed_to_plot.shape)
print(len(transformed_to_plot[:, 0]))
print(len(transformed_to_plot[:, 1]))
print(transformed_to_plot[:5])

plt.scatter(x=transformed_to_plot[:, 0], y=transformed_to_plot[:, 1])

In [None]:

import altair as alt

df_plot = df[snp_columns].copy()
# df_plot``
df_plot = df_plot.transpose()
df_plot.columns = df_plot.columns.astype(str)
# df.columns.
df_plot


In [None]:
df_plot['PC1'] = transformed_to_plot[:, 0]
df_plot['PC2'] = transformed_to_plot[:, 1]
df_plot

In [None]:
# Point chart of the two principal-component columns stored on df_plot.
pca_points = alt.Chart(df_plot).mark_point()
pca_points.encode(x='PC1', y='PC2')

In [None]:
# Read the tab-separated population table.
populations_path = 'data/igsr_populations.tsv'
pop = pd.read_csv(populations_path, sep='\t')
pop

In [None]:
# Join the population table onto the plotting frame. The join key
# 'Population code' must exist as a column in both frames, and the inner join
# keeps only rows whose code occurs on both sides.
# NOTE(review): a later cell attaches t-SNE coordinates to this frame by
# position, which presumably requires exactly one output row per df_plot row --
# confirm every code in df_plot appears exactly once in `pop`.
df_plot_with_pop = pd.merge(df_plot, pop, how='inner', on='Population code').copy()
df_plot_with_pop

In [None]:
# PC1 vs PC2, with the superpopulation name driving both colour and marker shape.
superpop_color = alt.Color('Superpopulation name', scale=alt.Scale(scheme='category20'))
alt.Chart(df_plot_with_pop).mark_point().encode(
    x='PC1',
    y='PC2',
    color=superpop_color,
    shape='Superpopulation name',
)

In [None]:
from sklearn.manifold import TSNE

# Embed the rows of `matrix` in two dimensions with t-SNE.
# t-SNE is stochastic, and init='random' also draws the starting layout at
# random, so random_state is pinned: without it every kernel restart produces
# a different embedding and the charts built from it cannot be reproduced.
x = matrix
x_embedded = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    random_state=0,
).fit_transform(x)
x_embedded.shape

In [None]:
# Store the two t-SNE coordinates as columns 'tsne1' and 'tsne2'. The arrays
# are matched to rows by position, so they must have one entry per row.
df_plot_with_pop = df_plot_with_pop.assign(
    tsne1=x_embedded[:, 0],
    tsne2=x_embedded[:, 1],
)
df_plot_with_pop

In [None]:
# t-SNE coordinates, coloured by population code.
population_color = alt.Color('Population code', scale=alt.Scale(scheme='category20'))
tsne_points = alt.Chart(df_plot_with_pop).mark_point()
tsne_points.encode(x='tsne1', y='tsne2', color=population_color)

In [None]:
# t-SNE coordinates, with the superpopulation name driving both colour and marker shape.
superpop_scale = alt.Scale(scheme='category20')
alt.Chart(df_plot_with_pop).mark_point().encode(
    x='tsne1',
    y='tsne2',
    color=alt.Color('Superpopulation name', scale=superpop_scale),
    shape='Superpopulation name',
)