# GMS Intro to Stats
## Exercise 2: Dimension reduction methods.
### Plotting a map of Europe given distances between cities
+ Distance file available from RMDS project:
    - [https://github.com/cheind/rmds/blob/master/examples/european_city_distances.csv](https://github.com/cheind/rmds/blob/master/examples/european_city_distances.csv)

In [None]:
import pandas as pd
import numpy as np
from sklearn import manifold
import matplotlib.pyplot as plt
from adjustText import adjust_text
import seaborn as sns
sns.set()

In [None]:
# Load the semicolon-separated city distance table, using its first column as the row labels
dists = pd.read_csv("european_city_distances.csv", sep=';', index_col=0)
dists

In [None]:
# Pull the raw distance matrix and the row labels (city names) out of the DataFrame
adist = dists.to_numpy(copy=True)
cities = dists.index.to_numpy()

# Calculate 2D coordinates from the precomputed distances.
# The fixed random_state keeps the layout reproducible between runs.
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
results = mds.fit(adist)
coords = results.embedding_

# Plot a 2D map of the cities based on their distances from each other.
xs, ys = coords[:, 0], coords[:, 1]
with sns.axes_style("white"):
    plt.figure(1, figsize=(20, 10))
    plt.scatter(xs, ys, marker='o', color='black', s=40)
    # One text label per city, collected so adjust_text can reposition them together
    texts = [
        plt.text(x, y, label, size=15, color='darkblue')
        for label, x, y in zip(cities, xs, ys)
    ]
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='k', lw=0.5))
    plt.show()

### How about using genetic distances to map ethnicity

+ Firstly, find genome positions in your cohort that are informative about race:
    - Use the 1000 genome dataset to pick out positions with a minor allele frequency different between two of African, Asian or European populations (filtering out any multi allelic sites).
+ For these sites, calculate the Euclidean distance between your samples:
    - To calculate the distance at each position for a pair of samples: if they share both alleles, distance = 0; if they share one allele, distance = 1; and if they share no alleles, distance = 2.
    - Square the distance at each position, add the squares together and take the square root of the sum.
    - This is used to plot a 2D map of genomic 'distance' between the samples. We can compare the reported ethnicity to the position on the map.

In [None]:
# Load the precomputed sample-to-sample genetic distance matrix
genetic_distances = np.genfromtxt('genetic_distances.csv', delimiter=',')

# Load the recorded ethnicity labels, one per line.
# The context manager closes the file handle as soon as the labels are read.
# NOTE(review): line i is assumed to describe row i of the distance matrix,
# because the plotting cell indexes `coords` by line number - confirm.
with open('recorded_ethnicities.txt') as ethnicity_file:
    recorded_ethnicities = ethnicity_file.read().splitlines()

# Plot style for each recorded ethnicity:
#   key (as it appears in recorded_ethnicities.txt) -> [legend label, marker, colour]
ethnicities = {
    'Australia': ['Australian', 'd', 'blue'],
    'Germany': ['German', 's', 'blue'],
    'Poland': ['Poland', '1', 'blue'],
    'Russia': ['Russia', 'p', 'blue'],
    'Ukraine': ['Ukraine', '3', 'blue'],
    'Yemen': ['Yemen', '4', 'red'],
    'Korea South': ['S_Korea', 'D', 'red'],
    'Singapore': ['Singapore', '|', 'red'],
    'Vietnam': ['Vietnam', '>', 'red'],
    'Canada_Wh': ['White_Canadian', '^', 'blue'],
    'Canada_As': ['Asian_Canadian', '^', 'red'],
    'Canada_NK': ['Unknown_Canadian', '^', 'black'],
    'USA_Af': ['African_American', 'o', 'green'],
    'USA_As': ['Asian_American', 'o', 'red'],
    'USA_Wh': ['White_American', 'o', 'blue'],
    'USA_La': ['Latin_American', 'o', 'gold'],
    'USA_NK': ['Unknown_American', 'o', 'black'],
    'Unknown_Af': ['African_Unknown', 'x', 'green'],
    'Unknown_As': ['Asian_Unknown', 'x', 'red'],
    'Unknown_Wh': ['White_Unknown', 'x', 'blue'],
    'Unknown_La': ['Latin_Unknown', 'x', 'gold'],
    'Unknown_PI': ['Pacific_Islander_Unknown', 'x', 'purple'],
    'Unknown_NK': ['Unknown_Unknown', 'x', 'black'],
}

# Calculate 2D coordinates for the samples from the precomputed distances.
# The fixed random_state keeps the layout reproducible between runs.
mds = manifold.MDS(n_components=2, dissimilarity="precomputed", random_state=6)
results = mds.fit(genetic_distances)

coords = results.embedding_

In [None]:
# Group the sample indices by recorded ethnicity so that each group can be
# drawn with its own marker, colour and legend entry.
nationalities = list(ethnicities)
plotting_dict = {nationality: [] for nationality in nationalities}
for k, item in enumerate(recorded_ethnicities):
    # A recorded label with no entry in `ethnicities` raises KeyError here
    plotting_dict[item].append(k)

# Hand-picked drawing order: entries that share a colour are listed together,
# so they also end up next to each other in the legend.
correct_order = [
    'USA_Af', 'Unknown_Af',
    'Canada_As', 'USA_As', 'Korea South', 'Singapore', 'Vietnam', 'Yemen', 'Unknown_As',
    'Unknown_PI',
    'Australia', 'Canada_Wh', 'USA_Wh', 'Germany', 'Poland', 'Ukraine', 'Russia', 'Unknown_Wh',
    'USA_La', 'Unknown_La',
    'Canada_NK', 'USA_NK', 'Unknown_NK',
]

# Plot a 2D map of genetic distances
with sns.axes_style("darkgrid"):
    fig = plt.figure(1, figsize=(18, 12))
    ax = plt.subplot(111)

    for key in correct_order:
        # Skip names that have no style defined (e.g. after editing `ethnicities`)
        if key in ethnicities:
            legend_label, marker, colour = ethnicities[key]
            members = plotting_dict[key]
            plt.scatter(coords[members, 0], coords[members, 1],
                        c=colour, marker=marker, label=legend_label)

    # Shrink the axes to 80% of their width to leave room for the legend,
    # which is anchored just outside the right-hand edge of the axes.
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * .8, box.height])
    ax.legend(loc='center left', fontsize='xx-large', bbox_to_anchor=(1, .5))