# Generating snp_dist matrix for tSNE analysis - Typhimurium

In [1]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import io
from sklearn.decomposition import PCA
from Bio.Phylo.TreeConstruction import _Matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 300)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Function to build a distance matrix from the pairwise SNP distances produced by snp-dists (group 1)


def create_matrix(fname):
    """Build a lower-triangular Biopython distance matrix from a distance table.

    The table is read with ``pd.read_csv`` defaults (comma-separated, first
    row used as the header). The first column is treated as the row-label
    column and discarded; the remaining header cells become the taxon names
    and the remaining cells are the pairwise distances.

    The lower triangle is sliced directly from the in-memory table. No
    intermediate ``d.csv`` file is written, so the function does not depend
    on a machine-specific directory and can be reused for any input file.

    Parameters
    ----------
    fname : str or path-like
        Path to the pairwise distance table.

    Returns
    -------
    Bio.Phylo.TreeConstruction._Matrix
        Matrix in which row ``i`` holds the distances to taxa ``0 .. i-1``
        followed by ``0`` on the diagonal.

    Raises
    ------
    ValueError
        If any distance cell is missing. Missing cells are rejected here
        instead of being replaced by empty strings, which cannot be part of
        a numeric matrix.
    """
    table = pd.read_csv(fname, header = 0)

    # The first header cell belongs to the label column, not to a taxon.
    names = table.columns.to_list()[1:]

    # Every column after the label column holds distances.
    distances = table.iloc[:, 1:]
    if distances.isna().to_numpy().any():
        raise ValueError('distance table contains missing values: {}'.format(fname))
    values = distances.to_numpy()

    # Row i keeps only the entries left of the diagonal; the diagonal itself
    # is set to 0, so the upper triangle never needs to be zeroed first.
    lower = [values[i, :i].tolist() + [0] for i in range(len(values))]

    return _Matrix(names, lower)

In [3]:
# Build the group 1 distance matrix from the snp-dists output file

f1 = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/typhimurium/group_1/input_data/distace_snp_sites_1.tsv'
# Both names are bound to the same matrix object returned by create_matrix
snpdistgroup1typh = matrix = create_matrix(f1)

In [4]:
# Project the distance matrix onto two tSNE components (random_state fixed at 1)
# NOTE(review): the matrix is passed to TSNE without metric='precomputed', so it is
# presumably treated as a feature table rather than as distances - confirm this is intended.

tsne_model = TSNE(n_components = 2, learning_rate = 200, n_iter = 1000, random_state = 1)
X_tsne = tsne_model.fit_transform(snpdistgroup1typh)

In [5]:
X_tsne  # display the fitted embedding: one row per input row, two tSNE components

array([[ -9.500885 ,  27.39049  ],
       [ 32.088375 , -22.394089 ],
       [ -7.007181 , -27.78953  ],
       ...,
       [-32.590916 ,   3.2527757],
       [ -6.7712383, -22.660282 ],
       [-14.960226 , -10.328787 ]], dtype=float32)

In [6]:
# Put the tSNE coordinates in a table; reset_index exposes the row position
# as an 'index' column so it can serve as a join key

np.random.seed(1)
a = (
    pd.DataFrame(X_tsne)
    .set_axis(['tSNE1', 'tSNE2'], axis = 1)
    .reset_index()
)

In [7]:
# Preview the coordinate table (columns: index, tSNE1, tSNE2)
a

Unnamed: 0,index,tSNE1,tSNE2
0,0,-9.500885,27.390490
1,1,32.088375,-22.394089
2,2,-7.007181,-27.789530
3,3,47.953041,-9.844385
4,4,5.913189,-6.457929
...,...,...,...
1071,1071,9.361316,48.963840
1072,1072,-20.925077,34.960152
1073,1073,-32.590916,3.252776
1074,1074,-6.771238,-22.660282


In [8]:
# Collect the genome ids stored on the distance matrix; reset_index exposes the
# row position as an 'index' column so it can serve as a join key

b = (
    pd.DataFrame(snpdistgroup1typh.names)
    .set_axis(['id'], axis = 1)
    .reset_index()
)

In [9]:
# Preview the genome id table (columns: index, id)
b

Unnamed: 0,index,id
0,0,DRR106950
1,1,ERR023784
2,2,ERR023837
3,3,ERR024361
4,4,ERR024365
...,...,...
1071,1071,SRR9984493
1072,1072,SRR9989219
1073,1073,SRR9989254
1074,1074,SRR9989264


In [10]:
# Join ids and tSNE coordinates on the shared 'index' column, then keep only
# the id and the two coordinates

snp_group_1_typhimurium = (
    b.merge(a, on = 'index')
    [['id', 'tSNE1', 'tSNE2']]
)

In [11]:
# Preview the merged table (columns: id, tSNE1, tSNE2)
snp_group_1_typhimurium

Unnamed: 0,id,tSNE1,tSNE2
0,DRR106950,-9.500885,27.390490
1,ERR023784,32.088375,-22.394089
2,ERR023837,-7.007181,-27.789530
3,ERR024361,47.953041,-9.844385
4,ERR024365,5.913189,-6.457929
...,...,...,...
1071,SRR9984493,9.361316,48.963840
1072,SRR9989219,-20.925077,34.960152
1073,SRR9989254,-32.590916,3.252776
1074,SRR9989264,-6.771238,-22.660282


In [12]:
# Save the id / tSNE coordinate table as CSV (column names written, row index omitted)

group_1_tsne_csv = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/typhimurium/group_1/snp_matrix/snp_group_1_typhimurium.csv'
snp_group_1_typhimurium.to_csv(group_1_tsne_csv, header = True, index = False)