# Generating the SNP distance matrix for t-SNE analysis - Newport

In [1]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import io
from sklearn.decomposition import PCA
from Bio.Phylo.TreeConstruction import _Matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 300)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Function to create the distance matrix from the pairwise SNP distances produced by snp-dists (group 1)


def create_matrix(fname, sep=','):
    """Build a lower-triangular Biopython ``_Matrix`` from a pairwise distance table.

    The table is expected to have a header row of genome ids and one row per
    genome whose first field is a row label and whose remaining fields are
    the distances to every genome in header order.

    The table is processed entirely in memory; no intermediate file is
    written, so the function does not depend on any machine-specific path.

    Parameters
    ----------
    fname : str or path-like
        Path of the delimited distance table.
    sep : str, default ','
        Field delimiter of ``fname``. The default (comma) matches the
        previous behaviour; pass a tab character for tab-separated tables.

    Returns
    -------
    Bio.Phylo.TreeConstruction._Matrix
        Matrix whose names are the header ids (first header cell excluded)
        and whose row ``i`` holds the first ``i`` distances of that genome
        followed by ``0`` for the diagonal.
    """
    table = pd.read_csv(fname, header=0, sep=sep)

    # The first column carries the row labels, so its header cell is not a genome id.
    names = table.columns.to_list()[1:]

    # Missing cells become '' (as before). np.nan is used because the np.NaN
    # alias no longer exists in NumPy 2.0.
    distances = table.iloc[:, 1:].replace(np.nan, '')

    # Go through a NumPy array so the cells come back as plain Python scalars.
    rows = np.array(distances).tolist()

    # Row i keeps only the entries strictly below the diagonal, then a 0 for
    # the diagonal itself; entries above the diagonal are never read.
    matrix = [row[:i] + [0] for i, row in enumerate(rows)]

    return _Matrix(names, matrix)

In [3]:
# Build the Newport distance matrix from the SNP distance table

f1 = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/newport/input_data/distance_snp_sites_newport.tsv'
# Both names are bound to the same matrix object; later cells use `snpnewport`.
snpnewport = matrix = create_matrix(f1)

In [4]:
# Embed the matrix in two dimensions with t-SNE (fixed random_state for repeatable output)
# NOTE(review): no `metric` argument is passed, so TSNE uses its default metric on the
# rows of the distance matrix rather than treating it as precomputed - confirm this is intended.

X_tsne = TSNE(
    n_components=2,
    learning_rate=200,
    n_iter=1000,
    random_state=1,
).fit_transform(snpnewport)

In [5]:
X_tsne  # display the two-column embedding array returned by fit_transform

array([[-22.42562 , -20.604546],
       [ 47.124672, -26.425928],
       [ 28.164728,  29.820534],
       ...,
       [ 33.04647 ,  37.446224],
       [ 33.05661 ,  37.449913],
       [ 32.42923 ,  38.650124]], dtype=float32)

In [6]:
# Put the t-SNE output in a dataframe with an explicit positional 'index' column

np.random.seed(1)
a = pd.DataFrame(X_tsne, columns=['tSNE1', 'tSNE2']).reset_index()

In [7]:
a  # preview the t-SNE coordinates together with their positional 'index' column

Unnamed: 0,index,tSNE1,tSNE2
0,0,-22.425619,-20.604546
1,1,47.124672,-26.425928
2,2,28.164728,29.820534
3,3,28.228798,29.594076
4,4,28.111811,29.886261
...,...,...,...
2360,2360,8.753829,1.106126
2361,2361,28.841772,32.965000
2362,2362,33.046471,37.446224
2363,2363,33.056610,37.449913


In [8]:
# Put the matrix names (genome ids) in a dataframe with an explicit positional 'index' column

b = pd.DataFrame(snpnewport.names, columns=['id']).reset_index()

In [9]:
b  # preview the genome ids together with their positional 'index' column

Unnamed: 0,index,id
0,0,SRR1002805
1,1,SRR1002816
2,2,SRR1002817
3,3,SRR1002827
4,4,SRR1002828
...,...,...
2360,2360,SRR953548
2361,2361,SRR953551
2362,2362,SRR980337
2363,2363,SRR980338


In [10]:
# Join ids and coordinates on the shared 'index' column, then keep only the final columns

snp_newport = b.merge(a, on='index')[['id', 'tSNE1', 'tSNE2']]

In [11]:
snp_newport  # preview the merged id / tSNE1 / tSNE2 table

Unnamed: 0,id,tSNE1,tSNE2
0,SRR1002805,-22.425619,-20.604546
1,SRR1002816,47.124672,-26.425928
2,SRR1002817,28.164728,29.820534
3,SRR1002827,28.228798,29.594076
4,SRR1002828,28.111811,29.886261
...,...,...,...
2360,SRR953548,8.753829,1.106126
2361,SRR953551,28.841772,32.965000
2362,SRR980337,33.046471,37.446224
2363,SRR980338,33.056610,37.449913


In [12]:
# Write the id / tSNE1 / tSNE2 table to CSV (column names kept, row index omitted)

snp_newport.to_csv(
    '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/newport/snp_matrix/snp_newport.csv',
    index=False,
    header=True,
)