# Generating the k-mer distance matrix for t-SNE analysis (Infantis)

In [14]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import io
from sklearn.decomposition import PCA
from Bio.Phylo.TreeConstruction import _Matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 300)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [15]:
# Function to create matrix using kmer-distance produced by akronymer for group 1


def create_matrix(fname):
    """Build a lower-triangular distance matrix from a tab-separated table.

    The file is read with its first row as the header. The first column holds
    the row labels (its header cell is blank, which pandas reports as
    ``'Unnamed: 0'``), so the sample names are every header after the first.

    Only the cells strictly below the diagonal are taken from the file; each
    row is then closed with a diagonal entry of ``0``. Whatever the file holds
    on or above the diagonal is ignored.

    The table is processed entirely in memory: no scratch CSV is written, so
    the function no longer depends on a machine-specific directory, and it
    needs only pandas (no ``np.NaN``, which NumPy 2.0 removed).

    Parameters
    ----------
    fname : str or path-like
        Path of the tab-separated distance table.

    Returns
    -------
    _Matrix
        ``_Matrix(names, lower)``, where ``lower[i]`` is the first ``i`` data
        cells of file row ``i`` followed by ``0``.
    """
    table = pd.read_csv(fname, header=0, sep='\t')

    # The first header belongs to the row-label column, not to a sample.
    names = table.columns.to_list()[1:]

    # Cast to object so cells come back as plain Python numbers.
    rows = table.iloc[:, 1:].astype(object).to_numpy().tolist()

    # Keep the strictly-lower part of each row and append the zero diagonal.
    # A missing cell is passed on as '' (as before) rather than as NaN, so an
    # incomplete triangle stays visibly non-numeric for _Matrix to validate.
    lower = [
        ['' if pd.isna(cell) else cell for cell in row[:i]] + [0]
        for i, row in enumerate(rows)
    ]
    return _Matrix(names, lower)

In [16]:
# Build the distance matrix from the input table; `matrix` and `kmerinfantis`
# are two names for the same object.

f1 = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/infantis/input_data/infantis_core_genome_alignment.aln_output'
matrix = kmerinfantis = create_matrix(f1)

In [17]:
# Embed the distance matrix in two dimensions with tSNE (fixed random_state
# so the embedding is repeatable)

X_tsne = TSNE(
    n_components=2,
    learning_rate=200,
    n_iter=1000,
    random_state=1,
).fit_transform(kmerinfantis)

In [18]:
X_tsne  # display the array returned by TSNE(...).fit_transform

array([[ 50.336273,  33.53639 ],
       [ -4.253155,  62.09437 ],
       [-22.32762 ,  80.7301  ],
       ...,
       [-46.08219 , -43.567802],
       [ 29.236895, -21.540947],
       [ 29.885448, -22.502644]], dtype=float32)

In [19]:
# Put the tSNE coordinates in a dataframe; reset_index() adds the row
# position as an 'index' column

np.random.seed(1)
a = pd.DataFrame(X_tsne, columns=['tSNE1', 'tSNE2']).reset_index()

In [20]:
# Preview the coordinate table (columns: index, tSNE1, tSNE2)
a

Unnamed: 0,index,tSNE1,tSNE2
0,0,50.336273,33.536388
1,1,-4.253155,62.094372
2,2,-22.327620,80.730103
3,3,18.469278,-64.362190
4,4,-5.255455,54.452606
...,...,...,...
2846,2846,-10.340121,-25.705935
2847,2847,-40.617153,-11.294018
2848,2848,-46.082191,-43.567802
2849,2849,29.236895,-21.540947


In [21]:
# Sample ids in matrix order; reset_index() adds the row position as an
# 'index' column
b = pd.DataFrame(kmerinfantis.names, columns=['id']).reset_index()

In [22]:
# Preview the id table (columns: index, id)
b

Unnamed: 0,index,id
0,0,ERR3181860
1,1,SRR1002832
2,2,SRR1002834
3,3,SRR10222594
4,4,SRR10240828
...,...,...
2846,2846,SRR9984488
2847,2847,SRR9984489
2848,2848,SRR9984501
2849,2849,SRR9984504


In [23]:
# Merge all datasets

kmer_infantis = pd.merge(b, a, on = 'index')
kmer_infantis = kmer_infantis[['id', 'tSNE1', 'tSNE2']]

In [24]:
# Preview the merged table (columns: id, tSNE1, tSNE2)
kmer_infantis

Unnamed: 0,id,tSNE1,tSNE2
0,ERR3181860,50.336273,33.536388
1,SRR1002832,-4.253155,62.094372
2,SRR1002834,-22.327620,80.730103
3,SRR10222594,18.469278,-64.362190
4,SRR10240828,-5.255455,54.452606
...,...,...,...
2846,SRR9984488,-10.340121,-25.705935
2847,SRR9984489,-40.617153,-11.294018
2848,SRR9984501,-46.082191,-43.567802
2849,SRR9984504,29.236895,-21.540947


In [25]:
# Write the id/coordinate table to CSV, with a header row and without the
# dataframe index

kmer_infantis.to_csv(
    '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/infantis/kmer_matrix/kmer_infantis.csv',
    index=False,
    header=True,
)