# Generating the kmer-distance matrix for tSNE analysis (Typhimurium)

In [1]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import io
from sklearn.decomposition import PCA
from Bio.Phylo.TreeConstruction import _Matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 300)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Function to create matrix using kmer-distance produced by akronymer for group 1


def create_matrix(fname):
    """Build a lower-triangular distance matrix from a k-mer distance table.

    The input is read as a tab-separated table whose header row carries the
    sample names and whose first column carries the row labels. The first
    column (and its header entry) is discarded; only the remaining headers
    are used as names.

    Parameters
    ----------
    fname : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    _Matrix
        ``_Matrix(names, rows)`` where ``rows[i]`` holds the ``i`` values to
        the left of the diagonal in row ``i`` followed by ``0`` for the
        diagonal itself.

    Raises
    ------
    ValueError
        If the number of data rows differs from the number of names, or if a
        value below the diagonal is missing.
    """
    table = pd.read_csv(fname, header=0, sep='\t')

    # The first column holds the row labels, so its header is not a sample name.
    names = table.columns.to_list()[1:]

    # Work on the parsed values in memory: no scratch CSV on disk, so the
    # function does not depend on a machine-specific directory.
    distances = table.iloc[:, 1:].to_numpy()
    if distances.shape[0] != len(names):
        raise ValueError(
            'distance table is not square: %d rows for %d names'
            % (distances.shape[0], len(names))
        )

    lower = []
    for i in range(len(names)):
        # Only the entries left of the diagonal are used; anything on or
        # above the diagonal (including blanks) is ignored.
        below_diagonal = distances[i, :i]
        if pd.isna(below_diagonal).any():
            raise ValueError(
                'missing distance below the diagonal in row %d (%s)'
                % (i, names[i])
            )
        # The diagonal is always written as 0, whatever the file contains.
        lower.append(below_diagonal.tolist() + [0])

    return _Matrix(names, lower)

In [3]:
# Build the distance matrix for group 1 from the k-mer distance table

f1 = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/typhimurium/group_1/input_data/typhimurium_gr1_core_genome_alignment.aln_output'
# Both names refer to the same matrix object.
kmergroup1typh = matrix = create_matrix(f1)

In [4]:
# Project the distance matrix onto two tSNE components.
# random_state is fixed so the embedding is repeatable between runs.

X_tsne = TSNE(
    n_components = 2,
    learning_rate = 200,
    n_iter = 1000,
    random_state = 1,
).fit_transform(kmergroup1typh)

In [5]:
X_tsne  # display the tSNE coordinates (one column per component)

array([[  0.9900065, -51.657223 ],
       [  9.152839 ,   7.2662206],
       [  5.443476 ,   8.903496 ],
       ...,
       [ -2.9266844, -16.400454 ],
       [-29.82595  ,  -9.189602 ],
       [ -8.950323 ,  -4.498015 ]], dtype=float32)

In [6]:
# Wrap the tSNE coordinates in a labelled table

np.random.seed(1)
# reset_index exposes the row position as an 'index' column, which is the
# key used when the coordinates are merged with the sample ids.
a = pd.DataFrame(X_tsne, columns = ['tSNE1', 'tSNE2']).reset_index()

In [7]:
# Preview the tSNE coordinates table (columns: index, tSNE1, tSNE2)
a

Unnamed: 0,index,tSNE1,tSNE2
0,0,0.990007,-51.657223
1,1,9.152839,7.266221
2,2,5.443476,8.903496
3,3,9.549383,6.505447
4,4,9.559116,9.266604
...,...,...,...
1071,1071,11.131242,44.399265
1072,1072,-13.664680,34.302704
1073,1073,-2.926684,-16.400454
1074,1074,-29.825951,-9.189602


In [8]:
# Sample ids taken from the matrix names, with their row position exposed as
# an 'index' column so they can be joined to the tSNE coordinates.
b = pd.DataFrame(kmergroup1typh.names, columns = ['id']).reset_index()

In [9]:
# Preview the sample id table (columns: index, id)
b

Unnamed: 0,index,id
0,0,DRR106950
1,1,ERR023784
2,2,ERR023837
3,3,ERR024361
4,4,ERR024365
...,...,...
1071,1071,SRR9984493
1072,1072,SRR9989219
1073,1073,SRR9989254
1074,1074,SRR9989264


In [10]:
# Join sample ids to their tSNE coordinates on the shared row position,
# then keep only the id and the two components.

kmer_group_1_typhimurium = b.merge(a, on = 'index')[['id', 'tSNE1', 'tSNE2']]

In [11]:
# Preview the merged table (columns: id, tSNE1, tSNE2)
kmer_group_1_typhimurium

Unnamed: 0,id,tSNE1,tSNE2
0,DRR106950,0.990007,-51.657223
1,ERR023784,9.152839,7.266221
2,ERR023837,5.443476,8.903496
3,ERR024361,9.549383,6.505447
4,ERR024365,9.559116,9.266604
...,...,...,...
1071,SRR9984493,11.131242,44.399265
1072,SRR9989219,-13.664680,34.302704
1073,SRR9989254,-2.926684,-16.400454
1074,SRR9989264,-29.825951,-9.189602


In [12]:
# Write the id / tSNE1 / tSNE2 table to CSV, with a header row and without
# the pandas row index.

kmer_group_1_typhimurium.to_csv(
    '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/typhimurium/group_1/kmer_matrix/kmer_group_1_typhimurium.csv',
    index = False,
    header = True,
)