# Generating the kmer-distance matrix for tSNE analysis (Newport)

In [1]:
# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import io
from sklearn.decomposition import PCA
from Bio.Phylo.TreeConstruction import _Matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
pd.set_option('display.max_columns', 300)
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Build a lower-triangular distance matrix from the k-mer distance table produced by akronymer


def create_matrix(fname):
    """Read a tab-separated distance table and return it as a ``_Matrix``.

    The first line of ``fname`` is read as the header; every header entry
    after the first becomes a name in the result. The first column of each
    data row is discarded and the remaining columns are the distance values.
    Only the cells strictly below the diagonal are kept, and ``0`` is stored
    for every diagonal entry, so blank cells on or above the diagonal are
    tolerated.

    The table is sliced in memory; nothing is written to disk.

    Parameters
    ----------
    fname : str or path-like
        Location of the tab-separated table, passed to ``pandas.read_csv``.

    Returns
    -------
    _Matrix
        ``_Matrix(names, rows)`` where ``rows[i]`` holds the ``i`` values to
        the left of the diagonal in data row ``i`` followed by ``0``.

    Raises
    ------
    ValueError
        If the number of data rows differs from the number of value columns.
    TypeError
        If a cell below the diagonal is blank or missing.
    """
    table = pd.read_csv(fname, header=0, sep='\t')

    # The first header cell labels the row-name column, not a sample.
    names = table.columns.to_list()[1:]
    values = table.iloc[:, 1:]

    n_rows, n_cols = values.shape
    if n_rows != n_cols:
        raise ValueError(
            "distance table must be square: found %d data rows and %d value columns"
            % (n_rows, n_cols)
        )

    # Only cells strictly below the diagonal are used, so only those must be
    # present. TypeError keeps the exception type callers saw previously, when
    # a blank placeholder was handed to _Matrix.
    missing_below_diagonal = np.tril(values.isna().to_numpy(), k=-1)
    if missing_below_diagonal.any():
        raise TypeError(
            "distance table has missing values below the diagonal"
        )

    # Row i keeps its first i values (left of the diagonal) plus a 0 diagonal.
    rows = values.to_numpy().tolist()
    matrix = [row[:i] + [0] for i, row in enumerate(rows)]

    return _Matrix(names, matrix)

In [3]:
# Build the distance matrix for the Newport input table; the result is kept
# under both `matrix` and `kmernewport`

f1 = '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/newport/input_data/newport_core_genome_alignment.aln_output'
kmernewport = matrix = create_matrix(f1)

In [4]:
# Project the samples onto two t-SNE dimensions (random_state fixed so the
# embedding is repeatable)
# NOTE(review): no `metric` is passed, so TSNE applies its default metric to
# the rows of `kmernewport` rather than treating it as precomputed distances —
# confirm this is intended.

tsne_settings = dict(n_components=2, learning_rate=200, n_iter=1000, random_state=1)
X_tsne = TSNE(**tsne_settings).fit_transform(kmernewport)

In [5]:
X_tsne  # display the array returned by TSNE.fit_transform (one row per sample, two columns)

array([[-28.294851, -19.737156],
       [ 31.618286, -31.05948 ],
       [ 33.777225,  14.183587],
       ...,
       [ 26.211576,  35.969414],
       [ 27.275383,  32.65792 ],
       [ 26.33708 ,  35.877853]], dtype=float32)

In [6]:
# Wrap the t-SNE output in a labelled table; reset_index() exposes the row
# position as an 'index' column so it can serve as a join key

np.random.seed(1)
a = pd.DataFrame(X_tsne, columns=['tSNE1', 'tSNE2']).reset_index()

In [7]:
# Preview the coordinate table: positional 'index' plus tSNE1 and tSNE2
a

Unnamed: 0,index,tSNE1,tSNE2
0,0,-28.294851,-19.737156
1,1,31.618286,-31.059481
2,2,33.777225,14.183587
3,3,34.107281,13.906292
4,4,39.224579,9.566011
...,...,...,...
2360,2360,56.707512,20.267324
2361,2361,17.991688,52.862652
2362,2362,26.211576,35.969414
2363,2363,27.275383,32.657921


In [8]:
# Sample ids in matrix order, with the row position exposed as an 'index' column
b = pd.DataFrame(kmernewport.names, columns=['id']).reset_index()

In [9]:
# Preview the id table: positional 'index' plus the sample id
b

Unnamed: 0,index,id
0,0,SRR1002805
1,1,SRR1002816
2,2,SRR1002817
3,3,SRR1002827
4,4,SRR1002828
...,...,...
2360,2360,SRR953548
2361,2361,SRR953551
2362,2362,SRR980337
2363,2363,SRR980338


In [10]:
# Join ids to coordinates on the shared positional 'index' column, then keep
# only the id and the two embedding columns

kmer_newport = b.merge(a, on='index')[['id', 'tSNE1', 'tSNE2']]

In [11]:
# Preview the merged table: id, tSNE1, tSNE2
kmer_newport

Unnamed: 0,id,tSNE1,tSNE2
0,SRR1002805,-28.294851,-19.737156
1,SRR1002816,31.618286,-31.059481
2,SRR1002817,33.777225,14.183587
3,SRR1002827,34.107281,13.906292
4,SRR1002828,39.224579,9.566011
...,...,...,...
2360,SRR953548,56.707512,20.267324
2361,SRR953551,17.991688,52.862652
2362,SRR980337,26.211576,35.969414
2363,SRR980338,27.275383,32.657921


In [12]:
# Save the per-sample t-SNE coordinates as CSV (header row written, row index omitted)

kmer_newport.to_csv(
    '/Users/joaocarlosgomesneto/Documents/frontiers_paper_salmonella_newport_typhimurium/data/newport/kmer_matrix/kmer_newport.csv',
    index=False,
    header=True,
)