In [20]:
import numpy       as np
import pandas      as pd
import seaborn     as sb
import altair      as alt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# Let pandas render up to 100 columns before it starts eliding them.
pd.options.display.max_columns = 100
# Lift Altair's row-count limit for embedded data; the trailing ';' hides the cell output.
alt.data_transformers.disable_max_rows();

In [2]:
# Read the macros table indexed by its "Uniprot Code" column and append
# a "Longitud" column holding the length of each sequence string.
df = (
    pd.read_csv("../data/macros.csv", index_col="Uniprot Code")
      .assign(Longitud=lambda table: table["Secuencia"].str.len())
)
df

Unnamed: 0_level_0,Tipo de Macro,Secuencia,Longitud
Uniprot Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
O28751,AF-1521-like,MEVLFEAKVGDITLKLAQGDITQYPAKAIVNAANKRLEHGGGVAYA...,192
D3RWS7,AF-1521-like,MEVEVVRELEMDKLKVKLAGGDITKYPAEAIVNAANKYLEHGGGVA...,193
D2RH24,AF-1521-like,MVVKKFGSVEVVLEKGDITKYPAEAIVNAANKYLEHGGGVALAIAK...,193
A0A0F7ICE9,AF-1521-like,MKPEVVLRFSGVEVRLVQGDITKYPAEAIVNAANRHLEHGGGVAYA...,194
A0A075LQ95,AF-1521-like,MNLTELTFGNLTFKLAQGDITKLPAEAIVNAANKYLEHGGGVALAI...,190
...,...,...,...
P0C6Y5,Virus-type,MSSKQFKILVNEDYQVNVPSLPIRDVLQEIKYCYRNGFEGYVFVPE...,6684
A0A0P0LKV0,Virus-type,MACNRFTLAVASDSEISGTGCATVAQAVRLYSEAAVNGFRACRFVS...,6763
A0A0U2GMU3,Virus-type,MACNRVTLAVASDTEISATGCSTIALAVRRYSEAASNGFRACRFVS...,6763
B1PHJ4,Virus-type,MSSNLVTLAFASDSEISAEGFCDVSSAVYAFSVSAANGFTDCRFVA...,4268


# Número de secuencias por grupo

In [10]:
# Horizontal bar chart with one bar per "Tipo de Macro" value, sorted by
# descending record count.
# count() is written without a field: the frame has no "y" column, and the
# count aggregate does not depend on a field anyway.
bars = alt.Chart(df).mark_bar().encode(
    x='count():Q',
    y=alt.Y('Tipo de Macro:N', sort='-x')
)

# Overlay the numeric count just to the right of each bar's end.
text = bars.mark_text(align='left', baseline='middle', dx=3).encode(text='count():Q')
(bars + text).properties(height=300)

# Histograma de longitudes de secuencias

In [12]:
# Bar chart of record counts against sequence length on a log-scaled x axis.
# No binning is applied, so every distinct "Longitud" value gets its own bar.
length_axis = alt.X('Longitud:Q', scale=alt.Scale(type='log'))
length_chart = alt.Chart(df).mark_bar().encode(x=length_axis, y='count()')
length_chart.interactive()

# Count Vectorizer

In [34]:
# Character-level bag of words: each distinct character of a sequence is a
# token (unigrams only, case preserved), so every row holds per-character counts.
bow = CountVectorizer(analyzer='char', lowercase=False, ngram_range=(1, 1))
count_matrix = bow.fit_transform(df.Secuencia)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(bow, "get_feature_names_out"):
    # Convert the NumPy strings to plain str so the printed list stays readable.
    tokens = [str(token) for token in bow.get_feature_names_out()]
else:
    tokens = bow.get_feature_names()

print("Aminoacids:", len(tokens), tokens)
# Dense frame of counts, aligned with df on the same index.
df_count = pd.DataFrame(data=count_matrix.toarray(), index=df.index, columns=tokens)
# Label column kept for display; it is not a numeric feature.
df_count["Tipo de Macro"] = df["Tipo de Macro"]
df_count.head()

Aminoacids: 21 ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y']


Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,X,Y,Tipo de Macro
Uniprot Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
O28751,24,3,7,21,8,17,3,11,18,15,5,4,5,3,6,8,6,19,1,0,8,AF-1521-like
D3RWS7,22,3,7,25,5,14,4,12,23,14,5,4,6,1,7,6,6,20,1,0,8,AF-1521-like
D2RH24,21,3,7,21,10,15,3,12,24,14,3,7,8,2,3,5,5,22,1,0,7,AF-1521-like
A0A0F7ICE9,21,3,3,24,6,16,4,11,13,16,4,5,7,3,13,9,5,22,1,0,8,AF-1521-like
A0A075LQ95,22,2,4,20,6,18,4,15,19,19,3,8,6,2,5,7,7,14,1,0,8,AF-1521-like


# Term Frequency Vectorizer
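- **norm="l1"** -> each row is divided by the sum of its values, so the values of a row add up to 1 (the relative frequency of each token within the sequence)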

In [35]:
# Character-level term frequencies: with use_idf=False and norm="l1" each row
# is the per-character count divided by the row total (unigrams, case preserved).
tf = TfidfVectorizer(analyzer='char', use_idf=False, norm="l1", lowercase=False, ngram_range=(1, 1))
tf_matrix = tf.fit_transform(df.Secuencia)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tf, "get_feature_names_out"):
    # Convert the NumPy strings to plain str so they behave like ordinary column labels.
    tokens = [str(token) for token in tf.get_feature_names_out()]
else:
    tokens = tf.get_feature_names()

# Dense frame of frequencies, aligned with df on the same index.
df_tf = pd.DataFrame(data=tf_matrix.toarray(), index=df.index, columns=tokens)
# Label column kept for display; it is not a numeric feature.
df_tf["Tipo de Macro"] = df["Tipo de Macro"]
df_tf.head()

Unnamed: 0_level_0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,X,Y,Tipo de Macro
Uniprot Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
O28751,0.125,0.015625,0.036458,0.109375,0.041667,0.088542,0.015625,0.057292,0.09375,0.078125,0.026042,0.020833,0.026042,0.015625,0.03125,0.041667,0.03125,0.098958,0.005208,0.0,0.041667,AF-1521-like
D3RWS7,0.11399,0.015544,0.036269,0.129534,0.025907,0.072539,0.020725,0.062176,0.119171,0.072539,0.025907,0.020725,0.031088,0.005181,0.036269,0.031088,0.031088,0.103627,0.005181,0.0,0.041451,AF-1521-like
D2RH24,0.108808,0.015544,0.036269,0.108808,0.051813,0.07772,0.015544,0.062176,0.124352,0.072539,0.015544,0.036269,0.041451,0.010363,0.015544,0.025907,0.025907,0.11399,0.005181,0.0,0.036269,AF-1521-like
A0A0F7ICE9,0.108247,0.015464,0.015464,0.123711,0.030928,0.082474,0.020619,0.056701,0.06701,0.082474,0.020619,0.025773,0.036082,0.015464,0.06701,0.046392,0.025773,0.113402,0.005155,0.0,0.041237,AF-1521-like
A0A075LQ95,0.115789,0.010526,0.021053,0.105263,0.031579,0.094737,0.021053,0.078947,0.1,0.1,0.015789,0.042105,0.031579,0.010526,0.026316,0.036842,0.036842,0.073684,0.005263,0.0,0.042105,AF-1521-like


In [32]:
def plot_multidimensional(X, y):
    """Show linked 2-D PCA and t-SNE projections of ``X`` coloured by ``y``.

    Parameters
    ----------
    X : feature matrix with one row per sample. When a pandas DataFrame is
        given, only its numeric columns are used, so a frame that also
        carries a label column (such as "Tipo de Macro") can be passed as is.
    y : labels, one per row of ``X``; used as the nominal colour field.

    Returns
    -------
    An Altair chart made of three panels side by side: the PCA scatter, the
    t-SNE scatter, and a bar chart counting the labels of the points inside
    the interval selection. Points outside the selection are drawn light gray.
    """
    # PCA and t-SNE need a purely numeric matrix; string columns would make
    # fit_transform fail, so they are dropped up front.
    if isinstance(X, pd.DataFrame):
        X = X.select_dtypes(include="number")

    x_pca  = PCA(n_components=2).fit_transform(X)  # 2-component PCA projection
    x_tsne = TSNE(random_state=0).fit_transform(X) # t-SNE embedding, fixed seed

    # Named "points" rather than "df" to avoid shadowing the notebook-level frame.
    points = pd.DataFrame({'pca1': x_pca[:, 0],  'pca2': x_pca[:, 1],
                           'tsne1': x_tsne[:, 0], 'tsne2': x_tsne[:, 1],
                           "y": y})

    # One interval selection shared by all panels.
    brush = alt.selection_interval(resolve='global')

    # Base scatter without x/y; the positional encodings are added per panel below.
    scatter = alt.Chart(points).mark_circle().encode(
        color=alt.condition(brush, 'y:N', alt.ColorValue('lightgray')),
    ).add_selection(
        brush
    ).properties(
        width=250,
        height=250
    )

    # Label counts restricted to the brushed points.
    bars = alt.Chart(points).mark_bar().encode(
        x='y:N',
        y='count(y):Q',
        color='y:N',
    ).transform_filter(
        brush
    ).properties(
        width=250,
        height=250
    )

    return scatter.encode(x='pca1', y='pca2') | scatter.encode(x='tsne1', y='tsne2') | bars

# df_count also holds the string "Tipo de Macro" column; pass only the
# per-character count columns as features so PCA/t-SNE get numeric input.
plot_multidimensional(df_count.drop(columns="Tipo de Macro"), df["Tipo de Macro"])

In [33]:
# df_tf also holds the string "Tipo de Macro" column; pass only the
# per-character frequency columns as features so PCA/t-SNE get numeric input.
plot_multidimensional(df_tf.drop(columns="Tipo de Macro"), df["Tipo de Macro"])