In [3]:
import pandas as pd
import numpy as np
import mchmm as mc
import seaborn as sbn
from itertools import groupby
from collections import Counter

In [18]:

# Worked example: second-order (pair -> next symbol) transition probabilities
# for a toy string in which "#" separates two sequences.
data = "AAUAU#AUUUU"

# Columns are the distinct symbols, rows are the distinct adjacent pairs.
cols = sorted(set(data))
rows = sorted({data[i:i+2] for i in range(len(data) - 1)})
observed_matrix = pd.DataFrame(0, index=rows, columns=cols)

# Count how often each pair is immediately followed by each symbol.
for i in range(len(data) - 2):
    # One .loc[row, col] access: the chained form .loc[row][col] += 1 updates
    # an intermediate object and is not guaranteed to reach the frame.
    observed_matrix.loc[data[i:i+2], data[i+2]] += 1

# Discard every transition that starts in, or ends on, the separator.
observed_matrix = observed_matrix.drop(index=[c for c in observed_matrix.index.values if "#" in c])
observed_matrix = observed_matrix.drop(columns="#")

# Row-normalise the counts; DataFrame.div aligns the totals on the row index,
# avoiding multi-dimensional indexing of a Series (obs_row_total[:, None]).
obs_row_total = observed_matrix.sum(axis=1)
observed_matrix_prob = observed_matrix.div(obs_row_total, axis=0)

observed_matrix_prob

Unnamed: 0,A,U
AA,0.0,1.0
AU,0.5,0.5
UA,0.0,1.0
UU,0.0,1.0


In [3]:
def transition_matrix_prob(df, sep="#"):
    """Return first-order transition probabilities between sequence symbols.

    All strings in ``df.seq`` are joined with ``sep`` and handed to
    ``mchmm.MarkovChain.from_data``; the chain's ``observed_matrix`` is then
    stripped of the separator state and each row is divided by its row sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``seq`` column of strings (no missing values, since the
        values are passed to ``str.join``).
    sep : str, default "#"
        Separator inserted between sequences. It should be a single character
        that does not occur inside any sequence.

    Returns
    -------
    pandas.DataFrame
        Float matrix indexed (rows and columns) by the remaining states.
        A row whose total is zero yields NaN entries.
    """
    data = sep.join(df.seq.values)
    mm = mc.MarkovChain().from_data(data)
    observed_matrix = pd.DataFrame(mm.observed_matrix, index=mm.states, columns=mm.states, dtype=float)
    # errors="ignore": with a single sequence the separator never appears in
    # the data, so there is no separator state to remove.
    observed_matrix = observed_matrix.drop(index=sep, columns=sep, errors="ignore")
    obs_row_total = observed_matrix.sum(axis=1)
    # DataFrame.div aligns on the row index; indexing the Series as
    # obs_row_total[:, None] is rejected by current pandas releases.
    return observed_matrix.div(obs_row_total, axis=0)

def transition_matrix_prob_2(df, sep="#"):
    """Return second-order transition probabilities (symbol pair -> next symbol).

    All strings in ``df.seq`` are joined with ``sep``. For every position the
    two-character window is the row and the following character is the column.
    Rows containing ``sep`` and the ``sep`` column are removed so that no
    transition crosses a sequence boundary, then each row is divided by its
    row sum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``seq`` column of strings (no missing values, since the
        values are passed to ``str.join``).
    sep : str, default "#"
        Separator inserted between sequences. It should be a single character
        that does not occur inside any sequence.

    Returns
    -------
    pandas.DataFrame
        Float matrix with the sorted symbol pairs as index and the sorted
        symbols as columns. A pair that is only ever followed by the
        separator (or by nothing) has a zero row total and yields NaN.
    """
    data = sep.join(df.seq.values)
    cols = sorted(set(data))
    rows = sorted({data[i:i+2] for i in range(len(data) - 1)})

    # Tally (pair, next symbol) occurrences in one pass instead of
    # incrementing DataFrame cells once per position.
    triple_counts = Counter((data[i:i+2], data[i+2]) for i in range(len(data) - 2))

    observed_matrix = pd.DataFrame(0, index=rows, columns=cols)
    for (pair, nxt), count in triple_counts.items():
        # Single-step .at access; chained .loc[pair][nxt] += 1 is not
        # guaranteed to write back into the frame.
        observed_matrix.at[pair, nxt] = count

    # Filter on the actual separator (not a hard-coded "#") so a custom
    # ``sep`` is honoured for the rows as well as for the column.
    observed_matrix = observed_matrix.drop(index=[pair for pair in observed_matrix.index if sep in pair])
    # errors="ignore": with a single sequence the separator is absent.
    observed_matrix = observed_matrix.drop(columns=sep, errors="ignore")

    obs_row_total = observed_matrix.sum(axis=1)
    # DataFrame.div aligns on the row index; obs_row_total[:, None] is
    # rejected by current pandas releases.
    return observed_matrix.div(obs_row_total, axis=0)


def binomial_prob(df):
    """Return the relative frequency of every symbol across all sequences.

    The strings in ``df.seq`` are concatenated, the total symbol count is
    printed, and each distinct symbol's share of that total (rounded to six
    decimals) is returned in a two-column frame (``letter``, ``prob``) sorted
    by symbol.
    """
    symbols = "".join(df.seq.values)
    N = len(symbols)
    print(N)
    counts = Counter(symbols)
    letters = sorted(counts)
    probs = [round(counts[letter] / N, 6) for letter in letters]
    return pd.DataFrame({"letter": letters, "prob": probs})

def binomial_prob_2(df):
    """Return the relative frequency of every adjacent symbol pair.

    Pairs are counted inside each string of ``df.seq`` separately, so the last
    symbol of one sequence is never paired with the first symbol of the next
    (concatenating the sequences first would introduce such artificial pairs).

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``seq`` column of strings. Sequences shorter than two
        symbols contribute no pairs.

    Returns
    -------
    pandas.DataFrame
        Columns ``letter`` (the two-symbol pair, sorted) and ``prob`` (its
        share of all counted pairs, rounded to six decimals). Empty when no
        pair exists.
    """
    pair_counts = Counter()
    for seq in df.seq.values:
        pair_counts.update(seq[i:i+2] for i in range(len(seq) - 1))

    N = sum(pair_counts.values())
    pairs = sorted(pair_counts)
    # No division happens when pair_counts is empty, so N == 0 is safe.
    probs = [round(pair_counts[pair] / N, 6) for pair in pairs]
    return pd.DataFrame({"letter": pairs, "prob": probs})

### First-order Markov model

In [70]:
# Load the m32k25 sequence table. The first-order transition and
# single-symbol probability exports below are currently commented out.
df_m32k25 = pd.read_csv("../data/pdb_str/all_m32k25_cath.csv")

# m32k25_trans_prob = transition_matrix_prob(df_m32k25)
# m32k25_trans_prob.to_csv("transition_prob_m32k25.csv")

# m32k25_binom_prob = binomial_prob(df_m32k25)
# m32k25_binom_prob.to_csv("binom_prob_m32k25.csv")

4644598


In [108]:
# First-order statistics for the cs219 table. "€" is passed as the sequence
# separator instead of the default "#".
df_cs219 = pd.read_csv("../data/pdb_str/all_cs219_cath.csv")
# Keep only the rows that actually carry a sequence.
df_cs219 = df_cs219.loc[df_cs219.seq.notna()]
cs219_trans_prob = transition_matrix_prob(df_cs219, "€")
cs219_binom_prob = binomial_prob(df_cs219)
cs219_binom_prob.to_csv("binom_prob_cs219.csv")
cs219_trans_prob.to_csv("transition_prob_cs219.csv")

In [72]:
# First-order statistics for the pb table: export the transition
# probabilities and the single-symbol frequencies as CSV files.
df_pb = pd.read_csv("../data/pdb_str/all_pb_cath.csv")
pb_trans_prob = transition_matrix_prob(df_pb)
pb_trans_prob.to_csv("transition_prob_pb.csv")
# binomial_prob also prints the total number of symbols it counted.
pb_binom_prob = binomial_prob(df_pb)
# NOTE(review): written with the default integer index, whereas the aa and
# chem cells pass index=False — confirm which layout downstream readers expect.
pb_binom_prob.to_csv("binom_prob_pb.csv")

13766585


In [5]:
# First-order statistics for the aa table: export the transition
# probabilities and the single-symbol frequencies as CSV files.
df_aa = pd.read_csv("../data/pdb_str/all_aa_cath.csv")
aa_trans_prob = transition_matrix_prob(df_aa)
aa_trans_prob.to_csv("transition_prob_aa.csv")
# binomial_prob also prints the total number of symbols it counted.
aa_binom_prob = binomial_prob(df_aa)
# index=False: only the "letter" and "prob" columns are written.
aa_binom_prob.to_csv("binom_prob_aa.csv", index=False)

In [22]:
# First-order statistics for the chem table: export the transition
# probabilities and the single-symbol frequencies as CSV files.
df_chem = pd.read_csv("../data/pdb_str/all_chem_cath.csv")
chem_trans_prob = transition_matrix_prob(df_chem)
chem_trans_prob.to_csv("transition_prob_chem.csv")
# binomial_prob also prints the total number of symbols it counted.
chem_binom_prob = binomial_prob(df_chem)
# index=False: only the "letter" and "prob" columns are written.
chem_binom_prob.to_csv("binom_prob_chem.csv", index=False)

4730224


### Second-order Markov model

In [10]:
# Reload the m32k25 sequence table. The second-order transition and pair
# probability exports below are currently commented out.
df_m32k25 = pd.read_csv("../data/pdb_str/all_m32k25_cath.csv")
# m32k25_trans_prob = transition_matrix_prob_2(df_m32k25)
# # m32k25_trans_prob.to_csv("transition_prob_2_m32k25.csv")

# m32k25_binom_prob = binomial_prob_2(df_m32k25)
# m32k25_binom_prob.to_csv("binom_prob_2_m32k25.csv")

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,...,P,Q,R,S,T,U,V,W,X,Y
AA,0.466667,0.333333,0.0,0.066667,0.066667,0.0,0.000000,0.0,0.066667,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
AB,0.285714,0.142857,0.0,0.000000,0.000000,0.0,0.285714,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
AC,0.000000,0.000000,0.5,0.000000,0.500000,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
AD,0.000000,0.333333,0.0,0.000000,0.333333,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
AE,0.000000,0.333333,0.0,0.000000,0.333333,0.0,0.000000,0.0,0.333333,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
AI,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
BA,0.000000,0.250000,0.0,0.250000,0.500000,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
BB,0.000000,0.500000,0.0,0.000000,0.250000,0.0,0.250000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
BE,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,1.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0
BG,0.200000,0.000000,0.0,0.200000,0.200000,0.2,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0


In [26]:
# Second-order statistics for the cs219 table. Only the pair frequencies are
# exported; the transition-matrix lines are currently commented out.
df_cs219 = pd.read_csv("../data/pdb_str/all_cs219_cath.csv")
# Keep only the rows that actually carry a sequence.
df_cs219 = df_cs219.loc[df_cs219.seq.notna()]
# cs219_trans_prob = transition_matrix_prob_2(df_cs219, "€")
cs219_binom_prob = binomial_prob_2(df_cs219)
cs219_binom_prob.to_csv("binom_prob_2_cs219.csv")
# cs219_trans_prob.to_csv("transition_prob_2_cs219.csv")

In [25]:
# Load the pb table and display its (rows, columns) shape. The second-order
# exports below are currently commented out.
df_pb = pd.read_csv("../data/pdb_str/all_pb_cath.csv")
df_pb.shape
# pb_trans_prob = transition_matrix_prob_2(df_pb)
# pb_trans_prob.to_csv("transition_prob_2_pb.csv")
# pb_binom_prob = binomial_prob_2(df_pb)
# pb_binom_prob.to_csv("binom_prob_2_pb.csv")

(55300, 2)

In [24]:
# Load the aa table and display its (rows, columns) shape. The second-order
# exports below are currently commented out.
df_aa = pd.read_csv("../data/pdb_str/all_aa_cath.csv")
df_aa.shape
# aa_trans_prob = transition_matrix_prob_2(df_aa)
# aa_trans_prob.to_csv("transition_prob_2_aa.csv")
# aa_binom_prob = binomial_prob_2(df_aa)
# aa_binom_prob.to_csv("binom_prob_2_aa.csv", index=False)

(30741, 2)

In [26]:
# Load the chem table and display its (rows, columns) shape. The second-order
# exports below are currently commented out.
df_chem = pd.read_csv("../data/pdb_str/all_chem_cath.csv")
df_chem.shape
# chem_trans_prob = transition_matrix_prob_2(df_chem)
# chem_trans_prob.to_csv("transition_prob_2_chem.csv")
# chem_binom_prob = binomial_prob_2(df_chem)
# chem_binom_prob.to_csv("binom_prob_2_chem.csv", index=False)

(30741, 2)

# 