# run with [pipeline 2](https://github.com/jacksonh1/motif_conservation_in_IDRs) environment
`mamba activate slim_conservation`

# grantham matrix

In [1]:
from pathlib import Path

In [5]:
# Folder (relative to this notebook) that will hold the downloaded Grantham matrix;
# created up front, including any missing parents, so the download below has a target.
grantham_directory = Path("../../../data/grantham_matrix/")
grantham_directory.mkdir(parents=True, exist_ok=True)

The Grantham matrix was published in 1974 (DOI: 10.1126/science.185.4154.862). For convenience, I download a TSV copy of the matrix that is distributed with the recent [SHARK](https://git.mpi-cbg.de/tothpetroczylab/shark) tools.

In [6]:
!curl "https://owncloud.mpi-cbg.de/index.php/s/c8F3El6tUAC97hl/download?path=%2Fdata&files=grantham.tsv" -o "../../../data/grantham_matrix/grantham.tsv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1436  100  1436    0     0   1647      0 --:--:-- --:--:-- --:--:--  1646


In [7]:
def normalize_matrix_df(mat_df):
    """Min-max scale every entry of a matrix DataFrame.

    The global minimum over all cells is mapped to 0 and the global maximum
    to 1; every other value is placed linearly in between.

    Parameters
    ----------
    mat_df : pandas.DataFrame
        Numeric matrix to rescale. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``mat_df``.
    """
    # Extrema are taken over the whole frame (column-wise, then across columns).
    lowest = mat_df.min().min()
    highest = mat_df.max().max()
    return (mat_df - lowest) / (highest - lowest)

In [8]:
import local_seqtools.substitution_matrices as submats
import pandas as pd

# Raw matrix as downloaded; the first column of the TSV becomes the row index.
df = pd.read_csv(grantham_directory / "grantham.tsv", sep="\t", index_col=0)

# Min-max scale onto [0, 1], then take 1 - x to reverse the scale
# (the `_sim_` name suggests this turns the values into a similarity).
df_norm = normalize_matrix_df(df)
df_sim_norm = 1 - df_norm

# Integer version on a 0-100 scale, rounded to the nearest whole number.
gran_df_aligner = (df_sim_norm * 100).round().astype(int)

In [10]:
from pyprojroot import here
here()

PosixPath('/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioinformatics/10-pairk-method')

In [12]:
# Resolve the output folder from the project root located by pyprojroot.
root = here()
local_data = root / "src" / "local_data"
matrix_folder = local_data / "substitution_matrices"
# Create the folder if it is missing so the writes below do not fail on a fresh
# checkout (the data directories in this notebook are created the same way).
matrix_folder.mkdir(parents=True, exist_ok=True)

# Raw matrix exactly as read from grantham.tsv.
df.to_csv(matrix_folder / 'grantham.csv', sep=',')
# 1 - (min-max normalized matrix), values in [0, 1].
df_sim_norm.to_csv(matrix_folder / 'grantham_similarity_norm.csv', sep=',')
# Same values scaled by 100 and cast to int, written space-separated with no file extension.
gran_df_aligner.to_csv(matrix_folder / 'grantham_similarity_normx100_aligner_compatible', sep=' ')

# EDSSmat matrices

These are substitution matrices for disordered regions from this publication:<br>
>Trivedi, R., Nagarajaram, H.A. Amino acid substitution scoring matrices specific to intrinsically disordered regions in proteins. Sci Rep 9, 16380 (2019). https://doi.org/10.1038/s41598-019-52532-8

Download and extract the dataset (a `.tar.gz` archive).

In [13]:
# Folder (relative to this notebook) for the downloaded archive and its extracted contents;
# created up front, including any missing parents.
disorder_directory = Path("../../../data/disorder-matrix/")
disorder_directory.mkdir(parents=True, exist_ok=True)

In [14]:
!curl "http://www.cdfd.org.in/labpages/Matrices_and_Datasets.tar.gz" -o "../../../data/disorder-matrix/Matrices_and_Datasets.tar.gz"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 24.0M  100 24.0M    0     0  1033k      0  0:00:23  0:00:23 --:--:-- 1396k 1570k      0  0:00:15  0:00:07  0:00:08 1246k


In [15]:
# Extract the gzipped archive into the disorder-matrix folder (-C); -v lists each extracted file.
!tar -xvzf "../../../data/disorder-matrix/Matrices_and_Datasets.tar.gz" -C "../../../data/disorder-matrix/"

x Matrices_and_Datasets/
x Matrices_and_Datasets/.DS_Store
x Matrices_and_Datasets/Datasets/
x Matrices_and_Datasets/Datasets/LD_test_dataset
x Matrices_and_Datasets/Datasets/._HD_test_dataset
x Matrices_and_Datasets/Datasets/MD_test_dataset
x Matrices_and_Datasets/Datasets/EUMAT_dataset
x Matrices_and_Datasets/Datasets/._MD_test_dataset
x Matrices_and_Datasets/Datasets/HD_test_dataset
x Matrices_and_Datasets/Matrices/
x Matrices_and_Datasets/Matrices/EDSSMat80
x Matrices_and_Datasets/Matrices/EDSSMat70
x Matrices_and_Datasets/Matrices/EDSSMat75
x Matrices_and_Datasets/Matrices/EDSSMat90
x Matrices_and_Datasets/Matrices/EDSSMat50
x Matrices_and_Datasets/Matrices/EDSSMat60
x Matrices_and_Datasets/Matrices/EDSSMat62


In [16]:
# Only EDSSMat50 is copied out of the extracted archive, into local_data/substitution_matrices/.
!cp "../../../data/disorder-matrix/Matrices_and_Datasets/Matrices/EDSSMat50" "../../local_data/substitution_matrices/"

---

In [19]:
from Bio import Align

In [32]:
# Distinct first elements of the BLOSUM62 keys, as a list (set order is arbitrary).
list({key[0] for key in Align.substitution_matrices.load("BLOSUM62").keys()})

24

In [37]:
# NOTE(review): load() is called without a matrix name here, presumably to list the
# matrices that Biopython ships with; confirm against the Biopython documentation.
Align.substitution_matrices.load()

list

In [18]:
def load_matrix_as_df(matrix_name):
    """Load a Biopython substitution matrix into a float pandas DataFrame.

    Parameters
    ----------
    matrix_name : str
        Name handed to ``Align.substitution_matrices.load`` (e.g. "BLOSUM62").

    Returns
    -------
    pandas.DataFrame
        Float frame whose rows and columns start as the 20 one-letter
        amino-acid codes in alphabetical order followed by "B", "Z", "X"
        and "*". Each cell holds the value stored under the matching key of
        the loaded matrix; cells that no key fills remain NaN.
    """
    substitution_matrix = Align.substitution_matrices.load(matrix_name)

    # 20 one-letter codes (already alphabetical), then the extra symbols B, Z, X and *.
    labels = list("ACDEFGHIKLMNPQRSTVWY") + ["B", "Z", "X", "*"]

    # Start from an empty labelled grid and fill it one key at a time.
    scores = pd.DataFrame(index=labels, columns=labels)
    for key in substitution_matrix.keys():
        scores.loc[key[0], key[1]] = substitution_matrix[key]

    return scores.astype(float)

In [None]:
# Export BLOSUM62 as a space-separated table with no file extension.
# NOTE: `matrix_folder` is defined in the Grantham export cell earlier in this notebook,
# so that cell must have been run first.
matrix_name = "BLOSUM62"
matBLOSUM62_df = load_matrix_as_df(matrix_name)
matBLOSUM62_df.to_csv(matrix_folder / 'BLOSUM62', sep=' ')