# run with [slim_conservation_scoring](https://github.com/jacksonh1/slim_conservation_scoring) environment
`mamba activate slim_conservation_scoring`

# grantham matrix

In [2]:
from pathlib import Path

In [3]:
grantham_directory = Path("../../data/matrices/grantham_matrix/")
grantham_directory.mkdir(parents=True, exist_ok=True)

The grantham matrix is from a paper from 1974 (DOI: 10.1126/science.185.4154.862). I am going to download a tsv of this matrix from the recent [SHARK]( https://git.mpi-cbg.de/tothpetroczylab/shark) tools for convenience.

In [4]:
!curl "https://owncloud.mpi-cbg.de/index.php/s/c8F3El6tUAC97hl/download?path=%2Fdata&files=grantham.tsv" -o "../../data/matrices/grantham_matrix/grantham.tsv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1436  100  1436    0     0    983      0  0:00:01  0:00:01 --:--:--   982


In [5]:
import .substitution_matrices as submats
import pandas as pd

df = pd.read_csv(grantham_directory / 'grantham.tsv', sep='\t', index_col=0)
df.to_csv(grantham_directory / 'grantham.csv')

df_norm = submats.normalize_matrix_df(df)
df_norm.to_csv(grantham_directory / 'grantham_norm.csv')

# convert the distance matrix to a similarity matrix
df_sim_norm = 1-df_norm
df_sim_norm.to_csv(grantham_directory / 'grantham_similarity_norm.csv')

# EDSSmat matrices

These are substitution matrices for disordered regions from this publication:<br>
>Trivedi, R., Nagarajaram, H.A. Amino acid substitution scoring matrices specific to intrinsically disordered regions in proteins. Sci Rep 9, 16380 (2019). https://doi.org/10.1038/s41598-019-52532-8

download and unzip the dataset

In [6]:
disorder_directory = Path("../../data/matrices/disorder-matrix/")
disorder_directory.mkdir(parents=True, exist_ok=True)

In [9]:
!curl "http://www.cdfd.org.in/labpages/Matrices_and_Datasets.tar.gz" -o "../../data/matrices/disorder-matrix/Matrices_and_Datasets.tar.gz"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:44 --:--:--     0    0 --:--:--  0:00:12 --:--:--     0     0 --:--:--  0:00:17 --:--:--     0^C


In [12]:
!tar -xvzf "../../data/matrices/disorder-matrix/Matrices_and_Datasets.tar.gz" -C "../../data/matrices/disorder-matrix/"

x Matrices_and_Datasets/
x Matrices_and_Datasets/.DS_Store
x Matrices_and_Datasets/Datasets/
x Matrices_and_Datasets/Datasets/LD_test_dataset
x Matrices_and_Datasets/Datasets/._HD_test_dataset
x Matrices_and_Datasets/Datasets/MD_test_dataset
x Matrices_and_Datasets/Datasets/EUMAT_dataset
x Matrices_and_Datasets/Datasets/._MD_test_dataset
x Matrices_and_Datasets/Datasets/HD_test_dataset
x Matrices_and_Datasets/Matrices/
x Matrices_and_Datasets/Matrices/EDSSMat80
x Matrices_and_Datasets/Matrices/EDSSMat70
x Matrices_and_Datasets/Matrices/EDSSMat75
x Matrices_and_Datasets/Matrices/EDSSMat90
x Matrices_and_Datasets/Matrices/EDSSMat50
x Matrices_and_Datasets/Matrices/EDSSMat60
x Matrices_and_Datasets/Matrices/EDSSMat62


---

# prepare some normalized and aligner-friendly version of the matrices and save them in the local_data directory
- saving them in the local_data directory allows them to be installed with the local source tools as a package

In [15]:
from pathlib import Path
import pandas as pd
from Bio import Align
from pyprojroot import here

In [16]:
root = here()
local_data = root / "slim_conservation_scoring" / "local_data"
matrix_folder = local_data / "substitution_matrices"
matrix_folder.mkdir(exist_ok=True)

In [17]:
def prep_matrix_dfs(mat_df, mat_name, output_dir: str|Path="./matrices"):
    output_dir = Path(output_dir)
    mat_df = mat_df.copy()
    mat_df_new_diag = submats.matrixdf_diagonal_2_max_off_diagonal(mat_df)
    mat_df_norm = submats.normalize_matrix_df(mat_df)
    mat_df_new_diag_norm = submats.normalize_matrix_df(mat_df_new_diag)
    mat_df_sqrt_norm = submats.sqrt_normalize_matrix_df(mat_df)

    mat_df.to_csv(output_dir / f"{mat_name}.csv")
    mat_df_new_diag.to_csv(output_dir / f"{mat_name}_max_off_diagonal.csv")
    mat_df_norm.to_csv(output_dir / f"{mat_name}_norm.csv")
    mat_df_new_diag_norm.to_csv(output_dir / f"{mat_name}_max_off_diagonal_norm.csv")
    mat_df_sqrt_norm.to_csv(output_dir / f"{mat_name}_sqrt_norm.csv")

In [18]:
matrix_name = "BLOSUM62"
matBLOSUM62_df = submats.load_matrix_as_df(matrix_name)
prep_matrix_dfs(matBLOSUM62_df, matrix_name, output_dir=matrix_folder)

In [20]:
matrix_name = "EDSSMat50"
mat = Align.substitution_matrices.read(
    disorder_directory / "Matrices_and_Datasets/Matrices/EDSSMat50"
)
matEDSS50_df = submats.convert_matrix_array_2_df(mat)
prep_matrix_dfs(matEDSS50_df, matrix_name, output_dir=matrix_folder)
import shutil
shutil.copy(disorder_directory / "Matrices_and_Datasets/Matrices/EDSSMat50", matrix_folder / "EDSSMat50")

PosixPath('/Users/jackson/Dropbox (MIT)/work/07-SLiM_bioinformatics/05-conservation_pipeline/src/local_data/substitution_matrices/EDSSMat50')

In [21]:
matrix_name = "grantham_similarity"
gran_df = pd.read_csv(grantham_directory / "grantham_similarity_norm.csv", index_col=0)
prep_matrix_dfs(gran_df, matrix_name, output_dir=matrix_folder)
gran_df_aligner = round(gran_df*100)
gran_df_aligner = gran_df_aligner.astype(int)
gran_df_aligner.to_csv(matrix_folder / 'grantham_similarity_normx100_aligner_compatible', sep=' ')