In [21]:
import pandas as pd
from src.config import get_interim_dir
import seaborn as sns
from random import choices
from pyod.models.mcd import MCD
from pyod.models.ecod import ECOD
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [22]:
all_data = {}
for t in ["T1", "T2", "T3"]:
    all_data[t] = pd.read_pickle(get_interim_dir() + "agg_pca_analysis_{}.pkl".format(t))

In [23]:
X = all_data["T2"]["X"]

In [24]:
X = all_data[t]["X"]
profiles=[]
def generate_bootstrap_samples(x):
    repair_outcomes = x.index.to_list()
    outcome_frequencies = x.values
    boostrapped_outcomes = pd.Series(choices(repair_outcomes, outcome_frequencies, k=1000)).value_counts()
    boostrapped_outcomes = boostrapped_outcomes/boostrapped_outcomes.sum()
    # profiles.loc[x.name, boostrapped_outcomes.index] = boostrapped_outcomes.values
    profiles.append(pd.DataFrame({
        "Gene": x.name,
        "Frequency" : boostrapped_outcomes.values,
        "Outcome": boostrapped_outcomes.index.to_list(),
        }))

X.apply(generate_bootstrap_samples, axis=1)
profiles = pd.concat(profiles)
profiles = profiles.pivot(index=["Gene"], columns=["Outcome"], values="Frequency")
profiles.sum(axis=1).head()

Gene
0610009B22Rik    1.0
0610010K14Rik    1.0
0610030E20Rik    1.0
0610040J01Rik    1.0
1110004F10Rik    1.0
dtype: float64

In [25]:
# need to deal with zero values
print("{:.2f} of our data is zero".format((profiles.isnull()).sum().sum()/(profiles.shape[0] * profiles.shape[1])))

0.36 of our data is zero


In [26]:
# before applyiing log transformations, we need to add random uniform values between 0.1*DL and DL to our zero values
# where DL is the "Detection Limit". In our case, we have 1000 reads. This means that repair outcomes that occur with less than 1/1000 frequency are unlikely
# to be represented. Thus we choose 0.001 to be our DL
# see https://reader.elsevier.com/reader/sd/pii/S0169743921000162?token=17AE18CDF91419243B4FE3BA84D1010DC2FBE394617633976872735270CE3EB70118E1FD4D02151635B72A2D7EE97ADC&originRegion=eu-west-1&originCreation=20221017112946
# "Comparison of zero replacement strategies for compositional data with large numbers of zeros", 2021

DL = 0.001
M = len(profiles.index)
N = len(profiles.columns)
ran = pd.DataFrame(np.random.uniform(low=0.1*DL, high=DL, size=(M,N)), columns=profiles.columns, index=profiles.index)

ran.update(profiles)
ran.sum(axis=1)
profiles_imp = ran
profiles_imp = profiles_imp.div(profiles_imp.sum(axis=1), axis=0)
print("{:.2f} of our data is zero".format((profiles_imp.isnull()).sum().sum()/(profiles_imp.shape[0] * profiles_imp.shape[1])))

0.00 of our data is zero


In [27]:
# need to perform isometric log ratio transform]
# https://onlinelibrary.wiley.com/doi/epdf/10.1002/env.966
# https://www.youtube.com/watch?v=fQPCeV4MUe4&t=2s
# https://www.youtube.com/watch?v=WQ29fPfOngA&list=PLh35GyCXlQaSLdi-kMCNohZSEKNDVSO-I&index=1


from skbio.stats.composition import ilr, clr

print(profiles_imp.shape)
Z = ilr(profiles_imp.values) # Eq 5
print(Z.shape)
Y = clr(profiles_imp.values)

(17884, 88)
(17884, 87)


In [28]:
D = Y.shape[1]
V = np.array([np.sqrt(i/(i+1)) * np.concatenate((np.repeat(1/i, i), np.array([-1]), np.repeat(0, D - i - 1))) for i in range(1, D)])
V.shape

(87, 88)

In [29]:
from sklearn.covariance import MinCovDet
from sklearn.covariance import EmpiricalCovariance
from numpy.linalg import svd


cov = MinCovDet(random_state=0).fit(Z)
# cov = EmpiricalCovariance().fit(Z)
T = cov.location_
C = cov.covariance_

G, L, G_inv = svd(C) 
# G = eigenvectors
# L = eigenvalues

Z_t = np.dot(Z - T, G)

In [30]:
print("sanity checks...")
print(np.all(np.equal(np.round_(Y, 4), np.round_(np.dot(Z, V), 4))))
print(np.all(np.equal(np.round_(Z, 4), np.round_(np.dot(Y, np.transpose(V)), 4))))
print(np.all(np.equal(np.round(G_inv, 4), np.round(np.linalg.inv(G), 4))))

sanity checks...
True
True
True


In [31]:
Y_t = np.dot(Z_t, V)

In [32]:
C_y = np.dot(np.dot(np.transpose(V), (G * L * G_inv)), V)

In [33]:
G_y = np.dot(G, V)
np.sum(np.round(np.dot(G_y, np.transpose(G_y)), 4)) == (D-1)

True

In [34]:
from scipy.stats import chi2

# calculate mahalanobis distances
distances = cov.mahalanobis(Z)

# select significance threshold
pvalues = 1 - chi2.cdf(distances, Z.shape[1] - 1)
pvalues


array([0.96181325, 0.9436936 , 0.8686101 , ..., 0.13655928, 0.89406776,
       0.44464375])

In [35]:
from statsmodels.stats.multitest import fdrcorrection

rejected, corrected_pvalues = fdrcorrection(pvalues)
sum(rejected)

2050

In [66]:
df = pd.DataFrame({
    "pvalue": pvalues,
    "rejected": rejected,
    "corrected_pvalues": corrected_pvalues,
}, index=profiles.index)
df.sort_values("pvalue").index.get_loc("Gzmg")

2026

### What is the effect of the different types of transformation?


In [15]:
import pandas as pd
from src.config import get_interim_dir

X = pd.read_pickle(get_interim_dir() + "genewise_repair_outcome_profiles.pkl")
counts = X.iloc[:,-1:]
X = X.iloc[:,:-1]
counts.head()

Unnamed: 0_level_0,Feature,counts
Target,Gene,Unnamed: 2_level_1
T1,0610009B22Rik,31948.0
T1,0610010K14Rik,42869.0
T1,0610030E20Rik,16913.0
T1,0610040J01Rik,12071.0
T1,1110004F10Rik,12516.0


In [5]:
X[X>0].min().min()

7.288895367906994e-06

In [37]:
X_unscaled = (X * counts.values)
X_unscaled

Unnamed: 0_level_0,Feature,"1bp insertion - A, 0","1bp insertion - C, 0","1bp insertion - G, 0","1bp insertion - T, 0",">1bp insertion, 0","Deletion 0bp microhomology, 1","Deletion 0bp microhomology, 10","Deletion 0bp microhomology, 11","Deletion 0bp microhomology, 12","Deletion 0bp microhomology, 13",...,"Deletion 3bp microhomology, 9","Deletion 4bp microhomology, 11","Deletion 4bp microhomology, 30","Deletion 5-15bp microhomology, 21","Deletion with insertion, 0","Deletion with templated insertion, 0","Homology Directed Repair, 0","Large Deletion, 0","Tandem Duplication, 0","Very Large Deletion, 0"
Target,Gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T1,0610009B22Rik,2097.620625,91.407758,226.816128,112.804050,727.717777,18.302336,90.436022,223.377588,37.100880,39.990532,...,13.048668,3825.716923,0.000000,0.000000,2024.972415,601.906319,5902.299239,783.011864,13.666469,422.488959
T1,0610010K14Rik,2607.985635,68.542186,248.428514,148.571474,1220.085186,16.388530,95.175620,163.374350,62.293276,54.038748,...,9.161783,5366.522141,0.000000,0.000000,2941.064663,912.275504,7431.315592,1182.424911,22.794529,286.451213
T1,0610030E20Rik,1027.397547,33.145610,127.760163,68.264679,405.934051,6.105776,41.036029,118.611070,4.320327,18.368854,...,0.000000,2345.748931,0.000000,0.000000,1031.555419,291.731141,2884.117524,233.596383,6.949875,269.596220
T1,0610040J01Rik,717.188131,10.438430,135.881278,25.147894,338.033724,14.780816,21.224807,58.251516,11.620238,12.364123,...,3.836326,1651.020060,0.000000,0.000000,903.353850,316.757686,1749.209807,358.759360,11.214701,114.854770
T1,1110004F10Rik,718.572098,15.409608,77.694863,46.389189,237.571709,0.000000,33.737009,27.187348,22.797839,35.756739,...,4.531499,1706.471221,0.000000,0.000000,724.692794,178.302196,2612.817961,326.915533,5.201995,119.362701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T3,Zzef1,22.120295,142.125527,7028.660511,48.197997,218.721965,756.983279,156.130604,66.190867,107.703392,17.235752,...,0.000000,0.000000,85.266365,706.638521,2298.239211,624.560165,4593.280077,634.719727,0.000000,583.185758
T3,Zzz3,68.055249,172.810698,7338.814185,122.759933,227.264071,765.796937,105.187039,140.368597,15.930051,35.383720,...,0.000000,0.000000,69.460922,637.229877,2430.819196,551.249386,5106.010723,704.870543,9.748610,633.401615
T3,a,23.475087,168.117824,6000.243065,44.902977,243.114732,655.735151,95.703609,61.750514,62.869481,6.543185,...,0.000000,0.000000,12.883304,492.588066,1987.732791,383.833910,3841.836175,396.805852,0.000000,389.063580
T3,ccdc198,29.251128,127.000407,6397.007671,68.364319,235.565318,727.220514,68.765969,71.499077,27.226729,8.167438,...,0.000000,0.000000,13.797600,756.740332,2162.838376,588.033579,4575.042617,471.277738,5.289554,352.512678


In [61]:
import numpy as np

def alr(X):
    X_r = X_unscaled.iloc[:,0]
    X_2 = X_unscaled.iloc[:,1:]
    X_3 = X_2.div(X_r, axis=0)
    return np.log2(X_3)

print(X.iloc[:,1:].iloc[:5,4])

print(alr(X_unscaled).iloc[:5,4])

print(alr(X).iloc[:5,4])

Target  Gene         
T1      0610009B22Rik    0.000573
        0610010K14Rik    0.000382
        0610030E20Rik    0.000361
        0610040J01Rik    0.001224
        1110004F10Rik    0.000000
Name: Deletion 0bp microhomology, 1, dtype: float64
Target  Gene         
T1      0610009B22Rik   -6.840582
        0610010K14Rik   -7.314106
        0610030E20Rik   -7.394604
        0610040J01Rik   -5.600554
        1110004F10Rik        -inf
Name: Deletion 0bp microhomology, 1, dtype: float64
Target  Gene         
T1      0610009B22Rik   -6.840582
        0610010K14Rik   -7.314106
        0610030E20Rik   -7.394604
        0610040J01Rik   -5.600554
        1110004F10Rik        -inf
Name: Deletion 0bp microhomology, 1, dtype: float64


In [70]:
X

Unnamed: 0_level_0,Feature,"1bp insertion - A, 0","1bp insertion - C, 0","1bp insertion - G, 0","1bp insertion - T, 0",">1bp insertion, 0","Deletion 0bp microhomology, 1","Deletion 0bp microhomology, 10","Deletion 0bp microhomology, 11","Deletion 0bp microhomology, 12","Deletion 0bp microhomology, 13",...,"Deletion 3bp microhomology, 9","Deletion 4bp microhomology, 11","Deletion 4bp microhomology, 30","Deletion 5-15bp microhomology, 21","Deletion with insertion, 0","Deletion with templated insertion, 0","Homology Directed Repair, 0","Large Deletion, 0","Tandem Duplication, 0","Very Large Deletion, 0"
Target,Gene,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
T1,0610009B22Rik,0.065657,0.002861,0.007100,0.003531,0.022778,0.000573,0.002831,0.006992,0.001161,0.001252,...,0.000408,0.119748,0.000000,0.000000,0.063383,0.018840,0.184747,0.024509,0.000428,0.013224
T1,0610010K14Rik,0.060836,0.001599,0.005795,0.003466,0.028461,0.000382,0.002220,0.003811,0.001453,0.001261,...,0.000214,0.125184,0.000000,0.000000,0.068606,0.021281,0.173349,0.027582,0.000532,0.006682
T1,0610030E20Rik,0.060746,0.001960,0.007554,0.004036,0.024001,0.000361,0.002426,0.007013,0.000255,0.001086,...,0.000000,0.138695,0.000000,0.000000,0.060992,0.017249,0.170527,0.013812,0.000411,0.015940
T1,0610040J01Rik,0.059414,0.000865,0.011257,0.002083,0.028004,0.001224,0.001758,0.004826,0.000963,0.001024,...,0.000318,0.136776,0.000000,0.000000,0.074837,0.026241,0.144910,0.029721,0.000929,0.009515
T1,1110004F10Rik,0.057412,0.001231,0.006208,0.003706,0.018981,0.000000,0.002696,0.002172,0.001821,0.002857,...,0.000362,0.136343,0.000000,0.000000,0.057901,0.014246,0.208758,0.026120,0.000416,0.009537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T3,Zzef1,0.000782,0.005023,0.248424,0.001704,0.007731,0.026755,0.005518,0.002339,0.003807,0.000609,...,0.000000,0.000000,0.003014,0.024976,0.081230,0.022075,0.162347,0.022434,0.000000,0.020612
T3,Zzz3,0.002174,0.005521,0.234467,0.003922,0.007261,0.024466,0.003361,0.004485,0.000509,0.001130,...,0.000000,0.000000,0.002219,0.020359,0.077662,0.017612,0.163131,0.022520,0.000311,0.020236
T3,a,0.000965,0.006913,0.246731,0.001846,0.009997,0.026964,0.003935,0.002539,0.002585,0.000269,...,0.000000,0.000000,0.000530,0.020255,0.081736,0.015783,0.157977,0.016317,0.000000,0.015998
T3,ccdc198,0.001065,0.004625,0.232949,0.002490,0.008578,0.026482,0.002504,0.002604,0.000991,0.000297,...,0.000000,0.000000,0.000502,0.027557,0.078760,0.021413,0.166601,0.017162,0.000193,0.012837
