In [19]:
import numpy as np
import scipy as sp
import pandas as pd
pd.options.display.max_rows = 999
import csv
import gzip
import os
import scipy.io
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%reload_ext autoreload
%autoreload 2
np.set_printoptions(suppress=True)
import sys
sys.path.append('/home/roquero/CausalAggregation/Code')
from generateEnvironment import GenerateEnvironment, generate_constraints
from gmm_estimator import SolveProblem
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
pd.options.display.float_format = "{:,.8f}".format

In [2]:
response = pd.read_pickle('response.pkl')
table_crispr = pd.read_pickle('table_crispr.pkl')
table_crispr = table_crispr.join(response, on='cell')

table_crispr = table_crispr[(table_crispr['umi_count']>=0) & (table_crispr['PRKCB_TSS']==0)]
table_crispr.loc[:,'umi_count'] = table_crispr['umi_count'].transform(lambda u : np.log(1+u))
table_crispr = table_crispr.sample(frac=1).reset_index(drop=True)


vars_list = ['chr16.1822_top_two','chr16.1856_top_two','chr16.1857_top_two','chr16.1863_second_two','chr16.1863_top_two','chr16.1865_top_two','chr16.1866_top_two','chr16.1866_second_two','chr16.1867_top_two','chr16.1897_top_two']
data = table_crispr[vars_list+['umi_count', 'ENSG00000166501']]
data -= data.mean()

In [3]:
data.head()

Unnamed: 0,chr16.1822_top_two,chr16.1856_top_two,chr16.1857_top_two,chr16.1863_second_two,chr16.1863_top_two,chr16.1865_top_two,chr16.1866_top_two,chr16.1866_second_two,chr16.1867_top_two,chr16.1897_top_two,umi_count,ENSG00000166501
0,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.17,0.58
1,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.44,3.58
2,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.58,2.58
3,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.44,-1.42
4,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.72,-0.42


In [4]:
data_env_1 = data.loc[:table_crispr.shape[0]//3].copy()
confounder_env_1 = np.random.binomial(1, 0.5, size=data_env_1.shape[0])
data_env_1.loc[:,'chr16.1857_top_two'] += confounder_env_1 *0.1
data_env_1.loc[:,'chr16.1863_second_two'] += confounder_env_1 *0.1
data_env_1.loc[:,'chr16.1863_top_two'] += confounder_env_1 *0.1
data_env_1.loc[:,'chr16.1865_top_two'] += confounder_env_1 *0.1
data_env_1.loc[:,'chr16.1866_top_two'] += confounder_env_1 *0.1
data_env_1.loc[:,'chr16.1866_second_two'] += confounder_env_1*0.1
data_env_1.loc[:,'chr16.1867_top_two'] += confounder_env_1*0.1
data_env_1.loc[:,'chr16.1897_top_two'] += confounder_env_1*0.1
data_env_1.loc[:,'ENSG00000166501'] += (0.5 + confounder_env_1)*np.random.normal()
data_env_1 = data_env_1 - data_env_1.mean()

In [5]:
data_env_2 = data.loc[table_crispr.shape[0]//3:2*table_crispr.shape[0]//3].copy()
confounder_env_2 = np.random.binomial(1, 0.5, size=data_env_2.shape[0])
data_env_2.loc[:,'chr16.1822_top_two'] += confounder_env_2 *0.1
data_env_2.loc[:,'chr16.1856_top_two'] += confounder_env_2 *0.1
data_env_2.loc[:,'chr16.1865_top_two'] += confounder_env_2 *0.1
data_env_2.loc[:,'chr16.1866_top_two'] += confounder_env_2 *0.1
data_env_2.loc[:,'chr16.1866_second_two'] += confounder_env_2*0.1
data_env_2.loc[:,'chr16.1867_top_two'] += confounder_env_2*0.1
data_env_2.loc[:,'chr16.1897_top_two'] += confounder_env_2*0.1
data_env_2.loc[:,'ENSG00000166501'] += (0.5 + confounder_env_2)*2
data_env_2 = data_env_2 - data_env_2.mean()

In [6]:
data_env_3 = data.loc[2*table_crispr.shape[0]//3:].copy()
confounder_env_3 = np.random.binomial(1, 0.5, size=data_env_3.shape[0])
data_env_3.loc[:,'chr16.1822_top_two'] += confounder_env_3 *0.1
data_env_3.loc[:,'chr16.1856_top_two'] += confounder_env_3 *0.1
data_env_3.loc[:,'chr16.1857_top_two'] += confounder_env_3 *0.1
data_env_3.loc[:,'chr16.1863_second_two'] += confounder_env_3 *0.1
data_env_3.loc[:,'chr16.1863_top_two'] += confounder_env_3 *0.1
data_env_3.loc[:,'ENSG00000166501'] += (0.5 + confounder_env_3)*2
data_env_3 = data_env_3 - data_env_3.mean()

In [7]:
environment_1 = {
    'dataset':data_env_1.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [0,1]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_1.values.shape[0],
}
generate_constraints(environment_1)
environment_2 = {
    'dataset':data_env_2.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [2,3,4]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_2.values.shape[0],
}
generate_constraints(environment_2)
environment_3 = {
    'dataset':data_env_3.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [5,6,7,8,9]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_3.values.shape[0],
}
generate_constraints(environment_3)

In [8]:
OLS_full_uncorrupted = sm.OLS(data.loc[:,'ENSG00000166501'],data.loc[:, data.columns!='ENSG00000166501']).fit()
OLS_env_1 = sm.OLS(data_env_1.loc[:,'ENSG00000166501'],data_env_1.loc[:, data_env_1.columns!='ENSG00000166501']).fit()
OLS_env_2 = sm.OLS(data_env_2.loc[:,'ENSG00000166501'],data_env_2.loc[:, data_env_2.columns!='ENSG00000166501']).fit()
OLS_env_3 = sm.OLS(data_env_3.loc[:,'ENSG00000166501'],data_env_3.loc[:, data_env_3.columns!='ENSG00000166501']).fit()

In [9]:
solver = SolveProblem(np.eye(12), np.arange(10), 11)
beta_GMM, aCov = solver.compute_beta_GMM([environment_1, environment_2, environment_3])
CI = solver.compute_CI(beta_GMM, aCov, table_crispr.shape[0], 0.05)

In [10]:
pd.concat([OLS_full_uncorrupted.params,OLS_full_uncorrupted.bse], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,-0.02,0.08
chr16.1856_top_two,0.15,0.08
chr16.1857_top_two,0.28,0.1
chr16.1863_second_two,0.13,0.08
chr16.1863_top_two,0.07,0.08
chr16.1865_top_two,-0.31,0.07
chr16.1866_top_two,-1.41,0.07
chr16.1866_second_two,-1.33,0.1
chr16.1867_top_two,-0.57,0.09
chr16.1897_top_two,-0.07,0.09


In [11]:
pd.DataFrame({0:beta_GMM,1:(CI[1,:]-CI[0,:])/2})

Unnamed: 0,0,1
0,0.04,0.28
1,0.07,0.29
2,0.13,0.41
3,0.14,0.32
4,-0.03,0.3
5,-0.15,0.26
6,-1.39,0.29
7,-1.05,0.37
8,-0.54,0.32
9,0.1,0.35


In [12]:
pd.concat([OLS_env_1.params,OLS_env_1.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,0.04,0.27
chr16.1856_top_two,0.05,0.28
chr16.1857_top_two,0.91,0.32
chr16.1863_second_two,0.53,0.25
chr16.1863_top_two,0.43,0.26
chr16.1865_top_two,-0.08,0.22
chr16.1866_top_two,-1.05,0.23
chr16.1866_second_two,-0.93,0.31
chr16.1867_top_two,-0.17,0.29
chr16.1897_top_two,0.72,0.29


In [13]:
pd.concat([OLS_env_2.params,OLS_env_2.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,2.46,0.27
chr16.1856_top_two,3.37,0.29
chr16.1857_top_two,0.24,0.37
chr16.1863_second_two,-0.09,0.29
chr16.1863_top_two,0.06,0.27
chr16.1865_top_two,1.39,0.23
chr16.1866_top_two,0.68,0.25
chr16.1866_second_two,1.97,0.3
chr16.1867_top_two,2.75,0.3
chr16.1897_top_two,2.44,0.28


In [14]:
pd.concat([OLS_env_3.params,OLS_env_3.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,2.53,0.26
chr16.1856_top_two,2.76,0.26
chr16.1857_top_two,4.48,0.32
chr16.1863_second_two,2.9,0.27
chr16.1863_top_two,2.55,0.26
chr16.1865_top_two,-0.17,0.24
chr16.1866_top_two,-1.38,0.26
chr16.1866_second_two,-1.15,0.34
chr16.1867_top_two,-0.57,0.29
chr16.1897_top_two,0.04,0.32


In [20]:
OLS_env_3.normalized_cov_params

Unnamed: 0,chr16.1822_top_two,chr16.1856_top_two,chr16.1857_top_two,chr16.1863_second_two,chr16.1863_top_two,chr16.1865_top_two,chr16.1866_top_two,chr16.1866_second_two,chr16.1867_top_two,chr16.1897_top_two,umi_count
chr16.1822_top_two,0.00250942,-0.00039074,-0.00060533,-0.0003986,-0.0003671,-8.44e-06,3.35e-06,-2.188e-05,8.65e-06,-1.61e-05,-2.79e-06
chr16.1856_top_two,-0.00039074,0.00264856,-0.00063349,-0.00042758,-0.00039383,-1.687e-05,2.39e-06,1.77e-06,8.16e-06,-6.31e-06,-2.25e-06
chr16.1857_top_two,-0.00060533,-0.00063349,0.00381208,-0.00067631,-0.00060991,4.83e-06,-6.17e-06,1.305e-05,1.788e-05,1.752e-05,8.2e-07
chr16.1863_second_two,-0.0003986,-0.00042758,-0.00067631,0.00269046,-0.00040147,9.66e-06,1.8e-06,-1.229e-05,-5.203e-05,-6.71e-06,-2.05e-06
chr16.1863_top_two,-0.0003671,-0.00039383,-0.00060991,-0.00040147,0.00249557,3.83e-06,3.14e-06,-9.57e-06,8.44e-06,-4.89e-06,-2.09e-06
chr16.1865_top_two,-8.44e-06,-1.687e-05,4.83e-06,9.66e-06,3.83e-06,0.00216272,-6.14e-06,-1.87e-06,-1.1e-05,-0.0,-5.6e-06
chr16.1866_top_two,3.35e-06,2.39e-06,-6.17e-06,1.8e-06,3.14e-06,-6.14e-06,0.0026001,5.21e-06,8.18e-06,-4.01e-06,-4.8e-06
chr16.1866_second_two,-2.188e-05,1.77e-06,1.305e-05,-1.229e-05,-9.57e-06,-1.87e-06,5.21e-06,0.00429067,1.697e-05,2.5e-07,-5.79e-06
chr16.1867_top_two,8.65e-06,8.16e-06,1.788e-05,-5.203e-05,8.44e-06,-1.1e-05,8.18e-06,1.697e-05,0.00318241,-8.01e-06,-5.82e-06
chr16.1897_top_two,-1.61e-05,-6.31e-06,1.752e-05,-6.71e-06,-4.89e-06,-0.0,-4.01e-06,2.5e-07,-8.01e-06,0.00393889,-6.35e-06
