In [1]:
import numpy as np
import scipy as sp
import pandas as pd
pd.options.display.max_rows = 999
import csv
import gzip
import os
import scipy.io
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%reload_ext autoreload
%autoreload 2
np.set_printoptions(suppress=True)
import sys
sys.path.append('/home/roquero/CausalAggregation/Code')
from generateEnvironment import GenerateEnvironment, generate_constraints
from gmm_estimator import SolveProblem
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
pd.options.display.float_format = "{:,.2f}".format

In [2]:
response = pd.read_pickle('response.pkl')
table_crispr = pd.read_pickle('table_crispr.pkl')
table_crispr = table_crispr.join(response, on='cell')

table_crispr = table_crispr[(table_crispr['umi_count']>=0) & (table_crispr['PRKCB_TSS']==0)]
table_crispr.loc[:,'umi_count'] = table_crispr['umi_count'].transform(lambda u : np.log(1+u))
table_crispr = table_crispr.sample(frac=1).reset_index(drop=True)


vars_list = ['chr16.1822_top_two','chr16.1856_top_two','chr16.1857_top_two','chr16.1863_second_two','chr16.1863_top_two','chr16.1865_top_two','chr16.1866_top_two','chr16.1866_second_two','chr16.1867_top_two','chr16.1897_top_two']
data = table_crispr[vars_list+['umi_count', 'ENSG00000166501']]
data -= data.mean()

In [3]:
data.head()

Unnamed: 0,chr16.1822_top_two,chr16.1856_top_two,chr16.1857_top_two,chr16.1863_second_two,chr16.1863_top_two,chr16.1865_top_two,chr16.1866_top_two,chr16.1866_second_two,chr16.1867_top_two,chr16.1897_top_two,umi_count,ENSG00000166501
0,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.44,0.58
1,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.07,0.58
2,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.36,-2.42
3,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,0.9,-2.42
4,-0.01,-0.0,-0.0,-0.01,-0.01,-0.01,-0.01,-0.0,-0.0,-0.0,-0.35,-2.42


In [4]:
data_env_1 = data.loc[:table_crispr.shape[0]//3].copy()
confounder_env_1 = np.random.binomial(1, 0.5, size=data_env_1.shape[0])
data_env_1.loc[:,'chr16.1857_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1863_second_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1863_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1865_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1866_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1866_second_two'] += confounder_env_1
data_env_1.loc[:,'chr16.1867_top_two'] += confounder_env_1
data_env_1.loc[:,'chr16.1897_top_two'] += confounder_env_1
data_env_1.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_1)*4
data_env_1 = data_env_1 - data_env_1.mean()

In [5]:
data_env_2 = data.loc[table_crispr.shape[0]//3:2*table_crispr.shape[0]//3].copy()
confounder_env_2 = np.random.binomial(1, 0.5, size=data_env_2.shape[0])
data_env_2.loc[:,'chr16.1822_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1856_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1865_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1866_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1866_second_two'] += confounder_env_2
data_env_2.loc[:,'chr16.1867_top_two'] += confounder_env_2
data_env_2.loc[:,'chr16.1897_top_two'] += confounder_env_2
data_env_2.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_2)*4
data_env_2 = data_env_2 - data_env_2.mean()

In [7]:
data_env_3 = data.loc[2*table_crispr.shape[0]//3:].copy()
confounder_env_3 = np.random.binomial(1, 0.5, size=data_env_3.shape[0])
data_env_3.loc[:,'chr16.1822_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1856_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1857_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1863_second_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1863_top_two'] += confounder_env_3 
data_env_3.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_3)*2
data_env_3 = data_env_3 - data_env_3.mean()

In [8]:
environment_1 = {
    'dataset':data_env_1.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [0,1]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_1.values.shape[0],
}
generate_constraints(environment_1)
environment_2 = {
    'dataset':data_env_2.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [2,3,4]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_2.values.shape[0],
}
generate_constraints(environment_2)
environment_3 = {
    'dataset':data_env_3.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [5,6,7,8,9]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_3.values.shape[0],
}
generate_constraints(environment_3)

In [9]:
OLS_full_uncorrupted = sm.OLS(data.loc[:,'ENSG00000166501'],data.loc[:, data.columns!='ENSG00000166501']).fit()
OLS_env_1 = sm.OLS(data_env_1.loc[:,'ENSG00000166501'],data_env_1.loc[:, data_env_1.columns!='ENSG00000166501']).fit()
OLS_env_2 = sm.OLS(data_env_2.loc[:,'ENSG00000166501'],data_env_2.loc[:, data_env_2.columns!='ENSG00000166501']).fit()
OLS_env_3 = sm.OLS(data_env_3.loc[:,'ENSG00000166501'],data_env_3.loc[:, data_env_3.columns!='ENSG00000166501']).fit()

In [10]:
solver = SolveProblem(np.eye(12), np.arange(10), 11)
beta_GMM, aCov = solver.compute_beta_GMM([environment_1, environment_2, environment_3])
CI = solver.compute_CI(beta_GMM, aCov, table_crispr.shape[0], 0.05)

In [11]:
pd.concat([OLS_full_uncorrupted.params,OLS_full_uncorrupted.bse], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,-0.02,0.08
chr16.1856_top_two,0.15,0.08
chr16.1857_top_two,0.28,0.1
chr16.1863_second_two,0.13,0.08
chr16.1863_top_two,0.07,0.08
chr16.1865_top_two,-0.31,0.07
chr16.1866_top_two,-1.41,0.07
chr16.1866_second_two,-1.33,0.1
chr16.1867_top_two,-0.57,0.09
chr16.1897_top_two,-0.07,0.09


In [12]:
pd.DataFrame({0:beta_GMM,1:(CI[1,:]-CI[0,:])/2})

Unnamed: 0,0,1
0,-0.23,0.46
1,-0.11,0.48
2,0.55,0.63
3,-0.19,0.49
4,0.23,0.49
5,-0.26,0.25
6,-1.61,0.26
7,-1.36,0.35
8,-0.39,0.34
9,0.18,0.34


In [13]:
pd.concat([OLS_env_1.params,OLS_env_1.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,-0.05,0.28
chr16.1856_top_two,-0.07,0.29
chr16.1857_top_two,1.5,0.33
chr16.1863_second_two,0.92,0.26
chr16.1863_top_two,1.21,0.26
chr16.1865_top_two,0.06,0.22
chr16.1866_top_two,-0.54,0.25
chr16.1866_second_two,-0.19,0.31
chr16.1867_top_two,0.23,0.28
chr16.1897_top_two,0.83,0.29


In [14]:
pd.concat([OLS_env_2.params,OLS_env_2.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,0.98,0.25
chr16.1856_top_two,1.29,0.27
chr16.1857_top_two,0.51,0.35
chr16.1863_second_two,0.1,0.27
chr16.1863_top_two,-0.06,0.27
chr16.1865_top_two,0.58,0.23
chr16.1866_top_two,-0.52,0.24
chr16.1866_second_two,0.1,0.29
chr16.1867_top_two,0.61,0.28
chr16.1897_top_two,0.91,0.28


In [15]:
pd.concat([OLS_env_3.params,OLS_env_3.bse*1.96], axis=1)

Unnamed: 0,0,1
chr16.1822_top_two,0.26,0.25
chr16.1856_top_two,0.62,0.26
chr16.1857_top_two,0.63,0.3
chr16.1863_second_two,0.39,0.25
chr16.1863_top_two,0.1,0.24
chr16.1865_top_two,-0.26,0.23
chr16.1866_top_two,-1.59,0.23
chr16.1866_second_two,-1.39,0.32
chr16.1867_top_two,-0.43,0.31
chr16.1897_top_two,0.22,0.31
