In [4]:
import numpy as np
import scipy as sp
import pandas as pd
import sys, os
sys.path.append(os.getcwd())
from generateEnvironment import GenerateEnvironment, generate_constraints
from gmm_estimator import SolveProblem
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
pd.options.display.float_format = "{:,.2f}".format

## Load data

In [20]:
response = pd.read_pickle('response.pkl')
table_crispr = pd.read_pickle('table_crispr.pkl')
table_crispr = table_crispr.join(response, on='cell')

table_crispr = table_crispr[(table_crispr['umi_count']>=0) & (table_crispr['PRKCB_TSS']==0)]
table_crispr.loc[:,'umi_count'] = table_crispr['umi_count'].transform(lambda u : np.log(1+u))
table_crispr = table_crispr.sample(frac=1).reset_index(drop=True)

vars_list = ['chr16.1822_top_two','chr16.1856_top_two','chr16.1857_top_two','chr16.1863_second_two','chr16.1863_top_two','chr16.1865_top_two','chr16.1866_top_two','chr16.1866_second_two','chr16.1867_top_two','chr16.1897_top_two']
data = table_crispr[vars_list+['umi_count', 'ENSG00000166501']]
data -= data.mean()

## Create synthetic confounding and responses

In [21]:
data_env_1 = data.loc[:table_crispr.shape[0]//3].copy()
confounder_env_1 = np.random.binomial(1, 0.5, size=data_env_1.shape[0])
data_env_1.loc[:,'chr16.1857_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1863_second_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1863_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1865_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1866_top_two'] += confounder_env_1 
data_env_1.loc[:,'chr16.1866_second_two'] += confounder_env_1
data_env_1.loc[:,'chr16.1867_top_two'] += confounder_env_1
data_env_1.loc[:,'chr16.1897_top_two'] += confounder_env_1
data_env_1.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_1)*4
data_env_1 = data_env_1 - data_env_1.mean()

In [22]:
data_env_2 = data.loc[table_crispr.shape[0]//3:2*table_crispr.shape[0]//3].copy()
confounder_env_2 = np.random.binomial(1, 0.5, size=data_env_2.shape[0])
data_env_2.loc[:,'chr16.1822_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1856_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1865_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1866_top_two'] += confounder_env_2 
data_env_2.loc[:,'chr16.1866_second_two'] += confounder_env_2
data_env_2.loc[:,'chr16.1867_top_two'] += confounder_env_2
data_env_2.loc[:,'chr16.1897_top_two'] += confounder_env_2
data_env_2.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_2)*4
data_env_2 = data_env_2 - data_env_2.mean()

In [23]:
data_env_3 = data.loc[2*table_crispr.shape[0]//3:].copy()
confounder_env_3 = np.random.binomial(1, 0.5, size=data_env_3.shape[0])
data_env_3.loc[:,'chr16.1822_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1856_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1857_top_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1863_second_two'] += confounder_env_3 
data_env_3.loc[:,'chr16.1863_top_two'] += confounder_env_3 
data_env_3.loc[:,'ENSG00000166501'] += -(0.5 - confounder_env_3)*2
data_env_3 = data_env_3 - data_env_3.mean()

In [24]:
environment_1 = {
    'dataset':data_env_1.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [0,1]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_1.values.shape[0],
}
generate_constraints(environment_1)
environment_2 = {
    'dataset':data_env_2.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [2,3,4]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_2.values.shape[0],
}
generate_constraints(environment_2)
environment_3 = {
    'dataset':data_env_3.values.T.astype(np.float_),
    'dict_interventions':{i:{'type':'parental', 'parental_index':[10]} for i in [5,6,7,8,9]},
    'x_indices':np.arange(10),
    'y_index':11,
    'n_samples':data_env_3.values.shape[0],
}
generate_constraints(environment_3)

## Run regressions and causal aggregation

In [25]:
OLS_full_uncorrupted = sm.OLS(data.loc[:,'ENSG00000166501'],data.loc[:, data.columns!='ENSG00000166501']).fit()
OLS_env_1 = sm.OLS(data_env_1.loc[:,'ENSG00000166501'],data_env_1.loc[:, data_env_1.columns!='ENSG00000166501']).fit()
OLS_env_2 = sm.OLS(data_env_2.loc[:,'ENSG00000166501'],data_env_2.loc[:, data_env_2.columns!='ENSG00000166501']).fit()
OLS_env_3 = sm.OLS(data_env_3.loc[:,'ENSG00000166501'],data_env_3.loc[:, data_env_3.columns!='ENSG00000166501']).fit()

In [33]:
solver = SolveProblem(np.eye(12), np.arange(10), 11)
beta_GMM, aCov = solver.compute_beta_GMM([environment_1, environment_2, environment_3])
CI = solver.compute_CI(beta_GMM, aCov, table_crispr.shape[0], 0.05)

## Print results

In [42]:
aggregation = pd.concat([OLS_full_uncorrupted.params,
           OLS_full_uncorrupted.bse, 
           pd.Series(beta_GMM,index=OLS_full_uncorrupted.params.index[:-1]),
           pd.Series((CI[1,:]-CI[0,:])/2,index=OLS_full_uncorrupted.params.index[:-1])]
          , axis=1)
aggregation.columns = ['OLS', 'std', 'aggr', 'std']

In [43]:
aggregation

Unnamed: 0,OLS,std,aggr,std.1
chr16.1822_top_two,-0.02,0.08,0.11,0.52
chr16.1856_top_two,0.15,0.08,0.05,0.55
chr16.1857_top_two,0.28,0.1,-0.04,0.6
chr16.1863_second_two,0.13,0.08,0.01,0.48
chr16.1863_top_two,0.07,0.08,-0.12,0.48
chr16.1865_top_two,-0.31,0.07,-0.37,0.26
chr16.1866_top_two,-1.41,0.07,-1.28,0.28
chr16.1866_second_two,-1.33,0.1,-1.35,0.37
chr16.1867_top_two,-0.57,0.09,-0.61,0.33
chr16.1897_top_two,-0.07,0.09,-0.12,0.33


In [44]:
ols_individual = pd.concat([OLS_env_1.params,OLS_env_1.bse*1.96, OLS_env_2.params,OLS_env_2.bse*1.96,OLS_env_3.params,OLS_env_3.bse*1.96], axis=1)
ols_individual.columns = ['E1', 'std', 'E2', 'std', 'E3', 'std']

In [45]:
ols_individual

Unnamed: 0,E1,std,E2,std.1,E3,std.2
chr16.1822_top_two,0.01,0.28,0.91,0.25,0.24,0.24
chr16.1856_top_two,0.03,0.3,1.32,0.26,0.42,0.25
chr16.1857_top_two,1.84,0.34,0.19,0.34,0.76,0.29
chr16.1863_second_two,1.08,0.26,-0.01,0.27,0.31,0.25
chr16.1863_top_two,0.93,0.26,0.05,0.27,0.24,0.24
chr16.1865_top_two,0.17,0.22,0.48,0.22,-0.41,0.24
chr16.1866_top_two,-0.95,0.24,-0.59,0.23,-1.28,0.25
chr16.1866_second_two,-0.27,0.29,0.02,0.29,-1.39,0.34
chr16.1867_top_two,0.29,0.29,0.65,0.28,-0.58,0.3
chr16.1897_top_two,0.9,0.29,1.21,0.28,-0.16,0.3
