In [1]:
# Imports and notebook-wide configuration.
# NOTE(review): csv, gzip, os, scipy.io, matplotlib, LinearRegression and
# GenerateEnvironment are not referenced in the cells visible here -- confirm
# whether they are still needed before pruning.
import numpy as np
import pandas as pd
# Allow up to 999 rows to be shown when a DataFrame/Series is displayed.
pd.options.display.max_rows = 999
import csv
import gzip
import os
import scipy.io
import matplotlib.pyplot as plt
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Reload imported modules automatically before each cell execution, so edits
# to the project code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
import sys
# NOTE(review): hard-coded absolute path to the project code; the notebook
# only runs on a machine with this exact directory layout.
sys.path.append('/home/roquero/CausalAggregation/Code')
# Project-local modules, resolved through the path appended above.
from generateEnvironment import GenerateEnvironment, generate_constraints
from gmm_estimator import SolveProblem
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

In [2]:
# Load the CRISPR table and the response, filter/transform them, shuffle the
# rows and build the centered analysis frame `data`.

# Fix the seed so that the shuffle below is reproducible.  The global NumPy
# RNG is seeded as well because the later environment cells draw their
# confounders with np.random.*; on a fresh "Restart & Run All" the whole
# notebook then yields the same numbers every time.
SEED = 0
np.random.seed(SEED)

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
response = pd.read_pickle('response.pkl')
table_crispr = pd.read_pickle('table_crispr.pkl')
table_crispr = table_crispr.join(response, on='cell')

# Keep rows with a non-negative UMI count and PRKCB_TSS equal to 0.  A missing
# umi_count compares False against `>= 0`, so such rows are dropped as well.
keep_rows = (table_crispr['umi_count'] >= 0) & (table_crispr['PRKCB_TSS'] == 0)
# .copy() gives an independent frame, so the column assignment below cannot
# trigger SettingWithCopyWarning or write through to the unfiltered table.
table_crispr = table_crispr[keep_rows].copy()
# Vectorized log(1 + u); replacing the whole column also avoids dtype clashes
# if umi_count was stored as an integer column.
table_crispr['umi_count'] = np.log(1 + table_crispr['umi_count'])
# Shuffle the rows and renumber them 0..n-1; the later environment cells
# slice this frame by label.
table_crispr = table_crispr.sample(frac=1, random_state=SEED).reset_index(drop=True)


vars_list = ['chr16.1822_top_two','chr16.1856_top_two','chr16.1857_top_two','chr16.1863_second_two','chr16.1863_top_two','chr16.1865_top_two','chr16.1866_top_two','chr16.1866_second_two','chr16.1867_top_two','chr16.1897_top_two']
# Column order matters downstream: positions 0-9 are vars_list, 10 is
# umi_count and 11 is the response ENSG00000166501.
data = table_crispr[vars_list+['umi_count', 'ENSG00000166501']]
# Center every column.  This builds a new frame instead of subtracting in
# place on a column subset of table_crispr.
data = data - data.mean()

In [3]:
# Preview the first rows of the column-centered analysis frame.
data.head()

Unnamed: 0,chr16.1822_top_two,chr16.1856_top_two,chr16.1857_top_two,chr16.1863_second_two,chr16.1863_top_two,chr16.1865_top_two,chr16.1866_top_two,chr16.1866_second_two,chr16.1867_top_two,chr16.1897_top_two,umi_count,ENSG00000166501
0,-0.005056,-0.004601,-0.002996,-0.005047,-0.005247,-0.007009,-0.006084,-0.003563,-0.004097,-0.004033,0.22113,0.579404
1,-0.005056,-0.004601,-0.002996,-0.005047,-0.005247,-0.007009,-0.006084,-0.003563,-0.004097,-0.004033,-0.419546,-2.420596
2,-0.005056,-0.004601,-0.002996,-0.005047,-0.005247,-0.007009,-0.006084,-0.003563,-0.004097,-0.004033,-0.559947,-0.420596
3,-0.005056,-0.004601,-0.002996,-0.005047,-0.005247,-0.007009,-0.006084,-0.003563,-0.004097,-0.004033,0.721804,-1.420596
4,-0.005056,-0.004601,-0.002996,-0.005047,-0.005247,-0.007009,-0.006084,-0.003563,-0.004097,-0.004033,-0.628415,5.579404


In [4]:
# Environment 1: rows labelled 0 .. n//3 of the shuffled frame (label-based
# .loc slicing includes the end label).
data_env_1 = data.loc[:table_crispr.shape[0] // 3].copy()

# One Bernoulli(0.5) confounder value per row.
confounder_env_1 = np.random.binomial(1, 0.5, size=data_env_1.shape[0])

# Columns perturbed by the confounder.  'chr16.1822_top_two' and
# 'chr16.1856_top_two' (positions 0 and 1 of vars_list) are left untouched;
# these are the indices listed in environment_1's dict_interventions.
for column in [
    'chr16.1857_top_two',
    'chr16.1863_second_two',
    'chr16.1863_top_two',
    'chr16.1865_top_two',
    'chr16.1866_top_two',
    'chr16.1866_second_two',
    'chr16.1867_top_two',
    'chr16.1897_top_two',
]:
    # np.random.normal() without a size returns a single scalar, so each
    # column gets one random coefficient shared by all of its rows.
    data_env_1.loc[:, column] += confounder_env_1 * np.random.normal()

# The same confounder also shifts the response column.
data_env_1.loc[:, 'ENSG00000166501'] += 3 * (confounder_env_1 + 0.5)

# Re-center every column within this environment.
data_env_1 = data_env_1.sub(data_env_1.mean(), axis='columns')

In [5]:
# Environment 2: rows labelled n//3 + 1 .. 2*(n//3) of the shuffled frame
# (label-based .loc slicing includes both end labels).
data_env_2 = data.loc[
    (table_crispr.shape[0] // 3 + 1):(table_crispr.shape[0] // 3) * 2
].copy()

# One Bernoulli(0.5) confounder value per row.
confounder_env_2 = np.random.binomial(1, 0.5, size=data_env_2.shape[0])

# Columns perturbed by the confounder.  'chr16.1857_top_two',
# 'chr16.1863_second_two' and 'chr16.1863_top_two' (positions 2, 3 and 4 of
# vars_list) are left untouched; these are the indices listed in
# environment_2's dict_interventions.
for column in [
    'chr16.1822_top_two',
    'chr16.1856_top_two',
    'chr16.1865_top_two',
    'chr16.1866_top_two',
    'chr16.1866_second_two',
    'chr16.1867_top_two',
    'chr16.1897_top_two',
]:
    # np.random.normal() without a size returns a single scalar, so each
    # column gets one random coefficient shared by all of its rows.
    data_env_2.loc[:, column] += confounder_env_2 * np.random.normal()

# The same confounder also shifts the response column.
data_env_2.loc[:, 'ENSG00000166501'] += 3 * (confounder_env_2 + 0.5)

# Re-center every column within this environment.
data_env_2 = data_env_2.sub(data_env_2.mean(), axis='columns')

In [6]:
# Environment 3: every row labelled 2*(n//3) + 1 or higher in the shuffled
# frame.
data_env_3 = data.loc[(table_crispr.shape[0] // 3) * 2 + 1:].copy()

# One Bernoulli(0.5) confounder value per row.
confounder_env_3 = np.random.binomial(1, 0.5, size=data_env_3.shape[0])

# Columns perturbed by the confounder.  Positions 5-9 of vars_list are left
# untouched; these are the indices listed in environment_3's
# dict_interventions.
for column in [
    'chr16.1822_top_two',
    'chr16.1856_top_two',
    'chr16.1857_top_two',
    'chr16.1863_second_two',
    'chr16.1863_top_two',
]:
    # np.random.normal() without a size returns a single scalar, so each
    # column gets one random coefficient shared by all of its rows.
    data_env_3.loc[:, column] += confounder_env_3 * np.random.normal()

# The same confounder also shifts the response column.
data_env_3.loc[:, 'ENSG00000166501'] += 3 * (confounder_env_3 + 0.5)

# Re-center every column within this environment.
data_env_3 = data_env_3.sub(data_env_3.mean(), axis='columns')

In [7]:
def _build_environment(frame, intervened_indices):
    """Assemble the environment dict passed to generate_constraints / the solver.

    Parameters
    ----------
    frame : pandas.DataFrame
        One environment's data; columns 0-9 are the regressors, column 10 is
        umi_count and column 11 is the response (the column order of `data`).
    intervened_indices : iterable of int
        Regressor positions that get an entry in 'dict_interventions'.

    Returns
    -------
    dict
        Keys 'dataset' (variables x samples float64 array),
        'dict_interventions', 'x_indices', 'y_index' and 'n_samples'.
    """
    return {
        # np.float64 replaces the np.float_ alias, which was removed in
        # NumPy 2.0 (same dtype on older versions).
        'dataset': frame.values.T.astype(np.float64),
        # Each entry gets its own [10] list; position 10 is the umi_count column.
        'dict_interventions': {
            i: {'type': 'parental', 'parental_index': [10]}
            for i in intervened_indices
        },
        'x_indices': np.arange(10),
        'y_index': 11,
        'n_samples': frame.values.shape[0],
    }


# NOTE(review): generate_constraints is called for its effect on the dict (its
# return value is unused) -- presumably it adds the constraint entries in
# place; confirm against generateEnvironment.
environment_1 = _build_environment(data_env_1, [0, 1])
generate_constraints(environment_1)
environment_2 = _build_environment(data_env_2, [2, 3, 4])
generate_constraints(environment_2)
environment_3 = _build_environment(data_env_3, [5, 6, 7, 8, 9])
generate_constraints(environment_3)

In [8]:
# Build the GMM solver.  The last two arguments repeat the 'x_indices'
# (0-9) and 'y_index' (11) values stored in the environment dicts.
# NOTE(review): the meaning of the 12x12 identity matrix is defined by
# SolveProblem in gmm_estimator -- confirm there.
solver = SolveProblem(
    np.eye(12),
    np.arange(10),
    11,
)

In [15]:
# Estimate the coefficients from the three environments jointly.
beta_GMM, aCov = solver.compute_beta_GMM(
    [environment_1, environment_2, environment_3]
)

# NOTE(review): the last two arguments are presumably the total sample size
# and the significance level -- confirm against SolveProblem.compute_CI.
CI = solver.compute_CI(beta_GMM, aCov, len(table_crispr), 0.05)

# Displayed in order: interval widths (row 1 minus row 0 of CI), the point
# estimates, and the interval bounds themselves.
CI[1] - CI[0]
beta_GMM
CI

array([0.61900813, 0.65148212, 0.90359849, 0.70311555, 0.67442088,
       0.54491919, 0.57035087, 0.78093399, 0.7014597 , 0.68181426])

array([ 0.12491778,  0.04011653, -0.10700482,  0.29692784, -0.00232069,
       -0.37530669, -1.29022543, -1.63437355, -0.52958351,  0.04422914])

array([[-0.18458628, -0.28562453, -0.55880406, -0.05462993, -0.33953113,
        -0.64776629, -1.57540086, -2.02484055, -0.88031336, -0.29667799],
       [ 0.43442185,  0.36585759,  0.34479442,  0.64848562,  0.33488975,
        -0.1028471 , -1.00504999, -1.24390655, -0.17885366,  0.38513626]])

In [14]:
# Pooled OLS baseline on `data`, i.e. the centered frame before any
# confounder perturbation: regress the response on every other column
# (the ten regressors plus umi_count), without an intercept.
OLS_full_uncorrupted = sm.OLS(
    data['ENSG00000166501'],
    data.drop(columns='ENSG00000166501'),
).fit()
print(OLS_full_uncorrupted.summary())

                                 OLS Regression Results                                
Dep. Variable:        ENSG00000166501   R-squared (uncentered):                   0.069
Model:                            OLS   Adj. R-squared (uncentered):              0.069
Method:                 Least Squares   F-statistic:                              1386.
Date:                Wed, 03 Mar 2021   Prob (F-statistic):                        0.00
Time:                        07:25:28   Log-Likelihood:                     -4.8364e+05
No. Observations:              204296   AIC:                                  9.673e+05
Df Residuals:                  204285   BIC:                                  9.674e+05
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [11]:
# Per-environment OLS on the perturbed data: regress the response on every
# other column, without an intercept, and display the fitted coefficients.
OLS_env_1 = sm.OLS(
    data_env_1['ENSG00000166501'],
    data_env_1.drop(columns='ENSG00000166501'),
).fit()
OLS_env_1.params

OLS_env_2 = sm.OLS(
    data_env_2['ENSG00000166501'],
    data_env_2.drop(columns='ENSG00000166501'),
).fit()
OLS_env_2.params

chr16.1822_top_two       0.105423
chr16.1856_top_two       0.012266
chr16.1857_top_two       0.546642
chr16.1863_second_two    0.095091
chr16.1863_top_two       0.038776
chr16.1865_top_two      -0.379624
chr16.1866_top_two      -1.482662
chr16.1866_second_two   -1.083722
chr16.1867_top_two      -0.632803
chr16.1897_top_two      -0.466301
umi_count                0.766263
dtype: float64

chr16.1822_top_two       0.106505
chr16.1856_top_two       0.716952
chr16.1857_top_two       0.157346
chr16.1863_second_two    0.237391
chr16.1863_top_two      -0.103712
chr16.1865_top_two      -0.565238
chr16.1866_top_two      -0.976510
chr16.1866_second_two   -1.158204
chr16.1867_top_two      -1.314901
chr16.1897_top_two       1.184870
umi_count                0.771435
dtype: float64

In [12]:
# Full regression table for the environment-2 OLS fit.
print(OLS_env_2.summary())

                                 OLS Regression Results                                
Dep. Variable:        ENSG00000166501   R-squared (uncentered):                   0.287
Model:                            OLS   Adj. R-squared (uncentered):              0.287
Method:                 Least Squares   F-statistic:                              2495.
Date:                Wed, 03 Mar 2021   Prob (F-statistic):                        0.00
Time:                        07:24:25   Log-Likelihood:                     -1.6110e+05
No. Observations:               68098   AIC:                                  3.222e+05
Df Residuals:                   68087   BIC:                                  3.223e+05
Df Model:                          11                                                  
Covariance Type:            nonrobust                                                  
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [13]:
# NOTE(review): appears to be a leftover scratch cell -- the value is not
# assigned or used.  It draws one standard-normal sample and thereby advances
# NumPy's global RNG state; consider deleting it.
np.random.normal()

-0.28809113565718686