Read in pickle files

In [53]:
OUTPUT_DIR = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/reference_predictions'

#import pickle as pkl
import numpy as np
import pandas as pd, collections
import _pickle as cPickle
import gc
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

# Note, use CV for cross-validation as requested in the question
from sklearn.linear_model import LogisticRegressionCV

# Load some other sklearn functions
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Column indices of the three GATA3 ChIP tracks; used further down to slice
# the loaded prediction arrays along their second axis.
Gata3_first = 983 # CHIP:GATA3:T47D treated with 0.02% dimethyl sulfoxide for 1 hour
Gata3_second = 1417 # CHIP:GATA3:SH-SY5Y
Gata3_third = 2740 # CHIP:GATA3:MCF-7

In [3]:
# Load the prediction log (one row per motif with its run status) and preview it
region_data = pd.read_csv('/projects/covid-ct/imlab/users/temi/projects/TFXcan/log/logging_predictions.csv')
region_data.head()

Unnamed: 0,motif,status
0,TP358,completed
1,TP359,completed
2,TP360,completed
3,TP361,completed
4,TP362,completed


In [4]:
# Label each region by its motif-name prefix: names starting with 'TP' are
# positives, everything else is treated as a negative ('TN').
motif_y = [
    'TP' if motif_name.startswith('TP') else 'TN'
    for motif_name in region_data['motif']
]
region_data['motif_y'] = motif_y

In [5]:
# Tally the TP / TN labels to see the class balance
collections.Counter(motif_y)

Counter({'TP': 1214, 'TN': 14652})

The classes are heavily imbalanced, so randomly select 800 TPs and 800 TNs for training; all remaining regions form the test set.

In [6]:
# Seed NumPy's global random number generator so later steps are repeatable
np.random.seed(2022)

In [7]:
# Balanced training set: 800 rows drawn from each motif_y class with a fixed seed.
train_info = (
    region_data
    .groupby('motif_y')
    .sample(n=800, random_state=2022)
)
# Every row that was not sampled for training becomes test data.
test_info = region_data.drop(index=train_info.index)

In [8]:

def Threshold_tracks(selected_tracks, thresh=None):
    """Binarize track values against a threshold.

    Parameters
    ----------
    selected_tracks : dict or numpy.ndarray
        Either a single array of track values, or a dict mapping keys to
        such arrays.
    thresh : scalar or array-like, optional
        Cut-off compared with ``>=``. When None, every array is thresholded
        by its *own* mean along axis 0 (for a dict, the mean is recomputed
        for each entry rather than reusing the first entry's mean).

    Returns
    -------
    dict or numpy.ndarray
        Same container kind as the input, holding 1 where the value is
        ``>=`` the threshold and 0 elsewhere.

    Raises
    ------
    TypeError
        If ``selected_tracks`` is neither a dict nor a numpy array.
    """

    def _binarize(values):
        # Resolve the cut-off per array so one entry's mean never leaks
        # into the next entry.
        cutoff = np.mean(values, axis=0) if thresh is None else thresh
        return np.where(values >= cutoff, 1, 0)

    if isinstance(selected_tracks, dict):
        return {key: _binarize(values) for key, values in selected_tracks.items()}

    if isinstance(selected_tracks, np.ndarray):
        return _binarize(selected_tracks)

    raise TypeError(
        'selected_tracks must be a dict or a numpy.ndarray, got '
        f'{type(selected_tracks).__name__}'
    )

In [31]:
# Motif names for each split; these drive which prediction files get loaded.
# (For a quick smoke test, substitute a short list such as ['TP1', 'TN14722'].)
train_motifs = train_info['motif'].values
test_motifs = test_info['motif'].values

In [12]:
# This step is slow: it opens one pickle file per training motif.
# NOTE(security): unpickling can execute arbitrary code, so only load files
# that were produced by this project.

train_enformer_predictions = {}

for m in train_motifs:
    with open(f'{OUTPUT_DIR}/GATA3_reference_{m}_predictions_2022-07-19.pkl', 'rb') as input_obj:
        # The garbage collector is paused while unpickling; try/finally makes
        # sure it is switched back on even when a file fails to load, so one
        # bad file cannot leave GC disabled for the rest of the kernel session.
        gc.disable()
        try:
            # keep only the three GATA3 track columns
            temp = cPickle.load(input_obj)[:, [Gata3_first, Gata3_second, Gata3_third]]
        finally:
            gc.enable()

    # binarize each track against its own mean
    m_thresholds = Threshold_tracks(temp)
    train_enformer_predictions[m] = m_thresholds

In [13]:
# Flip every thresholded array so tracks run along the first axis.
# NOTE: this rebinds the same name, so running the cell twice undoes the transpose.
train_enformer_predictions = {
    motif: tracks.T for motif, tracks in train_enformer_predictions.items()
}

In [18]:
# Collect the per-motif arrays, in dict order, into one stacked array
X_train = np.array(list(train_enformer_predictions.values()))
X_train.shape

(1600, 3, 896)

In [20]:
# Stack the per-motif blocks vertically so each (motif, track) pair becomes
# its own row of the 2-D feature matrix.
X_train = np.vstack(X_train)
X_train

array([[0, 1, 1, ..., 0, 0, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [0, 1, 1, ..., 1, 1, 0],
       [0, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [27]:
# Three labels per motif (one for each track row): 1 for 'TP' motifs, 0 otherwise.
y_train = np.concatenate([
    [1] * 3 if motif.startswith('TP') else [0] * 3
    for motif in train_enformer_predictions
])
y_train

array([0, 0, 0, ..., 1, 1, 1])

In [35]:
# This step is slow: it opens one pickle file per test motif.
# NOTE(security): unpickling can execute arbitrary code, so only load files
# that were produced by this project.

test_enformer_predictions = {}

for m in test_motifs:
    with open(f'{OUTPUT_DIR}/GATA3_reference_{m}_predictions_2022-07-19.pkl', 'rb') as input_obj:
        # The garbage collector is paused while unpickling; try/finally makes
        # sure it is switched back on even when a file fails to load, so one
        # bad file cannot leave GC disabled for the rest of the kernel session.
        gc.disable()
        try:
            # keep only the three GATA3 track columns
            temp = cPickle.load(input_obj)[:, [Gata3_first, Gata3_second, Gata3_third]]
        finally:
            gc.enable()

    # binarize each track against its own mean
    m_thresholds = Threshold_tracks(temp)
    test_enformer_predictions[m] = m_thresholds

In [36]:
# Same reshaping as for the training data: transpose each array, stack all
# motifs into one 2-D matrix, and emit three labels per motif.
# NOTE: the dict is rebound to its transposed form, so re-running this cell flips it back.
test_enformer_predictions = {
    motif: tracks.T for motif, tracks in test_enformer_predictions.items()
}
X_test = np.array(list(test_enformer_predictions.values()))
X_test = np.vstack(X_test)
y_test = np.concatenate([
    [1] * 3 if motif.startswith('TP') else [0] * 3
    for motif in test_enformer_predictions
])


((4800, 896), (42798,))

In [37]:
# Sanity check: the feature matrix and the label vector should have the same number of rows
X_test.shape, y_test.shape

((42798, 896), (42798,))

In [38]:
# Eyeball the binarized test features and their labels
X_test, y_test

(array([[1, 1, 0, ..., 1, 0, 0],
        [0, 1, 1, ..., 1, 0, 0],
        [0, 0, 0, ..., 1, 0, 0],
        ...,
        [1, 1, 0, ..., 1, 1, 1],
        [0, 0, 0, ..., 1, 0, 0],
        [1, 0, 0, ..., 0, 0, 0]]),
 array([1, 1, 1, ..., 0, 0, 0]))

Save the numpy arrays

In [40]:
# Write each array to its own .npy file (.npz is an alternative that bundles several arrays)

for array_path, array_data in (
    ('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/X_test.npy', X_test),
    ('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/y_test.npy', y_test),
    ('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/X_train.npy', X_train),
    ('/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data/y_train.npy', y_train),
):
    np.save(file=array_path, arr=array_data)

In [41]:
# Reload the saved arrays from disk, so the slow pickle-reading cells can be skipped

train_test_dir = '/projects/covid-ct/imlab/users/temi/projects/TFXcan/output/train-test-data'
X_train, y_train, X_test, y_test = (
    np.load(f'{train_test_dir}/{array_name}.npy')
    for array_name in ('X_train', 'y_train', 'X_test', 'y_test')
)

In [55]:
# Settings shared by both cross-validated classifiers
shared_cv_options = {'cv': 3, 'max_iter': 1000}

# Baseline: cross-validated logistic regression with no penalty/solver specified, fitted straight away
logistic_regression_classifier = LogisticRegressionCV(**shared_cv_options).fit(X_train, y_train)

# Elastic-net variant. SAGA is preferred over SAG; background:
# https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions
# The l1_ratios grid is arbitrary and should probably be tuned.
elastic_net_classifier = LogisticRegressionCV(
    penalty='elasticnet',
    solver='saga',
    l1_ratios=[0.1, 0.5, 0.9],
    **shared_cv_options,
)
elastic_net_classifier.fit(X_train, y_train)

LogisticRegressionCV(cv=3, l1_ratios=[0.1, 0.5, 0.9], max_iter=1000,
                     penalty='elasticnet', solver='saga')

In [56]:
# Accuracy of both fitted models on the held-out test set
logistic_accuracy = logistic_regression_classifier.score(X_test, y_test)
elastic_net_accuracy = elastic_net_classifier.score(X_test, y_test)
print(f"Logistic Regression: {logistic_accuracy} || Elasticnet: {elastic_net_accuracy}")

Logistic Regression: 0.7311790270573392 || Elasticnet: 0.7237721388849946


In [57]:
# Per-class precision / recall / F1 for each fitted model on the test set
for report_title, fitted_classifier in (
    ("Logistic Regression", logistic_regression_classifier),
    ("Elastic Net", elastic_net_classifier),
):
    print(report_title)
    print(classification_report(y_test, fitted_classifier.predict(X_test)))

Logistic Regression
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     41556
           1       0.08      0.77      0.14      1242

    accuracy                           0.73     42798
   macro avg       0.53      0.75      0.49     42798
weighted avg       0.96      0.73      0.82     42798

Elastic Net
              precision    recall  f1-score   support

           0       0.99      0.72      0.84     41556
           1       0.08      0.84      0.15      1242

    accuracy                           0.72     42798
   macro avg       0.54      0.78      0.49     42798
weighted avg       0.97      0.72      0.82     42798



In [67]:
# Show the first row of the fitted elastic-net coefficients (one value per input column)
elastic_net_classifier.coef_[0]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  9.91441597e-02,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -2.41297178e-02,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -3.95458963e-02,  0.00000000e+00,  0.00000000e+00,
       -3.17802059e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00, -

In [47]:
# `model` was never defined in a surviving cell, so this cell raised NameError
# on a fresh kernel. The recorded output shows a default ElasticNet(), so it
# is created explicitly here before fitting.
model = ElasticNet()
model.fit(X_train, y_train)

ElasticNet()

In [50]:
# NOTE(review): every displayed entry of the recorded output is 0.5, which
# suggests the default-penalty ElasticNet shrank all coefficients to zero and
# is predicting a constant -- confirm, and tune alpha / l1_ratio if so.
predictions = model.predict(X_test)
predictions

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5])

array([0.5, 0.5, 0.5, ..., 0.5, 0.5, 0.5])

In [None]:
# Toy example: binarize four motifs (two TP, two TN) for sample 'HG00096' and
# pull out each of the three track columns.
# NOTE(review): GATA3_tracks is not defined in any visible cell -- load it before running.
# Each motif is thresholded once and reused for all three columns, instead of
# repeating the same call in twelve copy-pasted lines.
toy_motifs = ('TP1', 'TP2', 'TN10267', 'TN9464')
toy_binarized = [
    Threshold_tracks(GATA3_tracks[motif]['HG00096'])[0] for motif in toy_motifs
]

a0, b0, c0, d0 = (tracks[:, 0] for tracks in toy_binarized)

a1, b1, c1, d1 = (tracks[:, 1] for tracks in toy_binarized)

a2, b2, c2, d2 = (tracks[:, 2] for tracks in toy_binarized)

In [None]:
# One row per (motif, track) vector. np.vstack replaces np.row_stack, which
# was only an alias for it and is deprecated as of NumPy 2.0.
X = np.vstack((a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2))
# Labels in the same row order: 1 for the two TP motifs, 0 for the two TN motifs, once per track
y = np.array([1,1,0,0, 1,1,0,0, 1,1,0,0])

In [None]:
# Display the stacked toy feature matrix
X

In [None]:
# Display the matching toy labels
y