## Statistical analysis with the LASSO algorithm

### Import packages

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from core.core import *

### Load sensing matrix

In [2]:
# Read the sensing (pooling) matrix from the Excel sheet; cells that come
# back as NaN are replaced by 0.
pool_matrix = pd.read_excel('data/sensing_matrix_15drugs.xlsx').fillna(0)

In [3]:
# Note:
# Staurosporine is part of this drug list, so the pools that contain it have
# to be removed from the negatives when one of the other kinase inhibitors is
# processed. Most applications will not include Staurosporine, in which case
# this step can be skipped.

drug_names = [
    'Palbociclib', 'Panobinostat', 'Raltitrexed', 'Methotrexate',
    'Vemurafenib', 'Fimepinostat', 'Olaparib', 'Bafetinib', 'SCIO-469',
    'OTS964', 'SL-327', 'Abemaciclib', 'CCT137690', 'Belumosudil',
    'Staurosporine',
]
# One flag per entry of drug_names: True marks a kinase inhibitor whose
# negatives must exclude the Staurosporine pools.
kin_inhibitor = [
    True, False, False, False, True, False, False, True, True, True,
    True, True, True, True, False,
]

# klist[i] holds the row positions of the pools that are valid when drug i is
# processed.
# NOTE(review): columns are addressed by position, so this assumes column i of
# pool_matrix belongs to drug_names[i] and the last column is Staurosporine --
# confirm against the spreadsheet layout.
without_staurosporine = pool_matrix.iloc[:, -1] == 0

klist = []
for col, is_kinase_inhibitor in enumerate(kin_inhibitor):
    if is_kinase_inhibitor:
        # Keep a pool if it has no Staurosporine or if it contains this drug.
        contains_drug = pool_matrix.iloc[:, col] == 1
        valid_pools = np.flatnonzero((without_staurosporine | contains_drug).to_numpy())
    else:
        # No restriction: every pool is usable.
        valid_pools = np.arange(len(pool_matrix))
    klist.append(valid_pools)

### Run the method

In [4]:
# data path
paths = os.listdir('data/preprocessed')
print (paths)

['PL_293T_F.csv', 'PL_HCT116_F.csv', 'PL_HepG2_F.csv', 'PL_K562_F.csv', 'PL_MCF7_F.csv']


In [5]:
# Run the analysis on every preprocessed protein table and store, per input
# file, one CSV with the scores and one with the fold changes.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the (slow) analysis starts.
os.makedirs('results', exist_ok=True)

for p in paths:
    print('processing {}'.format(p))
    input_path = 'data/preprocessed/{}'.format(p)
    protein_table = pd.read_csv(input_path)
    # post_analysis is not defined in this notebook; presumably it comes from
    # the `from core.core import *` star import -- see there for the meaning
    # of drug_num.
    scores, fold_changes = post_analysis(protein_table, pool_matrix, klist, drug_num=3)
    # Build the output names from the file stem so that only the extension is
    # swapped, e.g. PL_293T_F.csv -> results/PL_293T_F_scores.csv.
    stem = os.path.splitext(p)[0]
    scores.to_csv('results/{}_scores.csv'.format(stem))
    fold_changes.to_csv('results/{}_fold_changes.csv'.format(stem))

processing PL_293T_F.csv


100%|█████████████████████████████████████████████████████████████████████████████| 4181/4181 [00:38<00:00, 107.51it/s]


processing PL_HCT116_F.csv


100%|█████████████████████████████████████████████████████████████████████████████| 3706/3706 [00:35<00:00, 105.29it/s]


processing PL_HepG2_F.csv


100%|█████████████████████████████████████████████████████████████████████████████| 4056/4056 [00:37<00:00, 108.60it/s]


processing PL_K562_F.csv


100%|█████████████████████████████████████████████████████████████████████████████| 3833/3833 [00:35<00:00, 108.65it/s]


processing PL_MCF7_F.csv


100%|█████████████████████████████████████████████████████████████████████████████| 3997/3997 [00:37<00:00, 106.95it/s]
