# MCP regression

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# %load common_.py
from common import *

In [3]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import LeaveOneGroupOut, KFold
from ehv import correlation
from sklearn.preprocessing import scale, minmax_scale
from sklearn import metrics
import multiprocessing

In [4]:
# Load the raw feature table through the project loader. `samples` is passed
# through unchanged (presumably None means "no sample restriction" — confirm
# against e_load.load_raw_ideas_dir).
samples = None
df = e_load.load_raw_ideas_dir(
    Path("/data/weizmann/EhV/high_time_res"), 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/representations/ideas_features/"), 
    "ALL", 
    Path("/data/weizmann/EhV/weizmann-ehv-metadata/cell_populations/manual_gating/"),
    samples, "Low/*.cif")
# Post-process with the e_load helpers, in this order: remove unwanted
# features, tag the columns, then clean the column names.
df = e_load.remove_unwanted_features(df)
df = e_load.tag_columns(df)
df = e_load.clean_column_names(df)

In [5]:
# Keep only the rows selected by the "meta_label_coi" column, which is used
# here as a row mask (assumed boolean — confirm against the loader).
df = df[df["meta_label_coi"]]
df.shape

(339045, 113)

In [6]:
# Read the comma-separated list of selected feature names. Whitespace around
# each name (e.g. a trailing newline at the end of the file or a space after a
# comma) is stripped and empty entries are skipped, so every name can be used
# directly as a column label.
with open("data/selected_features_low.txt") as fh:
    features = [name.strip().lower() for name in fh.read().split(",") if name.strip()]
# Keep every metadata column next to the selected features.
features.extend(df.filter(regex="meta").columns.values.tolist())

df = df[features]

In [7]:
import re

# Collapse the "meta_label_*psba*" mask columns into one label column. Rows
# that are selected by none of those columns keep the label "unknown".
reg = r"^meta_label_(.+)$"
label_vec = numpy.full((df.shape[0]), fill_value="unknown", dtype=object)
for col in df.filter(regex="(?i)meta_label_.*psba.*"):
    # group(1) yields the captured suffix as a plain string. groups(...) would
    # return a tuple (its argument is a default value, not a group index).
    label_vec[df[col].values] = re.match(reg, col).group(1)

df["meta_label"] = label_vec

In [8]:
def do_low_variance_filter(df):
    """Keep the "feat" columns that pass ``VarianceThreshold(0.01)``.

    All columns matching "meta" are retained regardless of their variance.

    Args:
        df: dataframe with "feat" and "meta" columns.

    Returns:
        ``df`` restricted to the surviving "feat" columns followed by the
        "meta" columns.
    """
    selector = VarianceThreshold(0.01).fit(df.filter(regex="feat"))
    support_mask = selector.get_support()
    kept_features = df.filter(regex="feat").columns[support_mask]
    meta_columns = df.filter(regex="meta").columns
    return df[kept_features.append(meta_columns)]

df = do_low_variance_filter(df)

In [9]:
# Display the dataframe shape after the low-variance filter.
df.shape

(339045, 77)

In [10]:
# Ask the project helper for the correlated "feat" columns (thresh=0.89) and
# drop them; the exact selection rule lives in ehv.correlation.
correlated_features = correlation.get_correlated_features(df.filter(regex="feat"), thresh=0.89)
df = df.drop(columns=correlated_features)
df.shape

(339045, 60)

In [11]:
# Cross-validation splitter; each fold holds out the samples of one group.
cv = LeaveOneGroupOut()

In [27]:
def do_minmax_scale(df):
    """Min-max scale the "feat" columns of one group.

    The scaling is done on a copy: a function handed to
    ``groupby(...).apply`` must not mutate the group it receives.

    Args:
        df: dataframe (one group) containing "feat" columns.

    Returns:
        A copy of ``df`` whose "feat" columns hold the ``minmax_scale`` output;
        all other columns are unchanged.
    """
    scaled = df.copy()
    feat_cols = scaled.filter(regex="feat").columns
    scaled[feat_cols] = minmax_scale(scaled[feat_cols])
    return scaled

# group_keys=False keeps the original row index on the result instead of
# prepending the group keys, independent of the pandas version's default.
scaled_df = df.groupby(["meta_timepoint", "meta_replicate"], group_keys=False).apply(do_minmax_scale)

In [28]:
def do_zscore_scale(df):
    """Standardise (z-score) the "feat" columns of one group.

    The scaling is done on a copy: a function handed to
    ``groupby(...).apply`` must not mutate the group it receives.

    Args:
        df: dataframe (one group) containing "feat" columns.

    Returns:
        A copy of ``df`` whose "feat" columns hold the ``scale`` output; all
        other columns are unchanged.
    """
    scaled = df.copy()
    feat_cols = scaled.filter(regex="feat").columns
    scaled[feat_cols] = scale(scaled[feat_cols])
    return scaled

# group_keys=False keeps the original row index on the result instead of
# prepending the group keys, independent of the pandas version's default.
# NOTE(review): this runs on the already min-max scaled frame. Standardisation
# is unaffected by a preceding positive affine rescaling, so the min-max step
# probably does not change the final values — confirm it is intended.
scaled_df = scaled_df.groupby(["meta_timepoint", "meta_replicate"], group_keys=False).apply(do_zscore_scale)

In [29]:
# Separate every column whose name matches "ch11", "m11" or "mcp" from the
# scaled frame; the predictors X are the remaining "feat" columns.
mcp_df = scaled_df.filter(regex="ch11|m11|mcp")
X = scaled_df.drop(columns=mcp_df.columns).filter(regex="feat")

In [30]:
from sklearn.linear_model import LassoCV
def lasso_func(X, y):
    """Fit a Lasso model whose regularisation is selected by 5-fold CV.

    Args:
        X: training feature matrix.
        y: training target vector.

    Returns:
        The fitted ``LassoCV`` estimator, so callers can use ``predict`` on it.
    """
    cv = KFold(n_splits=5)
    # fit() returns the estimator itself; it must be handed back to the caller.
    return LassoCV(cv=cv).fit(X, y)

In [31]:
from sklearn.ensemble import RandomForestRegressor
def rf_func(X, y):
    """Fit a random forest regressor with 50 trees on ``(X, y)`` and return it."""
    forest = RandomForestRegressor(n_estimators=50)
    return forest.fit(X, y)

In [32]:
%%time
def fold_func(X, y, index_tuple, fit_func):
    """Train and score a model on a single cross-validation fold.

    Args:
        X: feature array, indexed positionally by the fold indices.
        y: target array, indexed positionally by the fold indices.
        index_tuple: ``(train_index, test_index)`` pair for this fold.
        fit_func: callable ``fit_func(X_train, y_train)`` returning a fitted
            model that exposes ``predict``.

    Returns:
        Dict with the held-out mean squared error ("mse") and R^2 ("r2").
    """
    # Re-seed numpy's global RNG at the start of every fold.
    numpy.random.seed(42)

    # Carve the fold's train / test partitions out of the full arrays.
    fit_rows, eval_rows = index_tuple
    features_fit, features_eval = X[fit_rows], X[eval_rows]
    target_fit, target_eval = y[fit_rows], y[eval_rows]

    # Train on the training partition, predict on the held-out partition.
    fitted = fit_func(features_fit, target_fit)
    predicted = fitted.predict(features_eval)

    # Collect the fold metrics (for LassoCV fits, `alpha_` could be added here).
    scores = {}
    scores["mse"] = metrics.mean_squared_error(target_eval, predicted)
    scores["r2"] = metrics.r2_score(target_eval, predicted)
    return scores
    
# Run one fold_func task per leave-one-group-out fold in a process pool.
futures = []
with multiprocessing.Pool(processes=12) as pool:
    # The fold indices are positional and are applied to X.values and the
    # target, both of which come from scaled_df. Take the groups from the same
    # frame so the row order is guaranteed to match.
    for idx_tuple in cv.split(X=numpy.arange(len(scaled_df)), groups=scaled_df["meta_timepoint"]):
        futures.append(
            pool.apply_async(
                fold_func,
                (X.values, mcp_df["feat_intensity_mc_ch11"].values.ravel(), idx_tuple, rf_func)
            ))

    # Collect inside the `with` block: leaving it terminates the pool.
    results = []
    for i, future in enumerate(futures):
        results.append(future.get())
        print(i, end=" ")
    print("")

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
CPU times: user 1.68 s, sys: 2.26 s, total: 3.93 s
Wall time: 49min 58s


In [33]:
# Display the per-fold metric dictionaries.
results

[{'mse': 0.45950532245227405, 'r2': 0.5330349736269804},
 {'mse': 0.45827095018111086, 'r2': 0.540037382767812},
 {'mse': 0.45793960801278344, 'r2': 0.5405480915970806},
 {'mse': 0.4336985370316852, 'r2': 0.5440537756026762},
 {'mse': 0.4715248162819179, 'r2': 0.5638771523882183},
 {'mse': 0.5015918078926649, 'r2': 0.5268765624983646},
 {'mse': 0.4641030455744703, 'r2': 0.5234009220627578},
 {'mse': 0.4460424393423345, 'r2': 0.5378470426129254},
 {'mse': 0.46586476833502216, 'r2': 0.5330418361012256},
 {'mse': 0.4814143033614943, 'r2': 0.5250522372857627},
 {'mse': 0.4548481761581075, 'r2': 0.5408435269331395},
 {'mse': 0.46245092123818987, 'r2': 0.5393679906267526},
 {'mse': 0.4870910431437364, 'r2': 0.5127051461860819},
 {'mse': 0.4447723530378455, 'r2': 0.5613442617222701},
 {'mse': 0.47509290111590147, 'r2': 0.5236659484726676},
 {'mse': 0.4658868171334138, 'r2': 0.5186245155495919}]

In [34]:
# Write the per-fold metrics to a CSV file named with a freshly generated UUID.
pandas.DataFrame(results).to_csv("/data/weizmann/EhV/weizmann-ehv-metadata/regression/Low/%s.csv" % uuid.uuid4(), index=False)