In [2]:
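# Fit one linear regression per cell ontology ID (response: `met_potential_mean`)
# and save the fitted coefficients and test-set metrics as an AnnData file.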

In [57]:
from anndata import read_h5ad, AnnData
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [58]:
# Inputs are injected by Snakemake: `metmap_tissue` comes from the rule's
# params, and the AnnData file to load from its named input `tm_ccle_coexp`.
metmap_tissue = snakemake.params['metmap_tissue']
coexp_path = snakemake.input['tm_ccle_coexp']
coexp_adata = read_h5ad(coexp_path)

In [59]:
# Echo the `metmap_tissue` parameter this run was launched with.
metmap_tissue

In [60]:
# Inspect the observation annotations; the loop below reads the
# `cell_ontology_id` and `met_potential_mean` columns from this table.
coexp_adata.obs

In [61]:
# Accumulators for the per-cell-type regression results:
#   coef_arr - fitted coefficient vector of each model, in fitting order
#   coef_df  - one row of test-set metrics per model, in the same order
coef_arr = list()
metric_columns = ["cell_ontology_id", "metmap_tissue", "MSE", "R2"]
coef_df = pd.DataFrame(columns=metric_columns)

In [62]:
# Build a regression model for each cell type.
#
# For every distinct `cell_ontology_id` in `coexp_adata.obs`, the matching
# observations are split into train/test sets, an ordinary least-squares model
# is fitted on the training part, and the test-set MSE and R2 are recorded.
#
# Results produced by this cell:
#   coef_arr - list with one fitted coefficient vector per cell type
#   coef_df  - DataFrame with one metrics row per cell type, same order
TEST_SIZE = 0.33    # fraction of each cell type's observations held out
SPLIT_SEED = 2445   # fixed seed so the train/test split is reproducible
METRIC_COLUMNS = ["cell_ontology_id", "metmap_tissue", "MSE", "R2"]

# Start from empty accumulators so that re-running this cell never stacks a
# second copy of the results on top of a previous run.
coef_arr = []
metric_records = []

cell_ontology_ids = coexp_adata.obs['cell_ontology_id'].unique().tolist()
for cell_ontology_id in cell_ontology_ids:
    ct_coexp_adata = coexp_adata[coexp_adata.obs['cell_ontology_id'] == cell_ontology_id, :]
    # Metastatic potential, the response variable
    y = ct_coexp_adata.obs['met_potential_mean'].values

    # Densify the feature matrix; `.toarray()` assumes `.X` is stored sparse.
    X = ct_coexp_adata.X.toarray()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
    )

    # Train the model using the training set
    regr = LinearRegression()
    regr.fit(X_train, y_train)

    # Make predictions using the testing set and score them once; the same
    # values are both printed and stored.
    y_pred = regr.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # The coefficient of determination: 1 is perfect prediction
    r2 = r2_score(y_test, y_pred)

    print('Mean squared error: %.2f' % mse)
    print('Coefficient of determination: %.2f' % r2)

    # Collect plain records and build the DataFrame once after the loop:
    # `DataFrame.append` no longer exists in pandas >= 2.0 and growing a frame
    # row by row is quadratic.
    metric_records.append({
        "cell_ontology_id": cell_ontology_id,
        "metmap_tissue": metmap_tissue,
        "MSE": mse,
        "R2": r2,
    })
    coef_arr.append(regr.coef_)

# Passing `columns` keeps the expected schema even when no cell type was found.
coef_df = pd.DataFrame(metric_records, columns=METRIC_COLUMNS)

In [63]:
# Stack the per-model coefficient vectors as columns (features x models),
# then transpose so that each row of `coef_X` is one fitted model.
coef_by_feature = np.stack(coef_arr, axis=-1)
coef_X = np.transpose(coef_by_feature)

In [64]:
# Sanity check: expect one row per fitted model and four metric columns.
coef_df.shape

In [65]:
# Bundle the coefficient matrix with its per-model metrics (as `obs`) and
# write the result to the first output path declared by the Snakemake rule.
# NOTE(review): no `var` is passed, so feature names are not stored in the file.
output_path = snakemake.output[0]
coef_adata = AnnData(obs=coef_df, X=coef_X)
coef_adata.write(output_path)

In [66]:
# Show the per-model metrics table stored alongside the coefficients.
coef_adata.obs