# Linear Prediction Models

The goal is to predict the gene effect scores using the CHRONOS scores computed for the tRCC cell lines. This is done by training linear models for each gene of interest on gene expression data and their associated gene effect scores across various cancer cell lines as the samples. This will act as the benchmark for subsequent, more complex deep-learning-based models trained on this dataset.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import os
os.chdir('../..')

from analysis import analysis_functions as af

In [22]:
### DepMap Gene Expression (TPMLogp1)
#####################################
depmap_gene_exp_23Q2 = pd.read_csv("datasets/depmap_datasets/23Q2/OmicsExpressionProteinCodingGenesTPMLogp1.csv")
depmap_gene_exp_23Q2.set_index("Unnamed: 0", inplace=True)
depmap_gene_exp_23Q2.index.name = None
depmap_gene_exp_23Q2.sort_index(axis=0, inplace=True)
depmap_gene_exp_23Q2.sort_index(axis=1, inplace=True)
depmap_gene_exp_23Q2.dropna(axis=1, inplace=True)
depmap_gene_exp_23Q2.columns = af.extract_gene_names(depmap_gene_exp_23Q2.columns)

### DepMap Gene Effects (CHRONOS)
#################################
depmap_gene_effect_23Q2 = pd.read_csv('datasets/depmap_datasets/23Q2/CRISPRGeneEffect.csv')
depmap_gene_effect_23Q2.set_index("ModelID", inplace=True)
depmap_gene_effect_23Q2.index.name = None
depmap_gene_effect_23Q2.sort_index(axis=0, inplace=True)
depmap_gene_effect_23Q2.sort_index(axis=1, inplace=True)
depmap_gene_effect_23Q2.dropna(axis=1, inplace=True)
depmap_gene_effect_23Q2.columns = af.extract_gene_names(depmap_gene_effect_23Q2.columns)

### TRCC Gene Expression (TPMLogp1)
###################################
DFCI_gene_exp = pd.read_csv("datasets/tRCC_cell_lines/raw/RSEM_summary_all_samples_gene_TPM.txt", sep="\t").set_index("gene_id")

STFE1 = pd.read_csv("datasets/tRCC_cell_lines/raw/F1.genes.results", sep="\t").set_index("gene_id")["TPM"]
STFE2 = pd.read_csv("datasets/tRCC_cell_lines/raw/F2.genes.results", sep="\t").set_index("gene_id")["TPM"]
STFE3 = pd.read_csv("datasets/tRCC_cell_lines/raw/F3.genes.results", sep="\t").set_index("gene_id")["TPM"]
STFE_means = pd.concat([STFE1, STFE2, STFE3], axis=1).dropna(axis=1).mean(axis=1)
STFE_means.name = "STFE"

FUUR1_means = DFCI_gene_exp[['B19', 'B20', 'B21']].mean(axis=1)
UOK109_means = DFCI_gene_exp[['B10', 'B11', 'B12']].mean(axis=1)

tRCC_gene_exp = pd.DataFrame({
    'FUUR1': FUUR1_means,
    'UOK109': UOK109_means
}, index=DFCI_gene_exp.index)

tRCC_gene_exp = tRCC_gene_exp.join(STFE_means, how="outer")

tRCC_gene_exp.index.name = None
tRCC_gene_exp.index = tRCC_gene_exp.index.str.split('_').str[-1]
tRCC_gene_exp.sort_index(axis=0, inplace=True)
tRCC_gene_exp.sort_index(axis=1, inplace=True)
tRCC_gene_exp = np.log1p(tRCC_gene_exp)
tRCC_gene_exp = tRCC_gene_exp.groupby(tRCC_gene_exp.index).sum().T
tRCC_gene_exp.to_csv("datasets/tRCC_cell_lines/tRCC_gene_exp_TPMLogp1.csv")

### TRCC Gene Effects (CHRONOS)
###############################
DFCI_chronos_dataset = pd.read_csv("datasets/tRCC_cell_lines/Chronos/tRCC_chronos_summary_for_BL_ASPS_updated.csv")
DFCI_chronos_CCLs = DFCI_chronos_dataset[["Gene", "PC3", "CAKI2", "CAKI1", "786O", "DU145", "HCT116", "NCIH460", "FUUR1", "STFE", "UOK109"]].T
DFCI_chronos_CCLs.columns = DFCI_chronos_CCLs.iloc[0]
DFCI_chronos_CCLs.columns.name = None
DFCI_chronos_CCLs.drop(DFCI_chronos_CCLs.index[0], inplace=True)
DFCI_chronos_CCLs = DFCI_chronos_CCLs.loc[:, ~(DFCI_chronos_CCLs == 'Unknown').any(axis=0)].apply(pd.to_numeric, errors='coerce').dropna(axis=1)

tRCC_chronos_gene_effects = DFCI_chronos_CCLs.loc[["FUUR1", "STFE", "UOK109"]]
tRCC_chronos_gene_effects.sort_index(axis=0, inplace=True)
tRCC_chronos_gene_effects.sort_index(axis=1, inplace=True)

### Standardise Rows and Columns Between dataframes
###################################################
depmap_gene_exp_23Q2, depmap_gene_effect_23Q2 = af.Intersect_DF([depmap_gene_exp_23Q2, depmap_gene_effect_23Q2])
tRCC_gene_exp, tRCC_chronos_gene_effects = af.Intersect_DF([tRCC_gene_exp, tRCC_chronos_gene_effects])
depmap_gene_exp_23Q2, depmap_gene_effect_23Q2, tRCC_gene_exp, tRCC_chronos_gene_effects = af.Intersect_DF([depmap_gene_exp_23Q2, depmap_gene_effect_23Q2, tRCC_gene_exp, tRCC_chronos_gene_effects], match="columns")

print("DepMap dataset row/column check: ", af.Check_DF_Similarity([depmap_gene_exp_23Q2, depmap_gene_effect_23Q2]))
print("tRCC dataset row/column check: ", af.Check_DF_Similarity([tRCC_gene_exp, tRCC_chronos_gene_effects]))
print("Check columns across DepMap and tRCC dataframes: ", af.Check_DF_Similarity([depmap_gene_exp_23Q2, depmap_gene_effect_23Q2, tRCC_gene_exp, tRCC_chronos_gene_effects], check="columns"))

### RCC Gene Effect Scores
##########################
depmap_RCC_data = pd.read_csv("datasets/depmap_datasets/CCLIDs/RCC_depmap_data.csv")
RCC_depmap_gene_exp_23Q2 = depmap_gene_exp_23Q2[depmap_gene_exp_23Q2.index.isin(depmap_RCC_data["depmapId"])]
RCC_depmap_gene_effect_23Q2 = depmap_gene_effect_23Q2[depmap_gene_effect_23Q2.index.isin(depmap_RCC_data["depmapId"])]

### ccRCC Gene Effect Scores
############################
depmap_ccRCC_data = pd.read_csv('datasets/depmap_datasets/CCLIDs/ccRCC_depmap_data.csv')
ccRCC_depmap_gene_exp_23Q2 = depmap_gene_exp_23Q2[depmap_gene_exp_23Q2.index.isin(depmap_ccRCC_data["depmapId"])]
ccRCC_depmap_gene_effect_23Q2 = depmap_gene_effect_23Q2[depmap_gene_effect_23Q2.index.isin(depmap_ccRCC_data["depmapId"])]

### Essential Genes
###################
top_common_essential_genes = pd.read_csv("analysis/testGenes/top_common_essential_genes")
top_essential_genes = pd.read_csv("analysis/testGenes/top_essential_genes")

DepMap dataset row/column check:  {'all_match': True, 'column_differences': [[], []], 'row_differences': [[], []]}
tRCC dataset row/column check:  {'all_match': True, 'column_differences': [[], []], 'row_differences': [[], []]}
Check columns across DepMap and tRCC dataframes:  {'all_match': True, 'column_differences': [[], [], [], []]}


## Linear models on Common Essential Genes



In [23]:
geneModels = {}
for gene in top_common_essential_genes["gene"]:
    
    X = depmap_gene_exp_23Q2[gene].values.reshape(-1, 1)
    Y = depmap_gene_effect_23Q2[gene]

    model = LinearRegression()
    model.fit(X, Y)
    geneModels[gene] = model

predictions = {}
for gene, model in geneModels.items():
    X_test = tRCC_gene_exp[gene].values.reshape(-1, 1)
    predictions[gene] = model.predict(X_test)

for gene, values in predictions.items():
    print("Gene: {0}\nPredicted {1}, True: {2}\n".format(gene, values, tRCC_chronos_gene_effects[gene].to_list()))

Gene: KRT18
Predicted [0.04111441 0.02350891 0.08593388], True: [-1.293836514, -1.352427687, -1.27570749]

Gene: RPEL1
Predicted [0.07407084 0.07474453 0.07507805], True: [-1.080619993, -1.109748257, -1.206364839]

Gene: TFE3
Predicted [-0.04144401 -0.04280626 -0.03242379], True: [-1.209508416, -1.186803415, -1.111237163]

Gene: PMVK
Predicted [-0.4044655  -0.40048788 -0.36726137], True: [-1.420036935, -1.29704714, -1.297872565]

Gene: AQP7
Predicted [-0.43980455 -0.43341001 -0.43803025], True: [-1.50661814, -1.446371693, -1.469061472]

Gene: CENPI
Predicted [-0.59122111 -0.56802708 -0.5980506 ], True: [-1.575822893, -1.542695758, -1.577641935]

Gene: PGM3
Predicted [-0.34612031 -0.29315629 -0.38051931], True: [-1.179829614, -1.20170109, -1.290588503]

Gene: TMED10
Predicted [-0.60256323 -0.60148922 -0.60173331], True: [-1.540005463, -1.487532686, -1.447838877]

Gene: KAT5
Predicted [-0.65311153 -0.65449452 -0.65100205], True: [-1.537572855, -1.555941768, -1.522524601]

Gene: INTS10
Pr

In [21]:
print(depmap_gene_exp_23Q2["KRT18"])
print(abs(depmap_gene_effect_23Q2["PET117"]).mean())
print(tRCC_gene_exp["KRT18"])

ACH-000001    10.087595
ACH-000004     5.604664
ACH-000005     4.493775
ACH-000007    10.545447
ACH-000009    11.966253
                ...    
ACH-002800     9.911722
ACH-002834     7.295264
ACH-002847     7.480992
ACH-002922     4.837943
ACH-002926     2.207893
Name: KRT18, Length: 1019, dtype: float64
0.2711419214896689
FUUR1     7.052868
STFE      7.782717
UOK109    5.194844
Name: KRT18, dtype: float64


In [24]:
geneModels = {}
for gene in top_common_essential_genes["gene"]:
    
    X = RCC_depmap_gene_exp_23Q2[gene].values.reshape(-1, 1)
    Y = RCC_depmap_gene_effect_23Q2[gene]

    model = LinearRegression()
    model.fit(X, Y)
    geneModels[gene] = model

predictions = {}
for gene, model in geneModels.items():
    X_test = tRCC_gene_exp[gene].values.reshape(-1, 1)
    predictions[gene] = model.predict(X_test)

for gene, values in predictions.items():
    print("Gene: {0}\nPredicted {1}, True: {2}\n".format(gene, values, tRCC_chronos_gene_effects[gene].to_list()))

Gene: KRT18
Predicted [0.12207871 0.07320232 0.24650654], True: [-1.293836514, -1.352427687, -1.27570749]

Gene: RPEL1
Predicted [0.08822779 0.09091186 0.09224068], True: [-1.080619993, -1.109748257, -1.206364839]

Gene: TFE3
Predicted [0.13753907 0.15108332 0.04785482], True: [-1.209508416, -1.186803415, -1.111237163]

Gene: PMVK
Predicted [-0.33606103 -0.33102263 -0.28893507], True: [-1.420036935, -1.29704714, -1.297872565]

Gene: AQP7
Predicted [-0.54552717 -0.52749228 -0.54052299], True: [-1.50661814, -1.446371693, -1.469061472]

Gene: CENPI
Predicted [-0.47087355 -0.48085154 -0.46793553], True: [-1.575822893, -1.542695758, -1.577641935]

Gene: PGM3
Predicted [-0.44579996 -0.40356483 -0.47323077], True: [-1.179829614, -1.20170109, -1.290588503]

Gene: TMED10
Predicted [-0.79003079 -0.79483613 -0.79374403], True: [-1.540005463, -1.487532686, -1.447838877]

Gene: KAT5
Predicted [-0.45689555 -0.41810731 -0.51605965], True: [-1.537572855, -1.555941768, -1.522524601]

Gene: INTS10
Predi

In [26]:
geneModels = {}
for gene in top_common_essential_genes["gene"]:
    
    X = ccRCC_depmap_gene_exp_23Q2[gene].values.reshape(-1, 1)
    Y = ccRCC_depmap_gene_effect_23Q2[gene]

    model = LinearRegression()
    model.fit(X, Y)
    geneModels[gene] = model

predictions = {}
for gene, model in geneModels.items():
    X_test = tRCC_gene_exp[gene].values.reshape(-1, 1)
    predictions[gene] = model.predict(X_test)

for gene, values in predictions.items():
    print("Gene: {0}\nPredicted {1}, True: {2}\n".format(gene, values, tRCC_chronos_gene_effects[gene].to_list()))

Gene: KRT18
Predicted [0.25296251 0.27090576 0.2072832 ], True: [-1.293836514, -1.352427687, -1.27570749]

Gene: RPEL1
Predicted [0.12165675 0.12293357 0.1235657 ], True: [-1.080619993, -1.109748257, -1.206364839]

Gene: TFE3
Predicted [0.21260192 0.2231801  0.14255767], True: [-1.209508416, -1.186803415, -1.111237163]

Gene: PMVK
Predicted [0.46457684 0.45859135 0.4085925 ], True: [-1.420036935, -1.29704714, -1.297872565]

Gene: AQP7
Predicted [ 0.41201822 -0.05572508  0.28223268], True: [-1.50661814, -1.446371693, -1.469061472]

Gene: CENPI
Predicted [0.44312181 0.4611639  0.43780931], True: [-1.575822893, -1.542695758, -1.577641935]

Gene: PGM3
Predicted [0.41047908 0.40907514 0.4113909 ], True: [-1.179829614, -1.20170109, -1.290588503]

Gene: TMED10
Predicted [0.75707048 0.76557236 0.76364015], True: [-1.540005463, -1.487532686, -1.447838877]

Gene: KAT5
Predicted [0.4685856  0.41333154 0.55286518], True: [-1.537572855, -1.555941768, -1.522524601]

Gene: INTS10
Predicted [0.1639595