# CSE 280A Project

In [64]:
import pandas as pd
import numpy as np
from scipy import stats
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt

In [2]:
## Load the reference table (one classification per sample).
## Only seven columns are read, selected by their position in the file.
ref = pd.read_csv(
    "../data/pcaa-master-platinum_set_2020.tsv",
    sep="\t",
    header=0,
    usecols=[0, 7, 8, 12, 13, 28, 32],
)
print(ref.shape)
ref.head()

(3212, 7)


Unnamed: 0,study,patient_barcode,sample_barcode,min_cnv,number_amp_region,sample_classification_nam_2019,sample_classification
0,PCWG,DO51086,SA530559,4,30,,Non-circular
1,PCWG,DO48759,SA515463,4,5,,Non-circular
2,PCWG,DO48916,SA517420,4,0,,No SCNA detected
3,PCWG,DO51046,SA530430,4,2,,No SCNA detected
4,PCWG,DO45251,SA501507,4,3,,Circular


In [3]:
## Restrict the reference table to rows whose study is TCGA
is_tcga = ref['study'].eq('TCGA')
ref = ref.loc[is_tcga]
print(ref.shape)
ref.head()

(1921, 7)


Unnamed: 0,study,patient_barcode,sample_barcode,min_cnv,number_amp_region,sample_classification_nam_2019,sample_classification
1291,TCGA,TCGA-02-2483,TCGA-02-2483-01,1,20,Distal,Heavily-rearranged
1292,TCGA,TCGA-02-2485,TCGA-02-2485-01,1,33,Circular,Circular
1293,TCGA,TCGA-04-1331,TCGA-04-1331-01,1,4,Linear,Non-circular
1294,TCGA,TCGA-04-1347,TCGA-04-1347-01,1,12,Circular,Circular
1295,TCGA,TCGA-04-1349,TCGA-04-1349-01,1,26,Distal,Heavily-rearranged


In [4]:
## Load the oncoprint matrix obtained by querying genes against all TCGA studies.
## File lines 1-4 are skipped here; line 0 supplies the column names.
res = pd.read_csv(
    "../data/PATIENT_DATA_oncoprint.tsv",
    sep="\t",
    header=0,
    skiprows=[1, 2, 3, 4],
)
print(res.shape)
res.head()

(315, 10955)


Unnamed: 0,track_name,track_type,TCGA-ER-A195,TCGA-XK-AAJA,TCGA-CH-5788,TCGA-OR-A5JX,TCGA-CJ-4886,TCGA-EQ-A4SO,TCGA-CE-A3MD,TCGA-DM-A28K,...,TCGA-97-A4M2,TCGA-97-7552,TCGA-97-8552,TCGA-98-A53C,TCGA-98-A53D,TCGA-98-A53H,TCGA-99-AA5R,TCGA-13-2066,TCGA-BP-4345,TCGA-OR-A5OG
0,APC,CNA,homdel_rec,homdel_rec,homdel_rec,Amplification,Amplification,,,homdel_rec,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [32]:
## Additional information rows of the oncoprint file: only file lines 0-4 are
## kept (line 0 is the header); every line from index 5 onward is skipped.
info = pd.read_csv(
    "../data/PATIENT_DATA_oncoprint.tsv",
    sep="\t",
    header=0,
    skiprows=lambda line_no: line_no >= 5,
)
print(info.shape)
info.head()

(4, 10955)


Unnamed: 0,track_name,track_type,TCGA-ER-A195,TCGA-XK-AAJA,TCGA-CH-5788,TCGA-OR-A5JX,TCGA-CJ-4886,TCGA-EQ-A4SO,TCGA-CE-A3MD,TCGA-DM-A28K,...,TCGA-97-A4M2,TCGA-97-7552,TCGA-97-8552,TCGA-98-A53C,TCGA-98-A53D,TCGA-98-A53H,TCGA-99-AA5R,TCGA-13-2066,TCGA-BP-4345,TCGA-OR-A5OG
0,Study of origin,CLINICAL,"Skin Cutaneous Melanoma (TCGA, PanCancer Atlas)","Prostate Adenocarcinoma (TCGA, PanCancer Atlas)","Prostate Adenocarcinoma (TCGA, PanCancer Atlas)","Adrenocortical Carcinoma (TCGA, PanCancer Atlas)","Kidney Renal Clear Cell Carcinoma (TCGA, PanCa...","Stomach Adenocarcinoma (TCGA, PanCancer Atlas)","Thyroid Carcinoma (TCGA, PanCancer Atlas)","Colorectal Adenocarcinoma (TCGA, PanCancer Atlas)",...,"Lung Adenocarcinoma (TCGA, PanCancer Atlas)","Lung Adenocarcinoma (TCGA, PanCancer Atlas)","Lung Adenocarcinoma (TCGA, PanCancer Atlas)","Lung Squamous Cell Carcinoma (TCGA, PanCancer ...","Lung Squamous Cell Carcinoma (TCGA, PanCancer ...","Lung Squamous Cell Carcinoma (TCGA, PanCancer ...","Lung Adenocarcinoma (TCGA, PanCancer Atlas)","Ovarian Serous Cystadenocarcinoma (TCGA, PanCa...","Kidney Renal Clear Cell Carcinoma (TCGA, PanCa...","Adrenocortical Carcinoma (TCGA, PanCancer Atlas)"
1,# Samples per Patient,CLINICAL,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,Profiled for copy number alterations,CLINICAL,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,,,
3,Profiled for mutations,CLINICAL,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,...,Yes,Yes,Yes,Yes,,Yes,Yes,,,


In [5]:
## Keep only the patient columns that have an ecDNA classification in `ref`
## (the two leading metadata columns are always retained)
classified = res.columns.isin(ref['patient_barcode'])
res = res[['track_name', 'track_type', *res.columns[classified]]]
print(res.shape)
res.head()


(315, 1902)


Unnamed: 0,track_name,track_type,TCGA-CH-5788,TCGA-FS-A1ZG,TCGA-DD-A3A8,TCGA-AD-6964,TCGA-A2-A0EY,TCGA-EJ-7784,TCGA-EJ-5531,TCGA-CH-5748,...,TCGA-VD-A8K8,TCGA-VD-A8KA,TCGA-VD-A8KD,TCGA-VD-A8KE,TCGA-VD-A8KF,TCGA-VD-A8KH,TCGA-VD-A8KL,TCGA-VD-A8KN,TCGA-VD-AA8O,TCGA-VD-AA8P
0,APC,CNA,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [42]:
## Group the columns into ecDNA+ vs ecDNA-
## Earlier draft (disabled; the active "Strategy 1" cell further down uses a different split): ecDNA- = Non-circular; ecDNA+ = otherwise
# ecdna_neg = ref[ref['sample_classification'] == 'Non-circular']['patient_barcode']
# ecdna_pos = ref[ref['sample_classification'] != 'Non-circular']['patient_barcode']
# pos1 = list(res.columns[res.columns.isin(ecdna_pos)])
# print("Oncoprint1: {}, {}".format(pos1[-1], len(pos1)))
# cols_met1 = ['track_name', 'track_type'] + pos1 + list(res.columns[res.columns.isin(ecdna_neg)])
# df1 = res[cols_met1]
# print(df1.shape)
# df1.head()

In [39]:
## Save this oncoprint matrix
# df1.to_csv("../data/oncoprint_1.tsv", sep="\t", index=False)

In [105]:
# ## Strategy 2: ecDNA+ = Circular; ecDNA- = otherwise
# ecdna_pos = ref[ref['sample_classification'] == 'Circular']['patient_barcode']
# ecdna_neg = ref[ref['sample_classification'] != 'Circular']['patient_barcode']
# pos = res.columns[res.columns.isin(ecdna_pos)].tolist()
# print("Oncoprint: {}, {}".format(pos[-1], len(pos)))
# cols_met = ['track_name', 'track_type'] + pos + res.columns[res.columns.isin(ecdna_neg)].tolist()
# df = res[cols_met]
# print(df.shape)
# df.head()

In [49]:
## Save this oncoprint matrix
# df.to_csv("../data/oncoprint.tsv", sep="\t", index=False)

In [6]:
## Reference rows whose patient barcode appears among the oncoprint columns
in_oncoprint = ref['patient_barcode'].isin(res.columns)
ref2 = ref.loc[in_oncoprint]
print(ref2.shape)
ref2.head()

(1900, 7)


Unnamed: 0,study,patient_barcode,sample_barcode,min_cnv,number_amp_region,sample_classification_nam_2019,sample_classification
1291,TCGA,TCGA-02-2483,TCGA-02-2483-01,1,20,Distal,Heavily-rearranged
1292,TCGA,TCGA-02-2485,TCGA-02-2485-01,1,33,Circular,Circular
1293,TCGA,TCGA-04-1331,TCGA-04-1331-01,1,4,Linear,Non-circular
1294,TCGA,TCGA-04-1347,TCGA-04-1347-01,1,12,Circular,Circular
1295,TCGA,TCGA-04-1349,TCGA-04-1349-01,1,26,Distal,Heavily-rearranged


In [7]:
## Tally how many rows of `ref2` fall into each sample classification
cnt = Counter()
cnt.update(ref2['sample_classification'])
cnt

Counter({'Heavily-rearranged': 190,
         'Circular': 335,
         'Non-circular': 165,
         'No SCNA detected': 1041,
         'BFB': 169})

In [8]:
## For every track type, gather the distinct non-missing values that occur
## in any sample column (columns 0-1 are track_name / track_type)
entry = {}
for track in res['track_type'].unique():
    track_rows = res[res['track_type'] == track]
    seen = set()
    for col in res.columns[2:]:
        seen.update(track_rows[col].dropna().unique())
    entry[track] = seen
entry


{'CNA': {'Amplification', 'Deep Deletion', 'amp_rec', 'homdel_rec'},
 'MUTATIONS': {'Inframe Mutation (putative driver)',
  'Inframe Mutation (putative passenger)',
  'Missense Mutation (putative driver)',
  'Missense Mutation (putative passenger)',
  'Truncating mutation (putative driver)',
  'Truncating mutation (putative passenger)',
  'splice',
  'splice_rec'},
 'MRNA': set(),
 'PROTEIN': set(),
 'FUSION': {'Fusion'}}

In [9]:
## The mRNA and protein tracks contain no entries, so remove them one after
## the other, printing the matrix shape after each removal
for empty_track in ('MRNA', 'PROTEIN'):
    res = res[res['track_type'] != empty_track]
    print(res.shape)
res.head()

(252, 1902)
(189, 1902)


Unnamed: 0,track_name,track_type,TCGA-CH-5788,TCGA-FS-A1ZG,TCGA-DD-A3A8,TCGA-AD-6964,TCGA-A2-A0EY,TCGA-EJ-7784,TCGA-EJ-5531,TCGA-CH-5748,...,TCGA-VD-A8K8,TCGA-VD-A8KA,TCGA-VD-A8KD,TCGA-VD-A8KE,TCGA-VD-A8KF,TCGA-VD-A8KH,TCGA-VD-A8KL,TCGA-VD-A8KN,TCGA-VD-AA8O,TCGA-VD-AA8P
0,APC,CNA,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,homdel_rec,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [10]:
## Strategy 1: ecDNA+ = Circular; ecDNA- = otherwise
# Patient barcodes of each group, taken from the reference classification
ecdna_pos = ref[ref['sample_classification'] == 'Circular']['patient_barcode']
ecdna_neg = ref[ref['sample_classification'] != 'Circular']['patient_barcode']
# ecDNA+ patients that are present as columns of the oncoprint matrix
pos = res.columns[res.columns.isin(ecdna_pos)].tolist()
n_pos = len(pos)
# Last ecDNA+ column and group size (the closing parenthesis was missing here)
print("Oncoprint: {}, {}".format(pos[-1], n_pos))
# Column layout of `df`: 2 metadata columns, then all ecDNA+ patients, then all ecDNA- patients.
# Later cells rely on this ordering when slicing by `n_pos`.
cols_met = ['track_name', 'track_type'] + pos + res.columns[res.columns.isin(ecdna_neg)].tolist()
df = res[cols_met]
print(df.shape)
df.head()

Oncoprint: TCGA-G2-A2EK, 335
(189, 1902)


Unnamed: 0,track_name,track_type,TCGA-D3-A2JC,TCGA-DX-A23R,TCGA-05-4402,TCGA-D7-6528,TCGA-A6-2677,TCGA-CV-6948,TCGA-A6-5656,TCGA-BR-4267,...,TCGA-VD-A8K8,TCGA-VD-A8KA,TCGA-VD-A8KD,TCGA-VD-A8KE,TCGA-VD-A8KF,TCGA-VD-A8KH,TCGA-VD-A8KL,TCGA-VD-A8KN,TCGA-VD-AA8O,TCGA-VD-AA8P
0,APC,CNA,homdel_rec,Amplification,,,,,,,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [101]:
## Write the strategy-1 matrix `df` to a tab-separated file, without the row index
df.to_csv(
    "../data/oncoprint1.tsv",
    index=False,
    sep="\t",
)

In [21]:
## Strategy 2: ecDNA+ = Circular; ecDNA- = no SCNA detected
classification = ref['sample_classification']
ecdna_pos = ref.loc[classification == 'Circular', 'patient_barcode']
ecdna_neg = ref.loc[classification == 'No SCNA detected', 'patient_barcode']

# Oncoprint columns belonging to each group, in their existing column order
pos = res.columns[res.columns.isin(ecdna_pos)].tolist()
print("Oncoprint: {}, {}".format(pos[-1], len(pos)))
neg = res.columns[res.columns.isin(ecdna_neg)].tolist()
n_neg2 = len(neg)

# Metadata columns first, then ecDNA+ patients, then ecDNA- patients
cols_met = ['track_name', 'track_type', *pos, *neg]
df2 = res[cols_met]
print(df2.shape)
df2.head()

Oncoprint: TCGA-G2-A2EK, 335
(189, 1378)


Unnamed: 0,track_name,track_type,TCGA-D3-A2JC,TCGA-DX-A23R,TCGA-05-4402,TCGA-D7-6528,TCGA-A6-2677,TCGA-CV-6948,TCGA-A6-5656,TCGA-BR-4267,...,TCGA-VD-A8K7,TCGA-VD-A8KA,TCGA-VD-A8KD,TCGA-VD-A8KE,TCGA-VD-A8KF,TCGA-VD-A8KH,TCGA-VD-A8KL,TCGA-VD-A8KN,TCGA-VD-AA8O,TCGA-VD-AA8P
0,APC,CNA,homdel_rec,Amplification,,,,,,,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [104]:
## Write the strategy-2 matrix `df2` to a tab-separated file, without the row index
df2.to_csv(
    "../data/oncoprint2.tsv",
    index=False,
    sep="\t",
)

## Using the strategy 1 matrix

### 1) Loss of Function

In [13]:
## Shape and first rows of the strategy-1 matrix `df` used in this section
print(df.shape)
df.head()

(189, 1902)


Unnamed: 0,track_name,track_type,TCGA-D3-A2JC,TCGA-DX-A23R,TCGA-05-4402,TCGA-D7-6528,TCGA-A6-2677,TCGA-CV-6948,TCGA-A6-5656,TCGA-BR-4267,...,TCGA-VD-A8K8,TCGA-VD-A8KA,TCGA-VD-A8KD,TCGA-VD-A8KE,TCGA-VD-A8KF,TCGA-VD-A8KH,TCGA-VD-A8KL,TCGA-VD-A8KN,TCGA-VD-AA8O,TCGA-VD-AA8P
0,APC,CNA,homdel_rec,Amplification,,,,,,,...,,,,,,,,,,
1,ARHGEF12,CNA,,,,,,,,,...,,,,,,,,,,
2,ATM,CNA,,,,,,,,,...,,,,,,,,,,
3,BCL11B,CNA,,,,,,,,,...,,,,,,,,,,
4,BLM,CNA,,,,,,,,,...,,,,,,,,,,


In [59]:
## Prepare the numerical (gene x sample) version of the oncoprint matrix
genes = df['track_name'].unique()
n_gene = len(genes)
n_samples = df.shape[1] - 2      # all columns except track_name / track_type
n_neg = n_samples - n_pos        # ecDNA- columns follow the n_pos ecDNA+ columns
print("{} ecDNA+ samples; {} ecDNA- samples".format(n_pos, n_neg))
# One row per gene, one column per sample; starts out all zeros
L1 = np.zeros((n_gene, n_samples))

335 ecDNA+ samples; 1565 ecDNA- samples


In [24]:
## Zero-filled (gene x sample) matrix with the same shape as L1
## NOTE(review): presumably the gain-of-function counterpart of L1; it is not filled in the cells shown here
G1 = np.zeros((len(genes), df.shape[1] - 2))

In [23]:
## All possible entry values per track type (the `entry` dict built from `res` above)
entry

{'CNA': {'Amplification', 'Deep Deletion', 'amp_rec', 'homdel_rec'},
 'MUTATIONS': {'Inframe Mutation (putative driver)',
  'Inframe Mutation (putative passenger)',
  'Missense Mutation (putative driver)',
  'Missense Mutation (putative passenger)',
  'Truncating mutation (putative driver)',
  'Truncating mutation (putative passenger)',
  'splice',
  'splice_rec'},
 'MRNA': set(),
 'PROTEIN': set(),
 'FUSION': {'Fusion'}}

In [39]:
## Entry values that are counted as loss-of-function (LoF) alterations
loss = [
    'Truncating mutation (putative driver)',
    'Truncating mutation (putative passenger)',
    'Missense Mutation (putative driver)',
    'Inframe Mutation (putative driver)',
    'Deep Deletion',
    'homdel_rec',
]

In [40]:
## Entry values that are counted as gain-of-function (GoF) alterations
gain = [
    'Amplification',
    'amp_rec',
]

In [37]:
## Dry run on a single gene before looping over all genes.
## The original cell ended on a `for` header with no body (a SyntaxError);
## the per-column loop is replaced by one vectorised count.
gene = 'APC'
# All tracks of this gene except FUSION
df_sel = df[(df['track_name'] == gene) & (df['track_type'] != 'FUSION')]
# For every sample column: how many of the gene's tracks hold a LoF entry
lof_per_sample = df_sel.iloc[:, 2:].isin(loss).sum(axis=0)
lof_per_sample.head()


In [None]:
## Fill L1 gene by gene, ignoring FUSION tracks for now.
## Fixes over the previous version: the sample loop used df.shape[2]
## (IndexError on a 2-D frame) and the count line ended in a stray '.'
## (SyntaxError). The per-sample loop is replaced by one vectorised
## column-wise operation per gene.
for i in range(n_gene):
    df_sel = df[(df['track_name'] == genes[i]) & (df['track_type'] != 'FUSION')]
    # Sample columns only (columns 0-1 are track_name / track_type)
    is_lof = df_sel.iloc[:, 2:].isin(loss)
    ## Either way to count LoF
    L1[i, :] = is_lof.sum(axis=0).to_numpy()      # method 1: number of LoF entries per sample
    # L1[i, :] = is_lof.any(axis=0).to_numpy()    # method 2: 1 if the sample has any LoF entry


In [None]:
## Construct the contingency table and perform statistical tests
## i.e.    ecDNA+ | ecDNA-
## Loss |
## Not  |
## Each cell of the table counts SAMPLES, so L1 is first reduced to a
## "has at least one LoF entry" indicator. This keeps the table valid
## whether L1 holds counts (method 1) or 0/1 flags (method 2); with raw
## counts, n_pos - loss_pos could become negative.
has_loss = L1 > 0
loss_pos = np.sum(has_loss[:, :n_pos], axis=1)   # ecDNA+ columns come first in df
loss_neg = np.sum(has_loss[:, n_pos:], axis=1)

# One p-value per gene; chi2 stays NaN where the test cannot be computed
fisher, chi2 = np.ones(n_gene), np.full(n_gene, np.nan)
alpha = 0.05
# Iterate over gene positions (iterating over `genes` yields names, which
# cannot index the numpy arrays)
for i in range(n_gene):
    contigency_table = np.array([[loss_pos[i], loss_neg[i]],
                                 [n_pos - loss_pos[i], n_neg - loss_neg[i]]])
    # one-sided if our hypothesis is that LoF mutations are MORE frequent in ecDNA+ samples
    # since we only have tumor suppressor genes
    _, fisher[i] = stats.fisher_exact(contigency_table, alternative="greater")
    try:
        _, chi2[i], _, _ = stats.chi2_contingency(contigency_table)
    except ValueError:
        # chi2_contingency rejects tables with a zero expected frequency
        # (e.g. a gene with no LoF sample at all); leave the p-value as NaN
        chi2[i] = np.nan

In [None]:
## Compute the magnitude = log2(#LoF samples / #non-LoF samples) for every gene
# Count samples with at least one LoF entry so that the denominator
# (total samples - LoF samples) is meaningful even if L1 holds event counts.
n_total = n_pos + n_neg
n_lof = np.count_nonzero(L1 > 0, axis=1)
# Genes with no LoF sample (log2(0)) or only LoF samples (division by zero)
# have no finite magnitude; compute quietly and mark them as NaN.
with np.errstate(divide='ignore', invalid='ignore'):
    magnitude = np.log2(n_lof / (n_total - n_lof))
magnitude[~np.isfinite(magnitude)] = np.nan

In [None]:
## Volcano plot: -log10(Fisher p-value) against the LoF magnitude of each gene
# `fisher` / `chi2` keep the raw p-values; the transformed values get their own
# names so that re-running this cell does not apply -log10 a second time.
with np.errstate(divide='ignore'):
    fisher_nlp = -np.log10(fisher)
    chi2_nlp = -np.log10(chi2)

f, ax = plt.subplots(figsize=(12,8))
ax.scatter(magnitude, fisher_nlp)
# Significance threshold as a horizontal line across the whole axes; this does
# not depend on the order (or finiteness) of the magnitude values
ax.axhline(-np.log10(alpha), color='red', linestyle='dashed')
# Raw strings avoid invalid '\l' escapes; '#' is kept outside math mode and the
# subscript is braced so that "10" (not just "1") is rendered as the subscript
ax.set_xlabel(r'$\log_2$(#LoF / #non-LoF)')
ax.set_ylabel(r'$-\log_{10}$(P-value)')
ax.set_title('P-values vs. Magnitude')
plt.show()
