In [1]:
import sys
import os
import csv
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import umap.umap_ as umap
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import math
from itertools import combinations
import matplotlib as mpl
import random
from sklearn.cluster import AgglomerativeClustering
from sklearn.linear_model import LogisticRegression
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay,classification_report
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, label_binarize, StandardScaler
from sklearn.cluster import KMeans 
from scipy.spatial.distance import cdist
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
import statsmodels.stats.multitest as multi
from sklearn.feature_selection import RFE
import matplotlib.ticker as tck
from matplotlib import patches as mpatches
from pandas.plotting import parallel_coordinates
from mpl_toolkits.axes_grid1 import make_axes_locatable


# Initialization (TODO: double-check how patient IDs are derived for GTEX samples)

In [2]:
# Gene symbols of the 27 integrin genes of interest, in lexicographic order.
integrin_list = [
    # ITGA* genes
    'ITGA1', 'ITGA10', 'ITGA11', 'ITGA2', 'ITGA2B', 'ITGA3',
    'ITGA4', 'ITGA5', 'ITGA6', 'ITGA7', 'ITGA8', 'ITGA9',
    'ITGAD', 'ITGAE', 'ITGAL', 'ITGAM', 'ITGAV', 'ITGAX',
    # ITGB* genes (including ITGBL1)
    'ITGB1', 'ITGB2', 'ITGB3', 'ITGB4', 'ITGB5', 'ITGB6',
    'ITGB7', 'ITGB8', 'ITGBL1',
]

In [3]:
# Preview the per-sample phenotype table (tab-separated, latin-1 encoded).
# The cell's last expression renders the frame so its columns can be inspected.
pan_tcga_sample_info = pd.read_csv(
    "TcgaTargetGTEX_phenotype.txt.gz",
    sep='\t',
    encoding='latin-1',
)
pan_tcga_sample_info


Unnamed: 0,sample,detailed_category,primary disease or tissue,_primary_site,_sample_type,_gender,_study
0,TCGA-V4-A9EE-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
1,TCGA-VD-AA8N-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
2,TCGA-V4-A9EI-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
3,TCGA-VD-AA8O-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
4,TCGA-WC-A888-01,Uveal Melanoma,Uveal Melanoma,Eye,Primary Tumor,Male,TCGA
...,...,...,...,...,...,...,...
19126,TARGET-20-PANPKN-09,Acute Myeloid Leukemia,Acute Myeloid Leukemia,White blood cell,Primary Blood Derived Cancer - Bone Marrow,Male,TARGET
19127,TARGET-20-PANLIR-09,Acute Myeloid Leukemia,Acute Myeloid Leukemia,White blood cell,Primary Blood Derived Cancer - Bone Marrow,Male,TARGET
19128,TARGET-20-PAPAWN-09,Acute Myeloid Leukemia,Acute Myeloid Leukemia,White blood cell,Primary Blood Derived Cancer - Bone Marrow,Male,TARGET
19129,TARGET-20-PANPTM-09,Acute Myeloid Leukemia,Acute Myeloid Leukemia,White blood cell,Primary Blood Derived Cancer - Bone Marrow,Male,TARGET


In [None]:
# ---------------------------------------------------------------------------
# Load the expression matrix, relabel its rows with gene names, and load the
# matching sample annotations.
#
# Produces (module-level names used by later cells):
#   pan_tcga             - expression matrix, rows = gene names, columns = sample ids
#   pan_tcga_sample_info - phenotype table indexed by sample id, without TARGET
#   gene_map, gene_map_dict, gene_id, gene_name, gene_duplicate, duplicate_drop
# ---------------------------------------------------------------------------

# Expression table: the 'sample' column holds the gene identifiers (they are
# matched against gene_map['id'] below); every other column is one sample.
# set_index already returns a new frame, so no extra .copy() of this very
# large table is needed.
pan_tcga = pd.read_csv("TcgaTargetGtex_rsem_gene_tpm.gz", sep='\t').set_index('sample')

#remove TARGET dataset columns
pan_tcga = pan_tcga.loc[:, ~pan_tcga.columns.str.contains('TARGET-')]

# Gene id -> gene name map.
gene_map = pd.read_csv("probeMap_gencode.v23.annotation.gene.probemap", sep='\t')
# De-duplicate on the gene id column. (Checking the positional RangeIndex that
# read_csv creates would never find a duplicate.)
gene_map = gene_map.drop_duplicates(subset='id', keep='first')
gene_id = gene_map['id'].tolist()
gene_name = gene_map['gene'].tolist()
print('before dropping duplicates',len(gene_id),len(gene_name))

# Every occurrence of a gene name after its first one; these ids are removed
# from both the map and the expression matrix so gene names are unique labels.
gene_duplicate = gene_map[gene_map.duplicated(subset="gene")].sort_values('gene', axis='index')
gene_map = gene_map.drop_duplicates(subset="gene", keep='first')
# gene_id holds gene ids here as well (not row positions), matching its use above.
gene_id = gene_map['id'].tolist()
gene_name = gene_map['gene'].tolist()
print('after dropping duplicates',len(gene_id),len(gene_name))

# Drop all those duplicated genes from gene expression DataFrame
duplicate_drop = gene_duplicate['id'].values  # gene ids whose gene name was already seen
pan_tcga = pan_tcga.drop(duplicate_drop, axis='index')

# Relabel expression rows: gene id -> gene name. Mapping the index directly
# avoids an outer concat with the probemap, which could add all-NaN rows for
# ids that are absent from the expression matrix.
gene_map_dict = dict(zip(gene_map['id'], gene_map['gene']))
gene_labels = pan_tcga.index.map(gene_map_dict)
unmapped_rows = gene_labels.isna()
if unmapped_rows.any():
    # Rows whose id has no entry in the probemap cannot be named; leaving them
    # with a NaN label would break the string matching done in later cells.
    pan_tcga = pan_tcga.loc[~unmapped_rows]
    gene_labels = gene_labels[~unmapped_rows]
pan_tcga.index = gene_labels.rename('gene')

#sample info
pan_tcga_sample_info = pd.read_csv(
    "TcgaTargetGTEX_phenotype.txt.gz", sep='\t', encoding='latin-1'
).set_index('sample')
# Keep non-TARGET samples only, and only those with complete annotations.
pan_tcga_sample_info = pan_tcga_sample_info[pan_tcga_sample_info['_study'] != 'TARGET']
pan_tcga_sample_info = pan_tcga_sample_info.dropna(axis=0, how='any')




In [None]:
# ---------------------------------------------------------------------------
# Stack the expression matrix and the sample annotations into one frame:
# rows = gene names followed by annotation fields, columns = sample ids.
# ---------------------------------------------------------------------------

def _patient_id(sample_id):
    """Return the patient/donor field of a hyphen-separated sample id.

    GTEX ids have the form 'GTEX-<donor>-<tissue site>-SM-<aliquot>', so the
    donor is the second field. For every other id the third field is used,
    e.g. 'TCGA-V4-A9EE-01' -> 'A9EE' (unchanged from the previous behaviour).
    """
    fields = sample_id.split('-')
    return fields[1] if fields[0] == 'GTEX' else fields[2]


# Samples that have both expression values and annotations. The column labels
# are read directly; transposing the whole matrix just to get them is wasteful.
common_sample_list = pan_tcga.columns.intersection(pan_tcga_sample_info.index)

annotation_block = pan_tcga_sample_info.loc[
    pan_tcga_sample_info.index.isin(common_sample_list),
    ['detailed_category', 'primary disease or tissue', '_primary_site',
     '_sample_type', '_gender', '_study'],
].T
# concat aligns on the column labels (sample ids), so the two blocks do not
# need to list the samples in the same order.
tcga_pan_w_info = pd.concat([pan_tcga[common_sample_list], annotation_block], axis=0)
tcga_pan_w_info.loc['patient_id'] = [_patient_id(sample_id) for sample_id in tcga_pan_w_info.columns]

# Drop every row (gene or annotation field) that is missing for any sample.
tcga_pan_w_info = tcga_pan_w_info.dropna(axis=0, how='any')
# Free the large inputs. Note: this cell cannot be re-run afterwards without
# re-running the loading cell first.
del pan_tcga, pan_tcga_sample_info

In [None]:
#make changes to UMAP conditions here in this cell
# Builds three integrin-only frames from tcga_pan_w_info (rows = integrin genes
# plus three annotation rows, columns = samples):
#   tcga_int        - primary tumors of the cancers listed below
#   normal_tcga_int - normal tissue samples of the tissues listed below
#   met_tcga_int    - metastatic samples of the cancers listed below

#for tcga breast biomarkers, unwanted integrins not dropped bc there are no integrins
# Row names that contain "ITG" but are excluded from the analysis (judging by
# their names: pseudogene, binding-protein and antisense entries - TODO confirm).
unwanted_integrins=['ITGB1P1','ITGB3BP', 'ITGB1BP1', 'ITGB1BP2','ITGA9-AS1','ITGB2-AS1','ITGB5-AS1']

input_sample_type=["Primary Tumor"]#,'Primary Blood Derived Cancer - Peripheral Blood'] #this sampletype for blood cancer
#old cancer list
# cancers=['breast invasive carcinoma',
#              'brain lower grade glioma',
#              'lung adenocarcinoma',
#              'lung squamous cell carcinoma',
#              'prostate adenocarcinoma',
#              'liver hepatocellular carcinoma',
#              'ovarian serous cystadenocarcinoma',
#              'acute myeloid leukemia',
#              'diffuse large B-cell lymphoma']

#normal tissue list
#NOTE: did not include transverse colon because colon adenocarcinoma is uncommon in that one
#as stated in this link: https://onlinelibrary.wiley.com/doi/full/10.1002/ags3.12380#:~:text=When%20compared%20to%20the%20rest,stomach%2C%20omentum%2C%20and%20pancreas.
normal_tissue_list=['Breast - Mammary Tissue',
              'Pancreas',
              'Lung',
              'Testis',
              'Liver',
              'Prostate',
               'Colon - Sigmoid',
               'Stomach']
#present cancer list
cancers = ['Breast Invasive Carcinoma',
           'Pancreatic Adenocarcinoma',
           'Lung Squamous Cell Carcinoma',
           'Lung Adenocarcinoma',
          'Testicular Germ Cell Tumor',
           'Liver Hepatocellular Carcinoma',
           'Prostate Adenocarcinoma',
          'Colon Adenocarcinoma',
          'Stomach Adenocarcinoma']
normal_sample_type = ["Normal Tissue"]
met_sample_type = ["Metastatic"]

# Rows to keep: every gene whose name contains "ITG" plus the three annotation
# rows needed for filtering. Index.union is used explicitly because `|` between
# two Index objects is no longer a set union in pandas >= 2.0; union() returns
# the labels in sorted order, exactly as the old operator did.
integrin_rows = tcga_pan_w_info.index[tcga_pan_w_info.index.str.contains("ITG")]
annotation_rows = tcga_pan_w_info.index[
    tcga_pan_w_info.index.isin(['_sample_type','detailed_category','_primary_site'])
]
# The row selection and the removal of unwanted entries are the same for all
# three cohorts, so they are done once instead of once per cohort.
integrin_frame = tcga_pan_w_info.loc[integrin_rows.union(annotation_rows)].drop(unwanted_integrins, axis=0)

sample_category = integrin_frame.loc["detailed_category"]
sample_kind = integrin_frame.loc["_sample_type"]

# Primary tumors of the selected cancers.
tcga_int = integrin_frame.loc[:, sample_category.isin(cancers) & sample_kind.isin(input_sample_type)]

#separating out normal tissue from gtex (WARNING: code is different from tcga_pan.ipynb)
normal_tcga_int = integrin_frame.loc[
    :, sample_category.isin(normal_tissue_list) & sample_kind.isin(normal_sample_type)
]

#separating out metastasis
met_tcga_int = integrin_frame.loc[:, sample_category.isin(cancers) & sample_kind.isin(met_sample_type)]


In [6]:
# Show the samples (columns) annotated as breast primary site, GTEX study and
# female gender; each annotation row is compared against the wanted value.
from_breast = tcga_pan_w_info.loc['_primary_site'] == 'Breast'
from_gtex = tcga_pan_w_info.loc['_study'] == 'GTEX'
from_female = tcga_pan_w_info.loc['_gender'] == 'Female'
tcga_pan_w_info.loc[:, from_breast & from_gtex & from_female]


Unnamed: 0,GTEX-X4EP-2926-SM-3P5YQ,GTEX-11P81-1926-SM-5BC53,GTEX-13FTY-2226-SM-5J1ND,GTEX-S32W-2026-SM-4AD6E,GTEX-13O3O-0826-SM-5K7WE,GTEX-13QIC-2326-SM-5LU5N,GTEX-VJWN-0726-SM-3GIJ8,GTEX-131YS-0626-SM-5EGKL,GTEX-T6MO-0326-SM-32QOK,GTEX-11ILO-2226-SM-5A5L1,...,GTEX-XMD2-0926-SM-4WWEF,GTEX-X15G-1626-SM-3NMB3,GTEX-13N11-1726-SM-5J1OJ,GTEX-ZQG8-0726-SM-5P9H9,GTEX-YB5K-1626-SM-5IFIN,GTEX-R53T-1526-SM-48FEK,GTEX-ZAJG-0626-SM-5HL8X,GTEX-ZXES-0826-SM-5E43C,GTEX-TML8-1226-SM-32QON,GTEX-1117F-2826-SM-5GZXL
RP11-368I23.2,-3.1714,-0.6643,-2.5479,-1.2481,-9.9658,-9.9658,-9.9658,-1.5951,-1.4699,-3.6259,...,-4.035,-1.8314,-9.9658,-2.114,-1.1488,1.2636,-9.9658,-9.9658,-0.5543,-3.6259
RP11-167B3.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
RP11-742D12.2,-6.5064,-6.5064,-6.5064,-5.5735,-9.9658,-9.9658,-3.816,-9.9658,-2.4659,-6.5064,...,-3.816,-9.9658,-3.458,-3.1714,-9.9658,-5.5735,-9.9658,-6.5064,-9.9658,-9.9658
RAB4B,5.1211,4.49,5.5811,5.3893,4.7247,5.3245,5.0326,4.9308,4.6445,5.0968,...,4.9524,5.31,5.6918,4.9709,4.796,4.8719,5.1514,4.7976,5.0197,4.9402
AC104071.1,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,...,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658,-9.9658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
_primary_site,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast,...,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast,Breast
_sample_type,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,...,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue,Normal Tissue
_gender,Female,Female,Female,Female,Female,Female,Female,Female,Female,Female,...,Female,Female,Female,Female,Female,Female,Female,Female,Female,Female
_study,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,...,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX,GTEX


In [7]:
# Row labels of tcga_int that contain "ITG", i.e. the gene rows without the
# annotation rows; displayed as the cell output.
has_itg = tcga_int.index.str.contains('ITG')
itg_list = tcga_int.index[has_itg]
itg_list

Index(['ITGA1', 'ITGA10', 'ITGA11', 'ITGA2', 'ITGA2B', 'ITGA3', 'ITGA4',
       'ITGA5', 'ITGA6', 'ITGA7', 'ITGA8', 'ITGA9', 'ITGAD', 'ITGAE', 'ITGAL',
       'ITGAM', 'ITGAV', 'ITGAX', 'ITGB1', 'ITGB2', 'ITGB3', 'ITGB4', 'ITGB5',
       'ITGB6', 'ITGB7', 'ITGB8', 'ITGBL1'],
      dtype='object')

# initialization complete