In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('CDKN2A main data.csv')
data

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,GENE_CDS_LENGTH,HGNC_ID,SAMPLE_NAME,ID_SAMPLE,ID_TUMOUR,PRIMARY_SITE,SITE_SUBTYPE_1,SITE_SUBTYPE_2,...,RESISTANCE_MUTATION,MUTATION_SOMATIC_STATUS,PUBMED_PMID,ID_STUDY,SAMPLE_TYPE,TUMOUR_ORIGIN,AGE,HGVSP,HGVSC,HGVSG
0,CDKN2A,ENST00000498124.1,504,1787,2692607,2692607,2551579,thyroid,NS,NS,...,-,Variant of unknown origin,28351340.0,,surgery-fixed,NS,,ENSP00000418915.1:p.Pro70GlnfsTer49,ENST00000498124.1:c.209_210del,9:g.21971150_21971151del
1,CDKN2A,ENST00000498124.1,504,1787,902550,902550,820104,pleura,NS,NS,...,-,Variant of unknown origin,12117769.0,,surgery fresh/frozen,primary,62.0,ENSP00000418915.1:p.His83Gln,ENST00000498124.1:c.249C>A,9:g.21971110G>T
2,CDKN2A,ENST00000498124.1,504,1787,S87946,943503,860138,cervix,NS,NS,...,-,Variant of unknown origin,10408854.0,,surgery fresh/frozen,NS,,ENSP00000418915.1:p.Gly150=,ENST00000498124.1:c.450T>C,9:g.21970909A>G
3,CDKN2A,ENST00000498124.1,504,1787,ME024,1838446,1732547,skin,trunk,NS,...,-,Variant of unknown origin,22817889.0,,NS,primary,44.0,ENSP00000418915.1:p.Arg87GlyfsTer59,ENST00000498124.1:c.259del,9:g.21971102del
4,CDKN2A,ENST00000498124.1,504,1787,S88227,943766,860401,pancreas,NS,NS,...,-,Variant of unknown origin,11753042.0,,surgery - NOS,primary,,ENSP00000418915.1:p.Glu61AlafsTer55,ENST00000498124.1:c.177_187del,9:g.21971172_21971182del
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3711,CDKN2A,ENST00000498124.1,504,1787,P-0009960-T01-IM5,2725787,2584608,lung,NS,NS,...,-,Confirmed somatic variant,28481359.0,,surgery-fixed,primary,,ENSP00000418915.1:p.His123Asn,ENST00000498124.1:c.367C>A,9:g.21970992G>T
3712,CDKN2A,ENST00000498124.1,504,1787,TCGA-85-7699-01,2195194,2063472,lung,NS,NS,...,-,Confirmed somatic variant,,418.0,NS,NS,73.0,ENSP00000418915.1:p.Trp110Ter,ENST00000498124.1:c.329G>A,9:g.21971030C>T
3713,CDKN2A,ENST00000498124.1,504,1787,GBM18-I1,2813453,2669845,central_nervous_system,brain,NS,...,-,Confirmed somatic variant,28263318.0,,surgery fresh/frozen,NS,40.0,ENSP00000418915.1:p.Asp84Gly,ENST00000498124.1:c.251A>G,9:g.21971108T>C
3714,CDKN2A,ENST00000498124.1,504,1787,933237,933237,849894,pancreas,ampulla_of_Vater,NS,...,-,Confirmed somatic variant,15014024.0,,surgery-fixed,primary,,ENSP00000418915.1:p.Glu69Val,ENST00000498124.1:c.206_207delinsTA,9:g.21971152_21971153delinsTA


In [4]:
#filtering for mutation type missense substitution mutation

data = data[data[' MUTATION_DESCRIPTION'] == 'Substitution - Missense']
data.shape

(1386, 37)

In [5]:
#filter to remove mutation points where there is no genomic reference coordinate
#this is indicated by value 'null' in 'GRCH' column

data = data[data[' GRCH'] != 'null']
data.shape

(1386, 37)

In [6]:
#this is good, all our records has known substitution information

In [7]:
#now we remove all rows with insertion, duplication, inversion or deletion information

data = data[~data[' HGVSG'].str.contains('del|in|dup', case=False, na=False, regex=True)]
data.shape

(1351, 37)

We are done with all the filtering we need. We will now arrange the dataframe to better do our analyses. First, we build out our genomic level features.

In [8]:
columns_to_copy = ['GENE_NAME', ' ACCESSION_NUMBER', ' GENE_CDS_LENGTH', ' HGVSG', ' HGVSC', ' MUTATION_CDS']
new_df = data[columns_to_copy].copy()

In [9]:
#extract position of mutation
new_df['CDNA_POSITION'] = new_df[' MUTATION_CDS'].str.extract('(\d+)')
new_df['CDNA_POSITION'] = pd.to_numeric(new_df['CDNA_POSITION'], errors='coerce')

In [10]:
#extract mutation event, base and mutant alleles
new_df['MUTATION_EVENT'] = new_df[' MUTATION_CDS'].str[-3:]
new_df['BASE_ALLELE'] = new_df['MUTATION_EVENT'].str[0]
new_df['MUTANT_ALLELE'] = new_df['MUTATION_EVENT'].str[-1]

Now we get our proteomic level features.

In [11]:
aa_mapping = {
    'A': 'Ala',
    'R': 'Arg',
    'N': 'Asn',
    'D': 'Asp',
    'C': 'Cys',
    'Q': 'Gln',
    'E': 'Glu',
    'G': 'Gly',
    'H': 'His',
    'I': 'Ile',
    'L': 'Leu',
    'K': 'Lys',
    'M': 'Met',
    'F': 'Phe',
    'P': 'Pro',
    'S': 'Ser',
    'T': 'Thr',
    'W': 'Trp',
    'Y': 'Tyr',
    'V': 'Val'
}

In [12]:
new_df[['HGVSP', 'MUTATION_AA']] = data[[' HGVSP', ' MUTATION_AA']]

In [13]:
#extract wild-type and mutant amino acids and mutation position in 1-letter and 3-letter format
new_df['WT_AA_1'] = new_df['MUTATION_AA'].str[2]
new_df['WT_AA_3'] = new_df['WT_AA_1'].map(aa_mapping)

new_df['CODON_POSITION'] = new_df['MUTATION_AA'].str.extract('(\d+)')
new_df['CODON_POSITION'] = pd.to_numeric(new_df['CODON_POSITION'], errors='coerce')

new_df['MT_AA_1'] = new_df['MUTATION_AA'].str[-1]
new_df['MT_AA_3'] = new_df['MT_AA_1'].map(aa_mapping)

Add all your site and histology specific features now.

In [14]:
 new_df[['MUTATION_SOMATIC_STATUS', 'PRIMARY_SITE', 'PRIMARY_HISTOLOGY', 'SAMPLE_TYPE', 'TUMOUR_ORIGIN', 'GENOMIC_MUTATION_ID', 'PUBMED_PMID']] = data[[' MUTATION_SOMATIC_STATUS', ' PRIMARY_SITE', ' PRIMARY_HISTOLOGY', ' SAMPLE_TYPE', ' TUMOUR_ORIGIN', ' GENOMIC_MUTATION_ID', ' PUBMED_PMID']]

In [15]:
new_df['PRIMARY_SITE'] = new_df['PRIMARY_SITE'].str.replace('_', ' ')
new_df['PRIMARY_HISTOLOGY'] = new_df['PRIMARY_HISTOLOGY'].str.replace('_', ' ')

In [16]:
new_df

Unnamed: 0,GENE_NAME,ACCESSION_NUMBER,GENE_CDS_LENGTH,HGVSG,HGVSC,MUTATION_CDS,CDNA_POSITION,MUTATION_EVENT,BASE_ALLELE,MUTANT_ALLELE,...,CODON_POSITION,MT_AA_1,MT_AA_3,MUTATION_SOMATIC_STATUS,PRIMARY_SITE,PRIMARY_HISTOLOGY,SAMPLE_TYPE,TUMOUR_ORIGIN,GENOMIC_MUTATION_ID,PUBMED_PMID
1,CDKN2A,ENST00000498124.1,504,9:g.21971110G>T,ENST00000498124.1:c.249C>A,c.249C>A,249,C>A,C,A,...,83,Q,Gln,Variant of unknown origin,pleura,mesothelioma,surgery fresh/frozen,primary,COSV58684711,12117769.0
16,CDKN2A,ENST00000498124.1,504,9:g.21971171A>G,ENST00000498124.1:c.188T>C,c.188T>C,188,T>C,T,C,...,63,P,Pro,Variant of unknown origin,skin,malignant melanoma,surgery - NOS,metastasis,COSV58716983,20703244.0
17,CDKN2A,ENST00000498124.1,504,9:g.21971135G>A,ENST00000498124.1:c.224C>T,c.224C>T,224,C>T,C,T,...,75,L,Leu,Variant of unknown origin,lung,carcinoma,surgery-fixed,metastasis,COSV58688496,26164066.0
18,CDKN2A,ENST00000498124.1,504,9:g.21974742C>T,ENST00000498124.1:c.86G>A,c.86G>A,86,G>A,G,A,...,29,Q,Gln,Variant of unknown origin,upper aerodigestive tract,carcinoma,surgery-fixed,NS,COSV58704956,19690981.0
20,CDKN2A,ENST00000498124.1,504,9:g.21971203C>G,ENST00000498124.1:c.156G>C,c.156G>C,156,G>C,G,C,...,52,I,Ile,Variant of unknown origin,breast,carcinoma,cell-line,NS,COSV58687020,19593635.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3700,CDKN2A,ENST00000498124.1,504,9:g.21971117G>A,ENST00000498124.1:c.242C>T,c.242C>T,242,C>T,C,T,...,81,L,Leu,Confirmed somatic variant,skin,malignant melanoma,surgery-fixed,metastasis,COSV58683884,28481359.0
3706,CDKN2A,ENST00000498124.1,504,9:g.21971109C>T,ENST00000498124.1:c.250G>A,c.250G>A,250,G>A,G,A,...,84,N,Asn,Confirmed somatic variant,central nervous system,primitive neuroectodermal tumour-medulloblastoma,NS,NS,COSV58683289,
3711,CDKN2A,ENST00000498124.1,504,9:g.21970992G>T,ENST00000498124.1:c.367C>A,c.367C>A,367,C>A,C,A,...,123,N,Asn,Confirmed somatic variant,lung,carcinoma,surgery-fixed,primary,COSV58709909,28481359.0
3713,CDKN2A,ENST00000498124.1,504,9:g.21971108T>C,ENST00000498124.1:c.251A>G,c.251A>G,251,A>G,A,G,...,84,G,Gly,Confirmed somatic variant,central nervous system,glioma,surgery fresh/frozen,NS,COSV58688961,28263318.0


In [18]:
# distinct_primary_histology_values = new_df['PRIMARY_HISTOLOGY'].unique()

# print("Distinct Values in PRIMARY_HISTOLOGY column:")
# print()
# for value in distinct_primary_histology_values:
#     print(value)
    
    
distinct_primary_histology_values = new_df['PRIMARY_HISTOLOGY'].unique()

print("Counts of each distinct value in PRIMARY_HISTOLOGY column:")
print(new_df['PRIMARY_HISTOLOGY'].value_counts())

Counts of each distinct value in PRIMARY_HISTOLOGY column:
carcinoma                                              982
malignant melanoma                                     163
other                                                   39
lymphoid neoplasm                                       37
glioma                                                  35
haematopoietic neoplasm                                 24
adnexal tumour                                          20
meningioma                                               7
gastrointestinal stromal tumour                          3
germ cell tumour                                         3
carcinoid-endocrine tumour                               3
benign melanocytic nevus                                 3
primitive neuroectodermal tumour-medulloblastoma         2
mesothelioma                                             2
pancreatic intraepithelial neoplasia (PanIN)             2
sarcoma                                                 

In [19]:
cancer_type_mappings = {
    'mesothelioma': 'Other',
    'malignant melanoma': 'Skin Cancer',
    'carcinoma': 'Carcinoma',
    'lymphoid neoplasm': 'Lymphoma',
    'haematopoietic neoplasm': 'Leukemia',
    'adnexal tumour': 'Skin Cancer',
    'glioma': 'Brain Tumor',
    'gastrointestinal stromal tumour': 'Other',
    'fibrosarcoma': 'Sarcoma',
    'rhabdomyosarcoma': 'Sarcoma',
    'other': 'Other',
    'carcinoid-endocrine tumour': 'Other',
    'pheochromocytoma': 'Other',
    'meningioma': 'Brain Tumor',
    'malignant melanoma of soft parts-clear cell sarcoma': 'Sarcoma',
    'Wilms tumour': 'Other',
    'calcifying epithelial odontogenic tumour': 'Other',
    'low malignant potential (borderline) tumour': 'Other',
    'thymic carcinoma': 'Other',
    'sarcoma': 'Sarcoma',
    'primitive neuroectodermal tumour-medulloblastoma': 'Brain Tumor',
    'germ cell tumour': 'Other',
    'adenoma': 'Other',
    'benign melanocytic nevus': 'Skin Cancer',
    'fibroepithelial neoplasm': 'Other',
    'pancreatic intraepithelial neoplasia (PanIN)': 'Other',
    'in situ neoplasm': 'Other',
    'neuroblastoma': 'Brain Tumor',
    'leiomyosarcoma': 'Sarcoma',
    'NS': 'Other',
    'paraganglioma': 'Other',
    'malignant fibrous histiocytoma-pleomorphic sarcoma': 'Sarcoma',
    'chondrosarcoma': 'Other',
    'in situ epithelial neoplasm': 'Carcinoma',
    'adrenal cortical carcinoma': 'Other'
}

In [20]:
new_df['CANCER_TYPE'] = 'Other'  
new_df['CANCER_TYPE'] = new_df['PRIMARY_HISTOLOGY'].map(cancer_type_mappings)
print(new_df[['PRIMARY_HISTOLOGY', 'CANCER_TYPE']])

                                     PRIMARY_HISTOLOGY  CANCER_TYPE
1                                         mesothelioma        Other
16                                  malignant melanoma  Skin Cancer
17                                           carcinoma    Carcinoma
18                                           carcinoma    Carcinoma
20                                           carcinoma    Carcinoma
...                                                ...          ...
3700                                malignant melanoma  Skin Cancer
3706  primitive neuroectodermal tumour-medulloblastoma  Brain Tumor
3711                                         carcinoma    Carcinoma
3713                                            glioma  Brain Tumor
3715                                         carcinoma    Carcinoma

[1351 rows x 2 columns]


In [24]:
distinct_cancers = new_df['CANCER_TYPE'].unique()

print("Distinct Values in CANCER_TYPE column:")
print(new_df['CANCER_TYPE'].value_counts())

Distinct Values in CANCER_TYPE column:
Carcinoma      983
Skin Cancer    186
Other           66
Brain Tumor     46
Lymphoma        37
Leukemia        24
Sarcoma          9
Name: CANCER_TYPE, dtype: int64


In [26]:
new_df.to_csv('filtered_df.csv', index=False)

Now we have our filtered file which we can use for analysis.