## KEGG Pathways Preprocessing  

### Mapping HGNC ID to Ensembl Gene ID

#### HGNC Dataset Importation

In [1]:
import pandas as pd
import numpy as np
# Load the HGNC gene-annotation table from the working directory.
HGNC = pd.read_csv('HGNC.csv')

# Preview the first record to check the columns that were read.
HGNC.head(n=1)

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Ensembl gene ID
0,5,A1BG,alpha-1-B glycoprotein,ENSG00000121410


Renaming Columns:

In [2]:
# Rename by an explicit old -> new mapping instead of assigning a positional
# list to `HGNC.columns`: a positional list silently mislabels the data if the
# CSV's column order changes and raises if the number of columns differs.
# The Ensembl ID column is renamed to 'gene_sliced'.
HGNC_COLUMN_RENAMES = {
    'HGNC ID': 'HGNC_ID',
    'Approved symbol': 'Approved_Symbol',
    'Approved name': 'Approved_name',
    'Ensembl gene ID': 'gene_sliced',
}

print(list(HGNC.columns.values))

# rename() returns a new frame and ignores labels that are not present, so
# re-running this cell on an already-renamed frame is a harmless no-op.
HGNC = HGNC.rename(columns=HGNC_COLUMN_RENAMES)

# Fail loudly if the input did not carry the expected columns. The index name
# is accepted too, so the check still passes if one of the columns has already
# been turned into the index.
missing_columns = (
    set(HGNC_COLUMN_RENAMES.values()) - set(HGNC.columns) - {HGNC.index.name}
)
if missing_columns:
    raise KeyError(
        f"HGNC table is missing expected columns: {sorted(missing_columns)}"
    )

print(list(HGNC.columns.values))

['HGNC ID', 'Approved symbol', 'Approved name', 'Ensembl gene ID']
['HGNC_ID', 'Approved_Symbol', 'Approved_name', 'gene_sliced']


In [3]:
# Index the HGNC table by its 'gene_sliced' column.
# The original in-place set_index made this cell fail with a KeyError on a
# second run, because the column no longer exists once it has become the index.
# The guard plus a non-inplace call keeps the cell safe to re-run.
if HGNC.index.name != 'gene_sliced':
    HGNC = HGNC.set_index('gene_sliced', drop=True)

HGNC.head(1)

Unnamed: 0_level_0,HGNC_ID,Approved_Symbol,Approved_name
gene_sliced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000121410,5,A1BG,alpha-1-B glycoprotein


#### Counts Dataset Importation

In [6]:
# Load the counts matrix, using the first CSV column as the row index.
counts = pd.read_csv('cpm_renamed.csv', index_col=0)

# Preview the first row.
counts.head(n=1)

Unnamed: 0,2001-01,2001-02,2001-03,2002-01,2002-02,2002-03,2003-01,2003-02,2003-03,2004-01,...,2082-03,2083-01,2083-02,2083-03,2084-01,2084-02,2084-03,2085-01,2085-02,2085-03
ENSG00000237973.1,41.1537,32.840876,33.472636,68.599342,55.83454,85.471215,82.970549,75.094779,34.152149,29.143781,...,,,,,,,,,,


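The row labels of the counts matrix are versioned Ensembl gene IDs (e.g. `ENSG00000237973.1`), whereas the HGNC table lists unversioned IDs (e.g. `ENSG00000121410`). To make the two joinable, we strip the version suffix (everything from the first `.` onward) from each Ensembl gene ID.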

In [7]:
# Collect the row labels of the counts matrix into a plain Python list.
gene_id = [*counts.index.values]

In [8]:
# Keep only the part of each ID before the first '.'.
gene_sliced = [versioned_id.split('.')[0] for versioned_id in gene_id]

print(gene_sliced)

['ENSG00000237973', 'ENSG00000248527', 'ENSG00000187583', 'ENSG00000187608', 'ENSG00000078808', 'ENSG00000176022', 'ENSG00000160087', 'ENSG00000127054', 'ENSG00000107404', 'ENSG00000162576', 'ENSG00000175756', 'ENSG00000221978', 'ENSG00000224870', 'ENSG00000242485', 'ENSG00000160075', 'ENSG00000215014', 'ENSG00000248333', 'ENSG00000189339', 'ENSG00000215790', 'ENSG00000008130', 'ENSG00000078369', 'ENSG00000162585', 'ENSG00000157933', 'ENSG00000116151', 'ENSG00000157916', 'ENSG00000272449', 'ENSG00000157873', 'ENSG00000228037', 'ENSG00000158109', 'ENSG00000116213', 'ENSG00000235169', 'ENSG00000130764', 'ENSG00000116198', 'ENSG00000169598', 'ENSG00000069424', 'ENSG00000116251', 'ENSG00000116237', 'ENSG00000215788', 'ENSG00000162408', 'ENSG00000162413', 'ENSG00000237436', 'ENSG00000171735', 'ENSG00000049245', 'ENSG00000236266', 'ENSG00000116288', 'ENSG00000142599', 'ENSG00000074800', 'ENSG00000230679', 'ENSG00000049239', 'ENSG00000171621', 'ENSG00000171608', 'ENSG00000171603', 'ENSG000001

In [12]:
# Attach the stripped IDs as a 'gene_sliced' column and promote that column to
# the row index, replacing the previous index.
counts = (
    counts
    .assign(gene_sliced=gene_sliced)
    .set_index('gene_sliced', drop=True)
)

counts.head(1)

Unnamed: 0_level_0,2001-01,2001-02,2001-03,2002-01,2002-02,2002-03,2003-01,2003-02,2003-03,2004-01,...,2082-03,2083-01,2083-02,2083-03,2084-01,2084-02,2084-03,2085-01,2085-02,2085-03
gene_sliced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000237973,41.1537,32.840876,33.472636,68.599342,55.83454,85.471215,82.970549,75.094779,34.152149,29.143781,...,,,,,,,,,,


#### Merging Counts (Gene Expression Matrix) with HGNC Annotations

In [15]:
# Left-join the HGNC columns onto the counts matrix by 'gene_sliced'.
# how='left' keeps every row of `counts`; rows without an HGNC match get NaN
# in the HGNC columns.
genes = pd.merge(counts, HGNC, how='left', on="gene_sliced")

genes.head(n=1)

Unnamed: 0_level_0,2001-01,2001-02,2001-03,2002-01,2002-02,2002-03,2003-01,2003-02,2003-03,2004-01,...,2083-03,2084-01,2084-02,2084-03,2085-01,2085-02,2085-03,HGNC_ID,Approved_Symbol,Approved_name
gene_sliced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000237973,41.1537,32.840876,33.472636,68.599342,55.83454,85.471215,82.970549,75.094779,34.152149,29.143781,...,,,,,,,,52014.0,MTCO1P12,MT-CO1 pseudogene 12


In [42]:
# Write the merged table to a CSV file in the working directory.
genes.to_csv(path_or_buf='all_genes.csv')