Initial Pre-Processing notebook

In [61]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from collections import Counter
from pprint import pprint
from Bio import Entrez
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, Formula
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

#### S1 Raw data

In [3]:
# targets = pd.read_csv('../data/temp/targets.txt', sep='\t')
lst = ["septic patient_survivor", "septic patient_survivor", "septic patient_survivor", 
       "septic patient_survivor", "septic patient_survivor", "septic patient_survivor", 
       "septic patient_non-survivor", "septic patient_non-survivor", "septic patient_non-survivor", 
       "septic patient_non-survivor", "septic patient_non-survivor", "septic patient_non-survivor", 
       "septic patient_survivor", "septic patient_survivor", "septic patient_non-survivor", 
       "septic patient_non-survivor", "septic patient_non-survivor", "septic patient_non-survivor", 
       "septic patient_survivor", "septic patient_survivor", "healthy control", 
       "healthy control", "healthy control"]
# targets['Target'] = lst

In [5]:
Counter(lst)

Counter({'septic patient_survivor': 10,
         'septic patient_non-survivor': 10,
         'healthy control': 3})

In [12]:
targets.to_csv('targets.txt', sep='\t')

#### S2 Raw data

In [6]:
design_raw = pd.read_csv('../data/E-MTAB-5273/E-MTAB-5273.sdrf.txt', sep='\t')
adf_design = pd.read_csv('../data/E-MTAB-5273/A-MEXP-2210.adf.txt', 
                         sep='\t', usecols=['Reporter Name','Reporter Database Entry[hugo]']
                        )
adf_design['Reporter Name'] = adf_design['Reporter Name'].map(lambda x: x[len('ILMN_'):])

In [7]:
id2gene = dict(zip(adf_design['Reporter Name'], adf_design['Reporter Database Entry[hugo]']))

In [8]:
design_matrix = design_raw[['Source Name', 
                            'Characteristics[disease]', 
                            'Characteristics[clinical information]'
                          ]].copy()
design_matrix = design_matrix.rename({'Characteristics[disease]': 'disease', 
                                      'Characteristics[clinical information]': 'clinical information',
                                      'Source Name': 'Source_Name'}, 
                                     axis=1)

In [9]:
design_matrix = design_matrix[design_matrix.disease != 'faecal peritonitis']
design_matrix

Unnamed: 0,Source_Name,disease,clinical information
0,CAP0003.B.1,community-acquired pneumonia,alive at 28 day survival
1,CAP0003.B.3,community-acquired pneumonia,alive at 28 day survival
2,CAP0003.B.5,community-acquired pneumonia,alive at 28 day survival
3,CAP0004.B,community-acquired pneumonia,alive at 28 day survival
4,CAP0013.B,community-acquired pneumonia,alive at 28 day survival
...,...,...,...
135,CON0006,normal,
136,CON0007,normal,
137,CON0008,normal,
138,CON0009,normal,


These 3 ID's are not available in the expression data

In [10]:
design_matrix = design_matrix[~design_matrix.Source_Name.isin(('CAP0056.B.5', 'CAP0140.B.5', 'CAP0383'))].copy()

In [11]:
disease_mapping = {np.nan:'control', 'alive at 28 day survival':'SS', 'dead at 28 day survival':'SNS'}
design_matrix['Target'] = design_matrix['clinical information'].map(disease_mapping)
design_matrix.drop(columns=['disease', 'clinical information'], inplace=True)
design_matrix

Unnamed: 0,Source_Name,Target
0,CAP0003.B.1,SS
1,CAP0003.B.3,SS
2,CAP0003.B.5,SS
3,CAP0004.B,SS
4,CAP0013.B,SS
...,...,...
135,CON0006,control
136,CON0007,control
137,CON0008,control
138,CON0009,control


In [12]:
Counter(design_matrix.Target)

Counter({'SS': 98, 'SNS': 29, 'control': 10})

In [66]:
design_matrix.to_csv('../data/E-MTAB-5273/targets.txt', sep='\t')

In [112]:
exp_raw = pd.read_csv('../data/E-MTAB-5273/Burnham_sepsis_discovery_normalised_231.txt', sep='\t', index_col=0)
exp_proc = exp_raw[design_matrix.Source_Name.tolist()].copy()
exp_proc['genes'] = exp_proc.index.map(id2gene)

In [113]:
exp_proc.to_csv('../data/E-MTAB-5273/exp.txt', sep='\t')
exp_proc.head()

Unnamed: 0_level_0,CAP0003.B.1,CAP0003.B.3,CAP0003.B.5,CAP0004.B,CAP0013.B,CAP0015.B,CAP0017.B.1,CAP0017.B.5,CAP0020.B,CAP0022.B,...,CON0001,CON0002,CON0003,CON0004,CON0005,CON0006,CON0007,CON0008,CON0009,CON0010
ProbeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6450255,5.869515,5.874574,5.545272,6.309058,5.982874,6.068814,6.048042,5.598508,5.831552,6.190745,...,6.062548,5.664449,5.991724,6.197618,5.70066,5.7726,6.057904,5.888936,5.569821,5.705339
2570615,6.367828,5.927608,5.780757,6.218092,5.996902,6.23371,6.351661,5.686366,5.898212,5.866858,...,5.562242,6.076635,6.060375,5.69156,5.686096,5.85317,6.005532,5.970979,5.912448,5.643596
2000519,6.253729,5.957813,5.897107,6.309058,6.132696,6.041685,6.020529,5.898557,6.017853,6.042999,...,5.510514,6.158226,6.069921,5.728501,5.729353,5.905035,5.933458,5.999184,5.568004,5.838481
7050209,5.533996,5.657911,5.56966,5.783038,5.843885,6.131271,6.46214,5.581942,5.647566,5.923552,...,5.881739,5.614365,5.983666,5.893732,5.982073,6.254991,6.124156,6.019524,5.726249,4.984725
1580181,6.026217,6.213358,5.572077,5.780686,6.392451,5.920333,5.448536,5.787356,5.620937,6.190745,...,5.109064,5.844014,6.056539,5.8497,5.769816,5.756296,5.899942,5.530243,5.774312,5.275199


#### S3 Raw Data

In [112]:
# design_raw = pd.read_csv('../data/GSE65682/phenodata.txt', sep='\t')
# xml = ET.parse('../data/GSE65682/GSE65682_family.xml')
# ns = {'namespace':'http://www.ncbi.nlm.nih.gov/geo/info/MINiML'}
platform = pd.read_csv('../data/GSE65682/GPL13667-15572.txt', sep='\t', index_col=0)
exp = pd.read_csv('../data/GSE65682/exp.txt', sep='\t')
# root = xml.getroot()

  interactivity=interactivity, compiler=compiler, result=result)


In [113]:
platform['Entrez Gene'] = [gene.split(' /// ')[0] for gene in platform['Entrez Gene']]
probe2gene_mapping = dict(zip(platform.index, platform['Entrez Gene']))

In [114]:
exp['genes'] = exp['genes'].map(probe2gene_mapping)

In [116]:
exp = exp[exp.genes != '---']

In [111]:
Entrez.email = "s0vibhar@uni-bonn.de"

request = Entrez.epost("gene",id=','.join(exp.genes.tolist()))
result = Entrez.read(request)

webEnv = result["WebEnv"]
queryKey = result["QueryKey"]
data = Entrez.esummary(db="gene", webenv=webEnv, query_key =
        queryKey)
annotations = Entrez.read(data)
genes = [annotations['DocumentSummarySet']['DocumentSummary'][idx]['NomenclatureSymbol'] for idx in range(len(lst))]

RuntimeError: IDs contain invalid characters which was treated as delimiters.

In [117]:
Counter(exp.genes.tolist())['---']

0

In [119]:
exp.to_csv('../data/GSE65682/exp2.txt', sep='\t', index=False)

In [6]:
tags = ['pneumonia diagnoses', 'mortality_event_28days', 'time_to_event_28days']

for i in tags:
    temp_lst = []
    for char in root.findall(f"./namespace:Sample/namespace:Channel/namespace:Characteristics[@tag='{i}']", ns):
        temp_var = char.text.replace('\n','').strip()
        temp_lst.append(np.nan if temp_var == 'NA' else temp_var)
    design_raw[f'{i}'] = temp_lst

NameError: name 'root' is not defined

In [80]:
Counter(design_raw['pneumonia diagnoses'].tolist())

Counter({'no-cap': 33, 'cap': 108, nan: 577, 'hap': 84})

In [82]:
design_matrix = design_raw[design_raw['pneumonia diagnoses'].isin(('cap', 'no-cap', 'hap'))].copy()

In [83]:
from collections import Counter
Counter(design_matrix['pneumonia diagnoses'].tolist())

Counter({'no-cap': 33, 'cap': 108, 'hap': 84})

In [84]:
survivor_mapping = {np.nan:np.nan, '0':'SS', '1':'SNS'}
control_mapping = {'no-cap':'control'}
design_matrix['Target'] = design_matrix['mortality_event_28days'].map(survivor_mapping)
design_matrix['Control'] = design_matrix['pneumonia diagnoses'].map(control_mapping)

In [89]:
final_design = pd.DataFrame(data = design_matrix.FileName, columns=['FileName'])

In [90]:
final_design['Target'] = pd.concat([design_matrix['Target'].dropna(), design_matrix['Control'].dropna()]).reindex_like(design_matrix).tolist()

In [104]:
final_design = final_design[~final_design.Target.isna()]

In [105]:
final_design.reset_index(inplace=True, drop=True)

In [109]:
final_design.to_csv('../data/GSE65682/targets.txt', sep='\t')