In [140]:
from __future__ import print_function
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 
from scipy.stats import zscore
import seaborn as sns
import sys,os
import gzip
import ftplib
import re

In [141]:
def download_GEO_matrix(fname,ftppath,destination=os.getcwd(),ftp='ftp.ncbi.nlm.nih.gov'):
    ftp = ftplib.FTP(ftp)   
    ftp.login() 
    ftp.cwd(ftppath)
    #ftp.retrlines('LIST')
    file_handle = open(destination+"/"+fname, 'wb')
    ftp.retrbinary('RETR '+fname, file_handle.write)
    file_handle.close()
    ftp.quit()
    return destination+"/"+fname

def read_matrix(fname,index = "GSM"):
    df = {}
    with gzip.open(fname) as infile:
        for line in infile.readlines():
            if line.startswith("!"):
                line = line.rstrip().replace('"','').split("\t")
                line =  map(lambda x : x.rstrip().lstrip(),line)
                #print(line)
                if line[0] == '!Sample_title':
                    df["title"] = line[1:]
                if line[0] == '!Sample_geo_accession':
                    df["GSM"] = line[1:]
                if line[0] == '!Sample_source_name_ch1':
                    df["source"] = line[1:]
                if line[0] == '!Sample_characteristics_ch1':
                    if ":" in line[1]:
                        sep = ": "
                    elif "=" in line[1]:
                        sep = "= "
                    else:
                        pass
                    field = line[1].split(sep)[0]
                    df[field] = map(lambda x : x.replace(field+sep,"").rstrip().lstrip(),line[1:])
    df=pd.DataFrame.from_dict(df)
    df.set_index(index,drop=True,inplace=True)
    return df

## GSE28796 - Docetaxel

In [142]:
tmp_dir = "/home/olya/SFU/Hossein/NEW_Project/raw_data/downloads/"
fpath = download_GEO_matrix("GSE28796_series_matrix.txt.gz",'/geo/series/GSE28nnn/GSE28796/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df = df.loc[:,["pathological complete response","replicate","sample"]]
df.columns = ["pathological complete response","replicate","patient"]
df["drug"] = "Docetaxel"
df.index.name = "sample_name"
df["response"] = "R"
df.loc[df["pathological complete response"] == "yes","response"] = "S"
df = df.loc[:,["drug","response","patient","replicate","pathological complete response"]]
df.to_csv("../raw_data/response/GSE28796_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,patient,replicate,pathological complete response
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSM713198,Docetaxel,R,1,1,no
GSM713199,Docetaxel,R,1,2,no
GSM713200,Docetaxel,R,2,1,no
GSM713201,Docetaxel,R,2,2,no
GSM713202,Docetaxel,R,3,1,no
GSM713203,Docetaxel,R,3,2,no
GSM713204,Docetaxel,S,4,1,yes
GSM713205,Docetaxel,S,4,2,yes
GSM713206,Docetaxel,R,5,1,no
GSM713207,Docetaxel,R,5,2,no


## Paclitaxel 
 * GSE15622 (Paclitaxel pre-treatment)
 * GSE22513

In [143]:
fpath = download_GEO_matrix("GSE15622_series_matrix.txt.gz",'/geo/series/GSE15nnn/GSE15622/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df= df.loc[df["treatment"] == "Paclitaxel",:]
df["drug"] = "Paclitaxel"
df.index.name = "sample_name"
df.shape
df = df.loc[df["title"].str.contains("pre-treatment"), :]
df["patient"] = df["title"].apply(lambda x : x.split(",")[0])
df.loc[df["clinical group"]=="pre_pacli_resist","response"] = "R"
df.loc[df["clinical group"]=="pre_pacli_sens","response"] = "S"
df = df.loc[:,["drug","patient","response","clinical group","ca-125 coefficient"]]
df.to_csv("../raw_data/response/GSE15622_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,patient,response,clinical group,ca-125 coefficient
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSM391038,Paclitaxel,Patient 12,R,pre_pacli_resist,-0.042112197
GSM391039,Paclitaxel,Patient 14,R,pre_pacli_resist,0.200103469
GSM391040,Paclitaxel,Patient 15,S,pre_pacli_sens,-0.986454414
GSM391041,Paclitaxel,Patient 16,S,pre_pacli_sens,-1.497938659
GSM391042,Paclitaxel,Patient 19,R,pre_pacli_resist,-0.287581511
GSM391043,Paclitaxel,Patient 24,S,pre_pacli_sens,-0.533008941
GSM391044,Paclitaxel,Patient 27,R,pre_pacli_resist,-0.478574974
GSM391045,Paclitaxel,Patient 28,R,pre_pacli_resist,-0.286193553
GSM391046,Paclitaxel,Patient 2,S,pre_pacli_sens,-0.819586317
GSM391047,Paclitaxel,Patient 33,R,pre_pacli_resist,0.267918317


In [144]:
fpath = download_GEO_matrix("GSE22513_series_matrix.txt.gz",'/geo/series/GSE22nnn/GSE22513/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df["drug"] = "Paclitaxel"
df["title"] = df["title"].apply(lambda x : x.replace("non-pCR breast biopsy, ",""))
df["title"] = df["title"].apply(lambda x : x.replace("pCR breast biopsy, ",""))
df["patient"] = df["title"].apply(lambda x : x.split(" ")[1])
df["replicate"] = df["title"].apply(lambda x : x.split(" ")[3])
df.index.name = "sample_name"
df["response"] = "R"
df.loc[df["pathological complete response"] == "yes","response"] = "S"
df = df.loc[:,["drug", "response","patient","replicate","pathological complete response"]]
df.to_csv("../raw_data/response/GSE22513_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,patient,replicate,pathological complete response
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
GSM559042,Paclitaxel,R,1,1,no
GSM559043,Paclitaxel,R,1,2,no
GSM559044,Paclitaxel,R,2,1,no
GSM559045,Paclitaxel,R,2,2,no
GSM559046,Paclitaxel,R,3,1,no
GSM559047,Paclitaxel,R,3,2,no
GSM559048,Paclitaxel,R,4,1,no
GSM559049,Paclitaxel,R,4,2,no
GSM559050,Paclitaxel,R,5,1,no
GSM559051,Paclitaxel,R,5,2,no


## Sorafenib 
* GSE109211
* GSE31428 - table requred manual correction because columns were named differently

In [145]:
fpath = download_GEO_matrix("GSE109211_series_matrix.txt.gz",'/geo/series/GSE109nnn/GSE109211/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df= df.loc[df["treatment"] == "Sor",:]
df["drug"] = "Sorafenib"
df = df.sort_values("title")
df["response"] = ""
df.loc[df["outcome"]=="non-responder","response"] = "R"
df.loc[df["outcome"]=="responder","response"] = "S"
df["sample"] = df["title"].apply(lambda x : x.replace(" FFPE sample",""))
df = df.loc[:,["drug","response","sample"]]
df.to_csv("../raw_data/response/GSE109211_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,sample
GSM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GSM2935356,Sorafenib,R,BS101
GSM2935413,Sorafenib,S,BS107
GSM2935411,Sorafenib,S,BS117
GSM2935320,Sorafenib,R,BS119
GSM2935397,Sorafenib,R,BS129
GSM2935333,Sorafenib,S,BS139
GSM2935414,Sorafenib,R,BS149
GSM2935407,Sorafenib,R,BS155
GSM2935361,Sorafenib,S,BS157
GSM2935416,Sorafenib,R,BS161


In [147]:
fpath = download_GEO_matrix("GSE31428_series_matrix.txt.gz",'/geo/series/GSE31nnn/GSE31428/matrix/',destination=tmp_dir)
df = read_matrix(fpath)
df.to_csv("../raw_data/response/GSE31428_incorrect_columns.tsv",sep ="\t")
df = pd.read_csv("../raw_data/response/GSE31428_corrected_columns.tsv",sep ="\t",index_col=0)
cols = ['treatment',"8-week disease control (1=yes, 0=no)","pfsc (1=progressed; 0=not progressed)",'pfsm (month)',
        'randomization_date','biopsy site (grouped)',
        u'smoking_status',u'source', u'stage_at_diagnosis','gender','race','egfr index']
df["drug"] = "Sorafenib"
df['response']= "R"
df.loc[df["8-week disease control (1=yes, 0=no)"]==1,'response']= "S" 
df=df.loc[:,["drug","response"]+cols]
df.to_csv("../raw_data/response/GSE31428_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,treatment,"8-week disease control (1=yes, 0=no)",pfsc (1=progressed; 0=not progressed),pfsm (month),randomization_date,biopsy site (grouped),smoking_status,source,stage_at_diagnosis,gender,race,egfr index
GSM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
GSM677317,Sorafenib,R,sorafenib,0,1,1.6756,2008-02-06,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Male,White,0.21
GSM677322,Sorafenib,S,sorafenib,1,1,9.1663,2008-05-29,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IIIB,Male,White,1.13
GSM677333,Sorafenib,S,sorafenib,1,1,2.7598,2009-05-13,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Male,White,0.35
GSM677335,Sorafenib,S,sorafenib,1,1,3.614,2009-05-27,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Male,White,-0.89
GSM677336,Sorafenib,S,sorafenib,1,1,3.5483,2009-06-05,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Female,White,-0.59
GSM677337,Sorafenib,R,sorafenib,0,1,0.8214,2009-10-02,8: Deep lymph nodes,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Female,White,-1.77
GSM677338,Sorafenib,R,sorafenib,0,1,1.117,2009-10-14,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Male,White,1.01
GSM780611,Sorafenib,S,sorafenib,1,1,3.6468,2007-12-12,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Female,White,0.26
GSM780612,Sorafenib,S,sorafenib,1,1,3.6468,2008-01-11,7: Shallow lymph nodes,Never,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Female,White,1.17
GSM780613,Sorafenib,R,sorafenib,0,1,1.807,2008-01-31,1: Lung,Former,BATTLE_trial_core biopsy_chemorefractory non-s...,IV,Female,White,0.87


## Bortezomib
* GSE55145

In [148]:
fpath = download_GEO_matrix("GSE55145_series_matrix.txt.gz",'/geo/series/GSE55nnn/GSE55145/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df["drug"] = "Bortezomib"
df["response"] = "S" # S = 39 (8 RCEP + 6 CR + 10 RCIF +15 TBRP)
# R = 28 (12 Prog + 6 RMin + 10 stable)
df.loc[df["post-induction (bortezomib) treatment response"] == "Prog (Progressive disease, no treatment response)","response"] = "R"
df.loc[df["post-induction (bortezomib) treatment response"] == "RMin (Minimal treatment response)","response"] = "R"
df.loc[df["post-induction (bortezomib) treatment response"] == "stable (stable disease, no treatment response)","response"] = "R"
df = df.loc[:,["drug","response",'microarray_batch', u'post-induction (bortezomib) treatment response',
       u'source', u'title']] 
df.to_csv("../raw_data/response/GSE55145_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,microarray_batch,post-induction (bortezomib) treatment response,source,title
GSM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
GSM1336382,Bortezomib,R,1,"stable (stable disease, no treatment response)",CD-138 purified plasma cells from a newly diag...,ifm67_01_VELCADE MM_6199_3_Stable_1
GSM1336383,Bortezomib,R,1,"stable (stable disease, no treatment response)",CD-138 purified plasma cells from a newly diag...,ifm67_02_VELCADE MM_6159_3_Stable_1
GSM1336384,Bortezomib,S,1,RCEP (nCR or near Complete Response),CD-138 purified plasma cells from a newly diag...,ifm67_03_VELCADE MM_6145_3_RCEP_1
GSM1336385,Bortezomib,R,1,"Prog (Progressive disease, no treatment response)",CD-138 purified plasma cells from a newly diag...,ifm67_04_VELCADE MM_6073_3_Prog_1
GSM1336386,Bortezomib,R,1,RMin (Minimal treatment response),CD-138 purified plasma cells from a newly diag...,ifm67_05_VELCADE MM_6041_3_RMin_1
GSM1336387,Bortezomib,S,1,RCEP (nCR or near Complete Response),CD-138 purified plasma cells from a newly diag...,ifm67_06_VELCADE MM_5994_3_RCEP_1
GSM1336388,Bortezomib,S,1,RCIF (Complete treatment response),CD-138 purified plasma cells from a newly diag...,ifm67_07_VELCADE MM_5843_3_RCIF_1
GSM1336389,Bortezomib,R,1,"Prog (Progressive disease, no treatment response)",CD-138 purified plasma cells from a newly diag...,ifm67_08_VELCADE MM_5809_3_Prog_1
GSM1336390,Bortezomib,S,1,RCIF (Complete treatment response),CD-138 purified plasma cells from a newly diag...,ifm67_09_VELCADE MM_5745_3_RCIF_1
GSM1336391,Bortezomib,R,1,"Prog (Progressive disease, no treatment response)",CD-138 purified plasma cells from a newly diag...,ifm67_10_VELCADE MM_5735_3_Prog_1


## Cisplatin 
 * GSE23554

In [150]:
fpath = download_GEO_matrix("GSE23554_series_matrix.txt.gz",'/geo/series/GSE23nnn/GSE23554/matrix/'
                    ,destination=tmp_dir)
df = read_matrix(fpath)
df["drug"] = "Cisplatin"
cols = df.columns.values
df["response"] = "R"
df.loc[df["cisplatin response (complete response or incomplete response)"] == "CR","response"] = "S"
df = df.loc[:,["drug","response"] + list(cols)]
df.to_csv("../raw_data/response/GSE23554_response.tsv",sep ="\t")
df

Unnamed: 0_level_0,drug,response,cisplatin response (complete response or incomplete response),debulking,grade,overall survival in days,source,title,vital status,drug
GSM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GSM577823,Cisplatin,R,IR,S,3,313,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D1462,Dead,Cisplatin
GSM577824,Cisplatin,R,IR,S,2,760,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D1858,Dead,Cisplatin
GSM577825,Cisplatin,R,IR,S,3,540,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2147,Dead,Cisplatin
GSM577826,Cisplatin,S,CR,S,3,3268,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2159,Alive,Cisplatin
GSM577827,Cisplatin,S,CR,O,3,5765,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2171,Alive,Cisplatin
GSM577828,Cisplatin,S,CR,S,3,3399,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2247,Dead,Cisplatin
GSM577829,Cisplatin,R,IR,O,3,1153,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2251,Dead,Cisplatin
GSM577830,Cisplatin,R,IR,S,2,184,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2287,Dead,Cisplatin
GSM577831,Cisplatin,R,IR,O,3,73,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2443,Alive,Cisplatin
GSM577832,Cisplatin,R,IR,S,3,151,advanced-stage serous epithelial OVCAs,Ovarian Cancer - Sample D2457,Dead,Cisplatin


In [151]:
df.groupby("response").size()

response
R    10
S    18
dtype: int64