# Breast cancer relapse dataset

Author: Hank Feild  
Date: 2025-01-09

...

In [1]:
import pandas as pd

## Read in gene data

In [4]:
genes = pd.read_table('../data/breast-cancer-relapse-geo/GSE2034_genes.tsv.gz', index_col=False)
genes.head()

Unnamed: 0,ID_REF,GSM36777,GSM36778,GSM36779,GSM36780,GSM36781,GSM36782,GSM36783,GSM36784,GSM36785,...,GSM37053,GSM37054,GSM37055,GSM37056,GSM37057,GSM37058,GSM37059,GSM37060,GSM37061,GSM37062
0,1007_s_at,3848.1,6520.9,5285.7,4043.7,4263.6,2949.8,5498.9,3863.1,3370.4,...,4058.2,4017.6,2841.0,2914.2,3681.0,3066.9,2773.0,2984.3,3540.0,2620.0
1,1053_at,228.9,112.5,178.4,398.7,417.7,221.2,280.4,198.2,304.7,...,183.4,356.1,234.6,169.4,94.5,265.5,209.8,160.0,285.7,180.5
2,117_at,213.1,189.8,269.7,312.4,327.1,225.0,243.5,244.4,348.5,...,326.6,234.9,369.6,149.5,236.4,347.9,226.7,252.9,135.1,191.8
3,121_at,1009.4,2083.3,1203.4,1104.4,1043.3,1117.6,1085.4,1423.1,1196.4,...,1041.3,1195.6,751.5,1117.8,1022.4,1127.4,1071.8,1178.5,1256.7,1284.6
4,1255_g_at,31.8,145.8,42.5,108.2,69.2,47.4,84.3,102.0,22.8,...,143.5,32.7,62.6,43.0,100.5,47.0,45.1,146.3,75.9,87.4


Transpose the table so that rows are patients and columns are genes. The patient id is stored in a column called `geoId`.

In [10]:
genesT = genes.set_index('ID_REF').transpose()

genesT.reset_index(inplace=True)
genesT.rename(columns={'index': 'geoId'}, inplace=True)
genesT.head()

ID_REF,geoId,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-Hs18SrRNA-3_s_at,AFFX-r2-Hs18SrRNA-5_at,AFFX-r2-Hs18SrRNA-M_x_at,AFFX-r2-Hs28SrRNA-3_at,AFFX-r2-Hs28SrRNA-5_at,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at
0,GSM36777,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,6289.8,5865.6,1192.4,1093.1,737.5,1920.8,619.9,661.5,33168.9,25644.4
1,GSM36778,6520.9,112.5,189.8,2083.3,145.8,802.8,278.4,28.3,449.0,...,11238.1,9912.1,1010.5,467.9,822.2,725.2,511.3,716.1,54401.4,40720.0
2,GSM36779,5285.7,178.4,269.7,1203.4,42.5,557.5,183.3,56.4,101.9,...,12469.8,13108.1,3567.7,3118.9,2724.6,4172.0,1618.4,1189.3,61244.1,50878.7
3,GSM36780,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,...,9050.9,9052.7,1582.9,664.8,757.8,1881.9,780.5,801.1,62292.1,46870.8
4,GSM36781,4263.6,417.7,327.1,1043.3,69.2,653.2,185.8,21.8,3629.3,...,8562.3,9722.1,420.1,27.2,114.3,435.8,276.0,191.1,57295.1,40847.1


In [40]:
from sklearn.preprocessing import MinMaxScaler

genesTScaled = genesT.copy()
genesTNumeric = genesT.drop(columns=['geoId'])
numericColumns = genesTNumeric.columns

scaler = MinMaxScaler()
genesTScaled[numericColumns] = scaler.fit_transform(genesTNumeric)
genesTScaled.head()

ID_REF,geoId,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-r2-Ec-bioD-3_at,AFFX-r2-Ec-bioD-5_at,AFFX-r2-Hs18SrRNA-3_s_at,AFFX-r2-Hs18SrRNA-5_at,AFFX-r2-Hs18SrRNA-M_x_at,AFFX-r2-Hs28SrRNA-3_at,AFFX-r2-Hs28SrRNA-5_at,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at
0,GSM36777,0.392454,0.258227,0.045583,0.218682,0.135484,0.15757,0.283205,0.024671,0.029581,...,0.079776,0.080168,0.143762,0.345904,0.230652,0.366146,0.274844,0.329105,0.135056,0.113171
1,GSM36778,0.71505,0.118942,0.037561,0.789936,0.748387,0.325855,0.527911,0.092105,0.043384,...,0.261647,0.203904,0.119804,0.144038,0.257473,0.129248,0.224366,0.359749,0.296577,0.261227
2,GSM36779,0.565966,0.197798,0.065069,0.321879,0.193011,0.161588,0.299086,0.207648,0.00909,...,0.306918,0.301633,0.456615,1.0,0.85988,0.812202,0.738961,0.625323,0.348631,0.360995
3,GSM36780,0.416062,0.46141,0.07977,0.269216,0.546237,0.168955,0.309673,0.148849,0.087855,...,0.181258,0.177624,0.195195,0.207614,0.23708,0.358438,0.349493,0.407453,0.356604,0.321634
4,GSM36781,0.442603,0.484145,0.084831,0.236715,0.336559,0.225675,0.305101,0.065378,0.357606,...,0.1633,0.198094,0.042042,0.001744,0.033312,0.071906,0.114995,0.065103,0.31859,0.262476


## Read in patient data

In [11]:
patientData = pd.read_table('../data/breast-cancer-relapse-geo/GSE2034_patient_data.tsv')
patientData.head()

Unnamed: 0,PID,GEO asscession number,lymph node status,time to relapse or last follow-up (months),relapse (1=True),ER Status,"Brain relapses (1=yes, 0=no)"
0,3,GSM36793,negative,101,0,ER-,0
1,5,GSM36796,negative,118,0,ER+,0
2,6,GSM36797,negative,9,1,ER-,0
3,7,GSM36798,negative,106,0,ER-,0
4,8,GSM36800,negative,37,1,ER-,0


Rename the columns so they're shorter and easier to refer to

**Key:**
  * pid -- "PID"
  * geoId -- "GEO asscession number"
  * lymphNodeStatus -- "lymph node status"
  * relapseMonths -- "time to relapse or last follow-up (months)"
  * relapsed -- "relapse (1=True)"
  * erStatus -- "ER Status"
  * brainRelapsed -- "Brain relapses (1=yes, 0=no)"

In [18]:
patientData.rename(columns={oldName: newName for oldName, newName in zip(patientData.columns, ['pid', 'geoId', 'lymphNodeStatus', 'relapseMonths', 'relapsed', 'erStatus', 'brainRelapsed'])}, inplace=True)

In [21]:
patientData.head()

Unnamed: 0,pid,geoId,lymphNodeStatus,relapseMonths,relapsed,erStatus,brainRelapsed
0,3,GSM36793,negative,101,0,ER-,0
1,5,GSM36796,negative,118,0,ER+,0
2,6,GSM36797,negative,9,1,ER-,0
3,7,GSM36798,negative,106,0,ER-,0
4,8,GSM36800,negative,37,1,ER-,0


In [27]:
patientData = pd.get_dummies(patientData, columns=['lymphNodeStatus', 'erStatus'])

In [28]:
patientData.head()

Unnamed: 0,pid,geoId,relapseMonths,relapsed,brainRelapsed,lymphNodeStatus_negative,erStatus_ER+,erStatus_ER-
0,3,GSM36793,101,0,0,1,0,1
1,5,GSM36796,118,0,0,1,1,0
2,6,GSM36797,9,1,0,1,0,1
3,7,GSM36798,106,0,0,1,0,1
4,8,GSM36800,37,1,0,1,0,1


Create a normalized version

In [39]:
from sklearn.preprocessing import MinMaxScaler

patientDataScaled = patientData.copy()
patientDataNumeric = patientData.drop(columns=['pid','geoId'])
patientDataNumeric.head()
numericColumns = patientDataNumeric.columns

scaler = MinMaxScaler()
patientDataScaled[numericColumns] = scaler.fit_transform(patientDataNumeric)
patientDataScaled.head()

Unnamed: 0,pid,geoId,relapseMonths,relapsed,brainRelapsed,lymphNodeStatus_negative,erStatus_ER+,erStatus_ER-
0,3,GSM36793,0.585799,0.0,0.0,0.0,0.0,1.0
1,5,GSM36796,0.686391,0.0,0.0,0.0,1.0,0.0
2,6,GSM36797,0.04142,1.0,0.0,0.0,0.0,1.0
3,7,GSM36798,0.615385,0.0,0.0,0.0,0.0,1.0
4,8,GSM36800,0.207101,1.0,0.0,0.0,0.0,1.0


## Combine patient data with gene info

In [30]:
genesAndPatientData = genesT.merge(patientData, on='geoId')

In [31]:
genesAndPatientData.head()

Unnamed: 0,geoId,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,...,AFFX-r2-Hs28SrRNA-M_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,pid,relapseMonths,relapsed,brainRelapsed,lymphNodeStatus_negative,erStatus_ER+,erStatus_ER-
0,GSM36777,3848.1,228.9,213.1,1009.4,31.8,551.5,176.7,11.9,309.3,...,661.5,33168.9,25644.4,277,79,0,0,1,1,0
1,GSM36778,6520.9,112.5,189.8,2083.3,145.8,802.8,278.4,28.3,449.0,...,716.1,54401.4,40720.0,278,50,1,0,1,1,0
2,GSM36779,5285.7,178.4,269.7,1203.4,42.5,557.5,183.3,56.4,101.9,...,1189.3,61244.1,50878.7,798,132,0,0,1,1,0
3,GSM36780,4043.7,398.7,312.4,1104.4,108.2,568.5,187.7,42.1,899.1,...,801.1,62292.1,46870.8,846,84,0,0,1,0,1
4,GSM36781,4263.6,417.7,327.1,1043.3,69.2,653.2,185.8,21.8,3629.3,...,191.1,57295.1,40847.1,765,147,0,0,1,1,0
