In [113]:
%matplotlib inline
import pandas as pd
import json
import gzip
import os
import numpy as np
import sys
from pandas.io.json import json_normalize
import seaborn as sns

# Clinical

## Importing the clinical data

In [114]:
clinical = pd.read_json('clinical.json')
clinical.head()

Unnamed: 0,diagnoses,case_id,demographic,exposures
0,"[{'year_of_diagnosis': 2010, 'classification_o...",dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,{'updated_datetime': '2019-04-28T14:06:27.1878...,"[{'cigarettes_per_day': None, 'weight': None, ..."
1,"[{'year_of_diagnosis': 1995, 'classification_o...",3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,{'updated_datetime': '2019-04-28T13:48:46.4035...,"[{'cigarettes_per_day': None, 'weight': None, ..."
2,"[{'year_of_diagnosis': 1994, 'classification_o...",451e1a67-47e6-4738-99d7-fb7771ef61a3,{'updated_datetime': '2019-04-28T13:49:33.3043...,"[{'cigarettes_per_day': None, 'weight': None, ..."
3,"[{'year_of_diagnosis': 1994, 'classification_o...",178b2c48-c07d-422e-ae17-8bcfd996ad51,{'updated_datetime': '2019-04-28T13:44:29.1433...,"[{'cigarettes_per_day': None, 'weight': None, ..."
4,"[{'year_of_diagnosis': 2011, 'classification_o...",dddd8e2f-e540-418a-b02e-698d18a12c14,{'updated_datetime': '2019-04-28T13:41:37.6619...,"[{'cigarettes_per_day': None, 'weight': None, ..."


In [115]:
clinical.shape

(1036, 4)

## Extracting the nested dicts

In [116]:
def get_nested(e, key):
    """Return the value stored under `key` in the first record of a nested cell.

    Parameters
    ----------
    e : dict or sequence of dicts
        Either a single record (a dict) or a sequence of records, of which
        only the first one is read.
    key : hashable
        Name of the field to extract.

    Returns
    -------
    object
        The stored value (which may itself be None), or `np.nan` when the
        value cannot be read: the key is absent, the sequence is empty, or
        the cell is not a container at all (None, NaN, ...).
    """
    if isinstance(e, dict):
        record = e
    else:
        try:
            record = e[0]
        except (IndexError, KeyError, TypeError):
            # Empty sequence, or a scalar such as None/NaN: nothing to read.
            return np.nan

    try:
        return record[key]
    except (KeyError, IndexError, TypeError):
        # Key missing, or the first element is not a mapping.
        return np.nan

In [117]:
def _first_record(cell):
    """Return the dict that `get_nested` reads from `cell`, or None if there is none."""
    if isinstance(cell, dict):
        return cell
    if isinstance(cell, (list, tuple)) and cell and isinstance(cell[0], dict):
        return cell[0]
    return None


# Nested columns to flatten, processed strictly in this order.
# NOTE(review): the freshly loaded frame shows no 'treatments' column; it
# presumably only exists once 'diagnoses' has been expanded, so it has to stay
# after 'diagnoses' in this list — confirm.
columns = ['diagnoses', 'demographic', 'exposures', 'treatments']
for c in columns:
    nested = clinical[c]

    # Union of the keys over all rows (first-seen order) instead of the keys of
    # row 0 only: a field missing from the first record is no longer silently
    # lost, and an empty first record no longer aborts the cell.
    keys = list(dict.fromkeys(
        key
        for cell in nested
        for key in (_first_record(cell) or {})
    ))

    # One new column per key. A key that occurs in several nested columns is
    # overwritten each time, so it ends up holding the value from the last
    # nested column processed.
    for key in keys:
        clinical[key] = nested.apply(get_nested, args=(key,))

    clinical = clinical.drop(columns=[c])

In [118]:
clinical.head()

Unnamed: 0,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,morphology,...,initial_disease_status,treatment_type,treatment_id,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,2010.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-D8-A1JB_treatment_1,stage iib,19822.0,8500/3,...,,"Pharmaceutical Therapy, NOS",60dc52b4-276e-575d-9e03-21293919d0f6,,,,,,,yes
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,1995.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Lobular carcinoma, NOS",TCGA-B6-A0IE_treatment_1,stage iiia,13982.0,8520/3,...,,"Pharmaceutical Therapy, NOS",d9a24d31-6ed6-5231-a545-edd78387c611,,,,,,,yes
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,1994.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,Infiltrating duct and lobular carcinoma,TCGA-B6-A0RP_treatment_1,not reported,26941.0,8522/3,...,,"Pharmaceutical Therapy, NOS",22b0d7de-607e-5eb2-9260-dd22294c1842,,,,,,,yes
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,1994.0,not reported,not reported,2019-04-28T13:44:29.143389-05:00,"Infiltrating duct carcinoma, NOS",TCGA-B6-A0X1_treatment,not reported,17624.0,8500/3,...,,"Radiation Therapy, NOS",3ec5b0b5-a3d6-5673-8ecb-9aac1ded9b12,,,,,,,yes
4,dddd8e2f-e540-418a-b02e-698d18a12c14,2011.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-A7-A26H_treatment_1,stage iia,26423.0,8500/3,...,,"Pharmaceutical Therapy, NOS",8f13abc3-c782-524e-bce4-7cffcc4d648f,,,,,,,yes


In [119]:
clinical.shape

(1036, 54)

## Removing columns that add no info

### Unique values:

In [120]:
# Row count of the table; also used by later cells.
NUM_RECORDS = len(clinical)

# A column with as many distinct non-null values as there are rows acts as an
# identifier: every record has its own value.
unique_cols = [
    name for name in clinical.columns
    if clinical[name].nunique() == NUM_RECORDS
]
for name in unique_cols:
    print(name)

case_id
submitter_id
diagnosis_id
demographic_id
exposure_id
treatment_id


**case_id** and **submitter_id** will be kept in the DF for now, since we need them in order to match the two tables later.

All of the other unique columns can be removed since they don't have any information that we need:

In [121]:
clinical = clinical.drop(columns=['diagnosis_id', 'demographic_id', 'exposure_id', 'treatment_id'])

In [122]:
clinical.shape

(1036, 50)

I will change the content of the column 'submitter_id' by removing the ending:

In [123]:
# Keep only the part before the first underscore
# (e.g. 'TCGA-D8-A1JB_treatment_1' -> 'TCGA-D8-A1JB').
# Unlike slicing with str.find, an id without an underscore is kept whole
# (find() returns -1 there, which would chop off the last character) and
# missing values stay missing instead of raising.
clinical['submitter_id'] = clinical['submitter_id'].str.split('_', n=1).str[0]

In [124]:
clinical.head()

Unnamed: 0,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,morphology,...,treatment_effect,initial_disease_status,treatment_type,therapeutic_agents,regimen_or_line_of_therapy,treatment_intent_type,treatment_anatomic_site,treatment_outcome,days_to_treatment_end,treatment_or_therapy
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,2010.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-D8-A1JB,stage iib,19822.0,8500/3,...,,,"Pharmaceutical Therapy, NOS",,,,,,,yes
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,1995.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Lobular carcinoma, NOS",TCGA-B6-A0IE,stage iiia,13982.0,8520/3,...,,,"Pharmaceutical Therapy, NOS",,,,,,,yes
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,1994.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,Infiltrating duct and lobular carcinoma,TCGA-B6-A0RP,not reported,26941.0,8522/3,...,,,"Pharmaceutical Therapy, NOS",,,,,,,yes
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,1994.0,not reported,not reported,2019-04-28T13:44:29.143389-05:00,"Infiltrating duct carcinoma, NOS",TCGA-B6-A0X1,not reported,17624.0,8500/3,...,,,"Radiation Therapy, NOS",,,,,,,yes
4,dddd8e2f-e540-418a-b02e-698d18a12c14,2011.0,not reported,not reported,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-A7-A26H,stage iia,26423.0,8500/3,...,,,"Pharmaceutical Therapy, NOS",,,,,,,yes


### Column of all NaNs:

In [125]:
# Columns in which the number of null entries equals the number of records,
# i.e. columns that carry no value at all.
all_nones = [
    name for name in clinical.columns
    if clinical[name].isnull().sum() == NUM_RECORDS
]
print(all_nones)

['days_to_last_known_disease_status', 'days_to_recurrence', 'cigarettes_per_day', 'weight', 'alcohol_intensity', 'bmi', 'years_smoked', 'height', 'days_to_treatment_start', 'treatment_effect', 'initial_disease_status', 'therapeutic_agents', 'regimen_or_line_of_therapy', 'treatment_intent_type', 'treatment_anatomic_site', 'treatment_outcome', 'days_to_treatment_end']


Those columns will be removed since they don't have any information in them:

In [126]:
clinical = clinical.drop(columns=all_nones)

In [127]:
clinical.shape

(1036, 33)

### One value for all of the records:

In [128]:
# Columns with exactly one distinct non-null value: constant across the
# records that have a value at all.
all_same = [name for name in clinical.columns if clinical[name].nunique() == 1]
all_same

['classification_of_tumor',
 'last_known_disease_status',
 'state',
 'tumor_grade',
 'days_to_diagnosis',
 'progression_or_recurrence',
 'gender',
 'alcohol_history']

All of these columns will also be removed since they don't add any value:

In [129]:
clinical = clinical.drop(columns=all_same)

In [130]:
clinical.shape

(1036, 25)

## Checking out all of the columns

In [131]:
clinical.head()

Unnamed: 0,case_id,year_of_diagnosis,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,morphology,created_datetime,prior_treatment,...,days_to_last_follow_up,year_of_birth,race,days_to_birth,ethnicity,vital_status,age_at_index,year_of_death,treatment_type,treatment_or_therapy
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,2010.0,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-D8-A1JB,stage iib,19822.0,8500/3,2019-04-28T14:06:27.187807-05:00,No,...,1688.0,1956.0,white,-19822.0,not hispanic or latino,Alive,54,,"Pharmaceutical Therapy, NOS",yes
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,1995.0,2019-05-03T14:03:43.043868-05:00,"Lobular carcinoma, NOS",TCGA-B6-A0IE,stage iiia,13982.0,8520/3,2019-04-28T13:48:46.403564-05:00,No,...,,1957.0,black or african american,-13982.0,not hispanic or latino,Dead,38,2000.0,"Pharmaceutical Therapy, NOS",yes
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,1994.0,2019-05-03T14:03:43.043868-05:00,Infiltrating duct and lobular carcinoma,TCGA-B6-A0RP,not reported,26941.0,8522/3,2019-04-28T13:49:33.304314-05:00,No,...,,1921.0,white,-26941.0,not hispanic or latino,Dead,73,2002.0,"Pharmaceutical Therapy, NOS",yes
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,1994.0,2019-04-28T13:44:29.143389-05:00,"Infiltrating duct carcinoma, NOS",TCGA-B6-A0X1,not reported,17624.0,8500/3,,No,...,5677.0,1946.0,white,-17624.0,not hispanic or latino,Dead,48,,"Radiation Therapy, NOS",yes
4,dddd8e2f-e540-418a-b02e-698d18a12c14,2011.0,2019-05-03T14:03:43.043868-05:00,"Infiltrating duct carcinoma, NOS",TCGA-A7-A26H,stage iia,26423.0,8500/3,2019-04-28T13:41:37.661995-05:00,No,...,724.0,1939.0,white,-26423.0,not hispanic or latino,Alive,72,,"Pharmaceutical Therapy, NOS",yes


### updated_datetime

In [132]:
clinical['updated_datetime'].value_counts().head()

2019-05-03T14:03:43.043868-05:00    524
2019-04-28T13:54:32.082307-05:00      1
2019-04-28T13:35:21.963003-05:00      1
2019-04-28T13:52:28.485796-05:00      1
2019-04-28T13:30:18.658451-05:00      1
Name: updated_datetime, dtype: int64

It seems like the column 'updated_datetime' has information about the last date in which the information was updated. Since this is not information that we care about, I will remove it:

In [133]:
clinical = clinical.drop(columns=['updated_datetime'])

In [134]:
clinical.shape

(1036, 24)

### primary_diagnosis

In [135]:
clinical['primary_diagnosis'].value_counts()

Infiltrating duct carcinoma, NOS                            763
Lobular carcinoma, NOS                                      201
Infiltrating duct and lobular carcinoma                      28
Infiltrating duct mixed with other types of carcinoma        19
Infiltrating lobular mixed with other types of carcinoma      6
Intraductal papillary adenocarcinoma with invasion            6
Medullary carcinoma, NOS                                      6
Intraductal micropapillary carcinoma                          3
Paget disease and infiltrating duct carcinoma of breast       3
Secretory carcinoma of breast                                 1
Name: primary_diagnosis, dtype: int64

I don't think that this is information that we care about (please correct me if I'm wrong), so I will remove it:

In [136]:
clinical = clinical.drop(columns=['primary_diagnosis'])

In [137]:
clinical.shape

(1036, 23)

### tumor_stage (the target)

In [138]:
sum(clinical['tumor_stage'].isnull())

0

It has no nulls, which is a good thing. The problem is that it has 11 records that are 'not reported'. In addition, stage X means that the stage wasn't determined.

In [139]:
clinical['tumor_stage'].value_counts()

stage iia       337
stage iib       245
stage iiia      149
stage i          83
stage ia         83
stage iiic       62
stage iiib       23
stage iv         19
stage x          12
not reported     11
stage ib          5
stage ii          5
stage iii         2
Name: tumor_stage, dtype: int64

For this column, we need to do a few things (I guess we should do it together):

* decide what to do with the missing values (not reported and stage x)
* decide whether to keep stages with the same number separated (maybe we can apply models and see?)
* change the values to numbers according to the previous one

### age_at_diagnosis

This column contains information about the age of the woman when she was diagnosed. The value in the column is the number of days that she had lived until diagnosis.

In [140]:
clinical[['age_at_diagnosis']].describe()

Unnamed: 0,age_at_diagnosis
count,1022.0
mean,21487.875734
std,4811.606429
min,9706.0
25%,17923.25
50%,21490.0
75%,24800.0
max,32872.0


I will convert the values of the column into years, so we can interpret the data more easily ourselves:

In [141]:
# Days -> years, using 365 days per year.
# The column is overwritten in place, so re-running this cell divides again.
clinical['age_at_diagnosis'] = clinical['age_at_diagnosis'] / 365

In [142]:
clinical[['age_at_diagnosis']].describe()

Unnamed: 0,age_at_diagnosis
count,1022.0
mean,58.870892
std,13.182483
min,26.591781
25%,49.104795
50%,58.876712
75%,67.945205
max,90.060274


Another thing that we can do is to convert this column to categories instead of the value itself, since I suppose that we care more about the age group of the woman than about her exact age.

Now let's look at nulls:

In [143]:
sum(clinical['age_at_diagnosis'].isnull())

14

I will replace these values with the mean value. (The mean and median in this case are very close so I just chose one of them)

In [144]:
clinical['age_at_diagnosis'].mean()

58.8708924215211

In [145]:
clinical['age_at_diagnosis'].median()

58.87671232876713

In [146]:
# Fill the missing ages with the mean of the known ages.
mean_age = clinical['age_at_diagnosis'].mean()
clinical['age_at_diagnosis'] = clinical['age_at_diagnosis'].fillna(mean_age)

In [147]:
sum(clinical['age_at_diagnosis'].isnull())

0

### morphology

**morphology** column contains information about the morphology of the tumor. This is not information that we need so I will remove it. Also I think that there is a little bit of target leakage in this column, since it contains information about the tumor itself, and the goal of the project is not to have this kind of information.

In [148]:
clinical['morphology'].value_counts()

8500/3    763
8520/3    201
8522/3     28
8523/3     19
8524/3      6
8510/3      6
8503/3      6
8541/3      3
8507/3      3
8502/3      1
Name: morphology, dtype: int64

In [149]:
clinical = clinical.drop(columns=['morphology'])

In [150]:
clinical.shape

(1036, 22)

### created_datetime, prior_treatment, icd_10_code

**created_datetime** contains information about the datetime in which this record was added to the website.

**prior_treatment** contains information about prior treatments given to the patient.

**ICD-10 code** is a diagnostic code used to describe a patient’s medical condition.

All of these columns contain information that we don't care about, therefore I will remove them:

In [151]:
not_valuable = ['created_datetime', 'prior_treatment', 'icd_10_code']
clinical = clinical.drop(columns=not_valuable)

In [152]:
clinical.shape

(1036, 19)

In [153]:
clinical.head()

Unnamed: 0,case_id,year_of_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,tissue_or_organ_of_origin,prior_malignancy,synchronous_malignancy,site_of_resection_or_biopsy,days_to_last_follow_up,year_of_birth,race,days_to_birth,ethnicity,vital_status,age_at_index,year_of_death,treatment_type,treatment_or_therapy
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,2010.0,TCGA-D8-A1JB,stage iib,54.306849,"Breast, NOS",no,No,"Breast, NOS",1688.0,1956.0,white,-19822.0,not hispanic or latino,Alive,54,,"Pharmaceutical Therapy, NOS",yes
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,1995.0,TCGA-B6-A0IE,stage iiia,38.306849,"Breast, NOS",no,No,"Breast, NOS",,1957.0,black or african american,-13982.0,not hispanic or latino,Dead,38,2000.0,"Pharmaceutical Therapy, NOS",yes
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,1994.0,TCGA-B6-A0RP,not reported,73.810959,"Breast, NOS",no,No,"Breast, NOS",,1921.0,white,-26941.0,not hispanic or latino,Dead,73,2002.0,"Pharmaceutical Therapy, NOS",yes
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,1994.0,TCGA-B6-A0X1,not reported,48.284932,"Breast, NOS",no,No,"Breast, NOS",5677.0,1946.0,white,-17624.0,not hispanic or latino,Dead,48,,"Radiation Therapy, NOS",yes
4,dddd8e2f-e540-418a-b02e-698d18a12c14,2011.0,TCGA-A7-A26H,stage iia,72.391781,"Breast, NOS",yes,Not Reported,"Breast, NOS",724.0,1939.0,white,-26423.0,not hispanic or latino,Alive,72,,"Pharmaceutical Therapy, NOS",yes


### year_of_diagnosis

This column contains important information about the year that the patient was diagnosed with cancer. Nevertheless, we have information about the age of the patient at diagnosis, which is more important for us. Therefore, I will remove it:

In [154]:
clinical = clinical.drop(columns=['year_of_diagnosis'])

In [155]:
clinical.shape

(1036, 18)

### tissue_or_organ_of_origin

The column contains information about the exact location of the tumor. I don't think that this is something that interests us so I will remove it:

In [156]:
clinical['tissue_or_organ_of_origin'].value_counts()

Breast, NOS                       1027
Lower-inner quadrant of breast       3
Upper-outer quadrant of breast       2
Upper-inner quadrant of breast       2
Overlapping lesion of breast         1
Lower-outer quadrant of breast       1
Name: tissue_or_organ_of_origin, dtype: int64

In [157]:
clinical = clinical.drop(columns=['tissue_or_organ_of_origin'])

In [158]:
clinical.shape

(1036, 17)

### prior_malignancy

Meant to tell if the patient has had cancer before. I guess that this could be interesting, so I will convert it to 1 and 0:

In [159]:
clinical['prior_malignancy'].value_counts()

no              973
yes              62
not reported      1
Name: prior_malignancy, dtype: int64

In [160]:
clinical['prior_malignancy'] = clinical['prior_malignancy'].replace({'no': 0, 'yes': 1})

In [161]:
clinical['prior_malignancy'].value_counts()

0               973
1                62
not reported      1
Name: prior_malignancy, dtype: int64

As for the record that had 'not reported', I will replace the value with the mode of this column, which is 0:

In [162]:
clinical['prior_malignancy'].mode()

0    0
dtype: object

In [163]:
clinical['prior_malignancy'] = clinical['prior_malignancy'].replace({'not reported': 0}) 

In [164]:
clinical['prior_malignancy'].value_counts()

0    974
1     62
Name: prior_malignancy, dtype: int64

The column doesn't have any null values:

In [165]:
sum(clinical['prior_malignancy'].isnull())

0

### synchronous_malignancy

The column describes whether the patient had an additional malignant diagnosis at the same time the breast cancer was diagnosed. If both tumors were sequenced, both tumors would have synchronous malignancies.

Let's look at the values of the column:

In [166]:
clinical['synchronous_malignancy'].value_counts()

No              973
Not Reported     63
Name: synchronous_malignancy, dtype: int64

I think that this column can be dropped, since there are a lot of values registered as not reported, and the rest of it is only one value - no. (Tell me if you think otherwise)

In [167]:
clinical = clinical.drop(columns=['synchronous_malignancy'])

In [168]:
clinical.shape

(1036, 16)

### site_of_resection_or_biopsy

In [169]:
clinical['site_of_resection_or_biopsy'].value_counts()

Breast, NOS                       1027
Lower-inner quadrant of breast       3
Upper-outer quadrant of breast       2
Upper-inner quadrant of breast       2
Overlapping lesion of breast         1
Lower-outer quadrant of breast       1
Name: site_of_resection_or_biopsy, dtype: int64

Here the column has information about the place of the tumor. As I said in tissue_or_organ_of_origin, I will remove it:

In [170]:
clinical = clinical.drop(columns=['site_of_resection_or_biopsy'])

In [171]:
clinical.shape

(1036, 15)

In [172]:
clinical.head()

Unnamed: 0,case_id,submitter_id,tumor_stage,age_at_diagnosis,prior_malignancy,days_to_last_follow_up,year_of_birth,race,days_to_birth,ethnicity,vital_status,age_at_index,year_of_death,treatment_type,treatment_or_therapy
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,TCGA-D8-A1JB,stage iib,54.306849,0,1688.0,1956.0,white,-19822.0,not hispanic or latino,Alive,54,,"Pharmaceutical Therapy, NOS",yes
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,TCGA-B6-A0IE,stage iiia,38.306849,0,,1957.0,black or african american,-13982.0,not hispanic or latino,Dead,38,2000.0,"Pharmaceutical Therapy, NOS",yes
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,TCGA-B6-A0RP,not reported,73.810959,0,,1921.0,white,-26941.0,not hispanic or latino,Dead,73,2002.0,"Pharmaceutical Therapy, NOS",yes
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,TCGA-B6-A0X1,not reported,48.284932,0,5677.0,1946.0,white,-17624.0,not hispanic or latino,Dead,48,,"Radiation Therapy, NOS",yes
4,dddd8e2f-e540-418a-b02e-698d18a12c14,TCGA-A7-A26H,stage iia,72.391781,1,724.0,1939.0,white,-26423.0,not hispanic or latino,Alive,72,,"Pharmaceutical Therapy, NOS",yes


### days_to_last_follow_up

Information about the days from diagnosis to the last follow up. We don't care about that, so it will be removed:

In [173]:
clinical = clinical.drop(columns=['days_to_last_follow_up'])

In [174]:
clinical.shape

(1036, 14)

### year_of_death, vital_status

Both of these columns contain information about the death of the patient. This is not information that we would like to have, since we want our model to be able to predict the stage for women who are still alive. Therefore I will remove both of them:

In [175]:
clinical['year_of_death'].value_counts()

2008.0    15
2005.0    11
2010.0    11
2004.0    10
2001.0     8
2002.0     6
2006.0     6
2000.0     5
2009.0     5
2003.0     5
1999.0     5
1998.0     3
2007.0     3
1992.0     2
1996.0     2
2012.0     1
2013.0     1
1995.0     1
1994.0     1
Name: year_of_death, dtype: int64

In [176]:
clinical['vital_status'].value_counts()

Alive    892
Dead     144
Name: vital_status, dtype: int64

In [177]:
clinical = clinical.drop(columns=['vital_status', 'year_of_death'])

In [178]:
clinical.shape

(1036, 12)

### year_of_birth, days_to_birth

**days_to_birth** is the number of days between the date used as index and the person's date of birth, represented as a calculated negative number of days.

**year_of_birth** is self explanatory.

We don't need either of these columns since we have the information about the woman's age at diagnosis. I will remove them:

In [179]:
clinical['days_to_birth'].value_counts()

-32872.0    8
-22199.0    3
-24315.0    2
-23404.0    2
-16642.0    2
           ..
-20625.0    1
-23922.0    1
-27400.0    1
-31932.0    1
-22848.0    1
Name: days_to_birth, Length: 986, dtype: int64

In [180]:
clinical = clinical.drop(columns=['year_of_birth', 'days_to_birth'])

In [181]:
clinical.shape

(1036, 10)

### treatment_type, treatment_or_therapy

These two columns contain information about the treatment given to the patients. This is not information that we would like to keep, since we want to diagnose our patients according to their general information and genes, and not according to treatment that they have received. Moreover, the treatment is given according to the stage of the patient, and we don't want to have target leakage.

In [182]:
clinical['treatment_type'].value_counts()

Pharmaceutical Therapy, NOS    524
Radiation Therapy, NOS         512
Name: treatment_type, dtype: int64

In [183]:
clinical['treatment_or_therapy'].value_counts()

yes             665
no              282
not reported     89
Name: treatment_or_therapy, dtype: int64

In [184]:
clinical = clinical.drop(columns=['treatment_type', 'treatment_or_therapy'])

In [185]:
clinical.shape

(1036, 8)

### race, ethnicity

Let's look at both of the columns:

In [186]:
clinical['race'].value_counts()

white                               714
black or african american           172
not reported                         93
asian                                56
american indian or alaska native      1
Name: race, dtype: int64

In [187]:
clinical['ethnicity'].value_counts()

not hispanic or latino    830
not reported              170
hispanic or latino         36
Name: ethnicity, dtype: int64

They both contain information about the ethnicity of the patient. I tried to see whether we can combine them into one column with all of these categories, but it seems like it's not possible, for example:

patients who are both white and hispanic or latino:

In [188]:
len(clinical[(clinical['race'] == 'white') & (clinical['ethnicity'] == 'hispanic or latino')])

31

patients who are both white and not hispanic or latino:

In [189]:
len(clinical[(clinical['race'] == 'white') & (clinical['ethnicity'] == 'not hispanic or latino')])

618

We can see that there is an overlap on some of the values - for example, if someone is white and not hispanic or latino, what does it mean? Does it mean that she is not white and only hispanic? I don't know what to do with this.

What I decided to do is to remove the ethnicity column and leave the race column in.

In [190]:
clinical = clinical.drop(columns=['ethnicity'])

In [191]:
clinical.shape

(1036, 7)

The column of race I will turn into one hot encoding, yet I will remove the columns of 'american indian or alaska native' (which will contain only one value as 1) and 'not reported' (doesn't have any information in it).

In [192]:
# One-hot encode 'race' into one 0/1 indicator column per category.
# The dtype is pinned explicitly: newer pandas versions default to boolean
# indicators, which would change the 0/1 values shown and exported below.
clinical = pd.get_dummies(clinical, columns=['race'], dtype='uint8')

In [193]:
clinical = clinical.drop(columns=['race_american indian or alaska native', 'race_not reported'])

In [194]:
clinical.shape

(1036, 9)

### age_at_index

Contains information about the patient's age (in years) on the reference or anchor date used during date obfuscation. Data obfuscation (DO) is a form of data masking where data is purposely scrambled to prevent unauthorized access to sensitive materials, or in our case for privacy preservation.

I believe that we don't need this information, therefore I will remove the column:

In [195]:
clinical = clinical.drop(columns=['age_at_index'])

## Final result - Clean data!

In [196]:
clinical.shape

(1036, 8)

In [197]:
clinical.isnull().sum()

case_id                           0
submitter_id                      0
tumor_stage                       0
age_at_diagnosis                  0
prior_malignancy                  0
race_asian                        0
race_black or african american    0
race_white                        0
dtype: int64

In [198]:
clinical.head()

Unnamed: 0,case_id,submitter_id,tumor_stage,age_at_diagnosis,prior_malignancy,race_asian,race_black or african american,race_white
0,dd96a9c7-899c-47cd-a0f9-b149ed07a5d6,TCGA-D8-A1JB,stage iib,54.306849,0,0,0,1
1,3f834fa7-6d7b-4b85-98c0-5c55d55b6c95,TCGA-B6-A0IE,stage iiia,38.306849,0,0,1,0
2,451e1a67-47e6-4738-99d7-fb7771ef61a3,TCGA-B6-A0RP,not reported,73.810959,0,0,0,1
3,178b2c48-c07d-422e-ae17-8bcfd996ad51,TCGA-B6-A0X1,not reported,48.284932,0,0,0,1
4,dddd8e2f-e540-418a-b02e-698d18a12c14,TCGA-A7-A26H,stage iia,72.391781,1,0,0,1


## Exporting the data

In [199]:
# Write the cleaned table to a CSV file in the working directory.
# No `index` argument is passed, so the DataFrame index is written as well
# (pandas default) and the file starts with an unnamed index column.
# NOTE(review): pass index=False if downstream readers do not expect it — confirm.
clinical.to_csv('clinical_clean_doria.csv')