In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
import seaborn as sns
# Apply seaborn's default theme to matplotlib figures created in this session.
sns.set()

In [78]:
# Load the gathered dataset and discard the 'Unnamed: 0' column that comes with the CSV.
df = (
    pd.read_csv('data/gathered.csv')
    .drop('Unnamed: 0', axis=1)
)
df.head(n=2)

Unnamed: 0,file_name,case_id,year_of_diagnosis,classification_of_tumor,last_known_disease_status,updated_datetime,primary_diagnosis,submitter_id,tumor_stage,age_at_diagnosis,...,ENSGR0000275287.3,ENSGR0000276543.3,ENSGR0000277120.3,ENSGR0000280767.1,ENSGR0000281849.1,__alignment_not_unique,__ambiguous,__no_feature,__not_aligned,__too_low_aQual
0,2b5c518f-8327-478a-a282-01bfe59aca4c.htseq.cou...,4d0fec97-e024-4608-a0cc-426a3decc7b1,2007.0,not reported,not reported,2019-04-28T13:50:16.849041-05:00,"Infiltrating duct carcinoma, NOS",TCGA-AR-A252_exposure,stage i,18611.0,...,0.0,0.0,0.0,0.0,0.0,33323894.0,3232629.0,5808920.0,0.0,0.0
1,64262505-9f17-4989-94c9-fc6db56ca676.htseq.cou...,bb8d42d3-ad65-4d88-ae1d-f9aadfc7962d,2010.0,not reported,not reported,2019-04-28T13:39:49.010685-05:00,"Lobular carcinoma, NOS",TCGA-AO-A1KS_exposure,stage iia,25230.0,...,0.0,0.0,0.0,0.0,0.0,19183901.0,2673271.0,6146344.0,0.0,0.0


# Gene expressions

In [80]:
# Columns whose name starts with 'ENSG' are treated as the gene columns.
is_gene_column = df.columns.str.startswith('ENSG')
gens_indexes = df.columns[is_gene_column]
gens = df.loc[:, gens_indexes]
# Number of distinct (non-null) values observed in each gene column.
gens_varieties = gens.nunique()

In [81]:
# Share of gene columns that hold exactly one distinct value across the dataset.
total_gens = len(gens_varieties)
constant_gens = len(gens_varieties[gens_varieties == 1])
percentage_of_non_varying = np.round(float(constant_gens) * 100 / total_gens, 2)
print(f"""The total number of gens we have is {total_gens}
But {percentage_of_non_varying} % have the same value for all the dataset""")

The total number of gens we have is 60483
But 3.67 % have the same value for all the dataset


So we will drop these constant (non-varying) genes from df, since they carry no information

In [82]:
# Names of the gene columns with exactly one distinct value; they are removed from df.
is_constant = gens_varieties == 1
insignificant_gens = np.array(gens_varieties[is_constant].index)
df = df.drop(insignificant_gens, axis=1)

In [83]:
# (rows, columns) of df at this point in the notebook.
df.shape

(1164, 58312)

# Clinical info


In [84]:
# Recompute the gene column names on the current df and build `clinical`
# as df without those columns.
is_gene_column = df.columns.str.startswith('ENSG')
gens_indexes = df.columns[is_gene_column]
clinical = df.drop(gens_indexes, axis=1)

In [85]:
# Distinct-value count per clinical column; display the columns that have exactly one.
feature_uniques = clinical.nunique()
single_valued = feature_uniques == 1
feature_uniques.loc[single_valued]

classification_of_tumor      1
last_known_disease_status    1
state                        1
tumor_grade                  1
days_to_diagnosis            1
progression_or_recurrence    1
gender                       1
alcohol_history              1
__not_aligned                1
__too_low_aQual              1
dtype: int64

In [86]:
# Start the drop list with every clinical column that has a single distinct value.
columns_to_drop = feature_uniques[feature_uniques == 1].index.tolist()

In [87]:
NUM_RECORDS = len(clinical)
# NOTE(review): 1036 was a bare literal in this check. It presumably is the number of
# distinct patients behind the records — confirm, and ideally derive it from the data.
NUM_DISTINCT_ENTITIES = 1036

# Flag identifier-like columns: those whose number of distinct non-null values equals
# either the record count or NUM_DISTINCT_ENTITIES.
for col in clinical.columns:
    n_unique = clinical[col].nunique()  # same count as len(value_counts()), computed once
    if n_unique in (NUM_RECORDS, NUM_DISTINCT_ENTITIES):
        print(col)
        columns_to_drop.append(col)

file_name
case_id
updated_datetime
submitter_id
diagnosis_id
treatments
demographic_id
exposure_id
__alignment_not_unique
__ambiguous
__no_feature


**case_id** and **submitter_id** will be kept in the DF for now, since we need them in order to match the two tables later.  
All of the other unique columns can be removed since they don't contain any information that we need:


In [89]:
# Keep the two identifier columns out of the drop list.
# A filter is used instead of list.remove() so that re-running this cell is a no-op
# rather than a ValueError once the names are already gone.
COLUMNS_TO_KEEP = ('submitter_id', 'case_id')
columns_to_drop = [col for col in columns_to_drop if col not in COLUMNS_TO_KEEP]

#### Columns of all NaNs:

In [90]:
# Clinical columns in which every one of the NUM_RECORDS rows is null.
all_nones = [
    col for col in clinical.columns
    if clinical[col].isnull().sum() == NUM_RECORDS
]
print(all_nones)

columns_to_drop = [*columns_to_drop, *all_nones]

['days_to_last_known_disease_status', 'created_datetime', 'days_to_recurrence', 'cigarettes_per_day', 'weight', 'alcohol_intensity', 'bmi', 'years_smoked', 'height']


#### Column of 1 value:

In [91]:
# Clinical columns that contain exactly one distinct non-null value.
all_same = [col for col in clinical.columns if clinical[col].nunique() == 1]
print(all_same)

columns_to_drop = [*columns_to_drop, *all_same]

['classification_of_tumor', 'last_known_disease_status', 'state', 'tumor_grade', 'days_to_diagnosis', 'progression_or_recurrence', 'gender', 'alcohol_history', '__not_aligned', '__too_low_aQual']


#### Checking other features

In [92]:
# Five most frequent values of 'updated_datetime' with their counts.
clinical['updated_datetime'].value_counts().head()

2019-04-28T08:46:02.486768-05:00    4
2019-04-28T08:47:05.568166-05:00    4
2019-04-28T08:45:30.379556-05:00    3
2019-04-28T13:49:59.868148-05:00    3
2019-04-28T13:53:52.434304-05:00    3
Name: updated_datetime, dtype: int64

It seems like the column 'updated_datetime' has information about the last date in which the information was updated. Since this is not information that we care about, I will remove it:

In [93]:
# Queue 'updated_datetime' for removal.
columns_to_drop += ['updated_datetime']

In [94]:
# Frequency of each 'primary_diagnosis' value.
clinical['primary_diagnosis'].value_counts()

Infiltrating duct carcinoma, NOS                            866
Lobular carcinoma, NOS                                      211
Infiltrating duct and lobular carcinoma                      37
Infiltrating duct mixed with other types of carcinoma        21
Medullary carcinoma, NOS                                      8
Infiltrating lobular mixed with other types of carcinoma      7
Intraductal papillary adenocarcinoma with invasion            6
Paget disease and infiltrating duct carcinoma of breast       4
Intraductal micropapillary carcinoma                          3
Secretory carcinoma of breast                                 1
Name: primary_diagnosis, dtype: int64

I don't think that this is information that we care about (please correct me if I'm wrong), so I will remove it:

In [95]:
# Queue 'primary_diagnosis' for removal.
columns_to_drop += ['primary_diagnosis']

**age_at_diagnosis**  
This column contains information about the age of the woman when she was diagnosed. The value in the column is the number of days that she had lived until diagnosis.  

In [96]:
# Summary statistics of 'age_at_diagnosis' as stored in `clinical` (before any conversion on df).
clinical[['age_at_diagnosis']].describe()

Unnamed: 0,age_at_diagnosis
count,1149.0
mean,21421.413403
std,4860.540308
min,9706.0
25%,17710.0
50%,21426.0
75%,24746.0
max,32872.0


I will convert the values of the column into years, so we can interpret the data more easily ourselves:

In [97]:
# Convert age at diagnosis from days to years (365-day years), vectorized.
# The source is the `clinical` copy of the column, which no cell shown here modifies,
# so re-running this cell does not divide the values in df a second time.
df['age_at_diagnosis'] = clinical['age_at_diagnosis'] / 365

Now let's check for nulls

In [98]:
# Number of missing values, counted on `clinical` (a separate frame built from df via drop).
sum(clinical['age_at_diagnosis'].isnull())

15

In [99]:
# Impute missing ages with the mean of the observed values in df.
mean_age_at_diagnosis = df['age_at_diagnosis'].mean()
df['age_at_diagnosis'] = df['age_at_diagnosis'].fillna(mean_age_at_diagnosis)

**morphology**   
This column contains information about the morphology of the tumor. This is not information that we need so I will remove it. Also I think that there is a little bit of target leakage in this column, since it contains information about the tumor itself, and the goal of the project is not to have this kind of information.

In [100]:
# Queue 'morphology' for removal.
columns_to_drop += ['morphology']

**created_datetime** contains information about the datetime in which this record was added to the website.

**prior_treatment** contains information about prior treatments given to the patient.

**ICD-10 code** is a diagnostic code used to describe a patient’s medical condition.

All of these columns contain information that we don't care about, therefore I will remove them:

In [101]:
# Queue the three metadata columns described above for removal.
columns_to_drop = [
    *columns_to_drop,
    'created_datetime',
    'prior_treatment',
    'icd_10_code',
]

**year_of_diagnosis**  
This column contains important information about the year that the patient was diagnosed with cancer. Nevertheless, we have information about the age of the patient at diagnosis, which is more important for us. Therefore, I will remove it:

In [102]:
# Queue 'year_of_diagnosis' for removal.
columns_to_drop += ['year_of_diagnosis']

**tissue_or_organ_of_origin**  
The column contains information about the exact location of the tumor. I don't think that this is something that interests us so I will remove it:

In [103]:
# Queue 'tissue_or_organ_of_origin' for removal.
columns_to_drop += ['tissue_or_organ_of_origin']

**prior_malignancy**  
Meant to tell if the patient has had cancer before. I guess that this could be interesting, so I will convert it to 1 and 0:

In [104]:
# Frequency of each raw 'prior_malignancy' value.
clinical['prior_malignancy'].value_counts()

no              1095
yes               68
not reported       1
Name: prior_malignancy, dtype: int64

In [105]:
# Encode 'no' -> 0 and 'yes' -> 1; any other value is left as it is.
df['prior_malignancy'] = df['prior_malignancy'].replace(['no', 'yes'], [0, 1])

As for the record that had 'not reported', I will replace the value with the mode of this column, which is 0:

In [106]:
# Show the most frequent raw value (from `clinical`), then map 'not reported' to 0 in df.
most_common_raw = clinical['prior_malignancy'].mode()
print(most_common_raw)
df['prior_malignancy'] = df['prior_malignancy'].replace('not reported', 0)

0    no
dtype: object


**synchronous_malignancy**  
The column describes whether the patient had an additional malignant diagnosis at the same time the breast cancer was diagnosed. If both tumors were sequenced, both tumors would have synchronous malignancies.

Let's look at the values of the column:

In [107]:
# Frequency of each 'synchronous_malignancy' value.
clinical['synchronous_malignancy'].value_counts()

No              1095
Not Reported      69
Name: synchronous_malignancy, dtype: int64

I think that this column can be dropped, since there are a lot of values registered as not reported, and the rest of it is only one value - no. (Tell me if you think otherwise)

In [108]:
# Queue 'synchronous_malignancy' for removal.
columns_to_drop += ['synchronous_malignancy']

**site_of_resection_or_biopsy**  
Here the column has information about the place of the tumor. As I said in tissue_or_organ_of_origin, I will remove it:

In [109]:
# Frequency of each 'site_of_resection_or_biopsy' value.
clinical['site_of_resection_or_biopsy'].value_counts()

Breast, NOS                       1148
Lower-inner quadrant of breast       6
Upper-outer quadrant of breast       5
Overlapping lesion of breast         2
Upper-inner quadrant of breast       2
Lower-outer quadrant of breast       1
Name: site_of_resection_or_biopsy, dtype: int64

In [110]:
# Queue 'site_of_resection_or_biopsy' for removal.
columns_to_drop += ['site_of_resection_or_biopsy']

**days_to_last_follow_up**  
Information about the days from diagnosis to the last follow up. We don't care about that, so it will be removed:

In [111]:
# Queue 'days_to_last_follow_up' for removal.
columns_to_drop += ['days_to_last_follow_up']

**year_of_death, vital_status**  

Both of these columns contain information about the death of the patient. This is not information that we would like to have, since we want our model to be able to predict the stage for women who are still alive. Therefore I will remove both of them:

In [112]:
# Print the value frequencies of the two death-related columns, one after the other.
for death_col in ('year_of_death', 'vital_status'):
    print(clinical[death_col].value_counts())

2008.0    18
2005.0    17
2004.0    16
2001.0    14
2010.0    12
2006.0    10
2002.0     9
2000.0     8
2003.0     8
1999.0     7
2007.0     6
2009.0     5
1998.0     3
1992.0     2
1996.0     2
2013.0     1
1995.0     1
1994.0     1
2012.0     1
Name: year_of_death, dtype: int64
Alive    971
Dead     193
Name: vital_status, dtype: int64


In [113]:
# Queue both death-related columns for removal.
columns_to_drop.extend(['year_of_death', 'vital_status'])

**days_to_birth** is the number of days between the date used for index and the date from a person's date of birth, represented as a calculated negative number of days.

**year_of_birth** is self explanatory.

We don't need either of these columns since we already have the patient's age at diagnosis. I will remove them:

In [114]:
# Queue both birth-related columns for removal.
columns_to_drop.extend(['days_to_birth', 'year_of_birth'])

**treatment_type, treatment_or_therapy**  

These two columns contain information about the treatment given to the patients. This is not information that we would like to keep since we want to diagnose our patients according to their general information and genes, and not according to treatment that they have received. Moreover, the treatment is given according to the stage of the patient, and we don't want to have target leakage.

In [115]:
# NOTE(review): both drops below are disabled, so this cell adds nothing to columns_to_drop.
# If 'treatment_type' / 'treatment_or_therapy' exist in df they are therefore kept — confirm
# whether that is intended, given the target-leakage concern stated in the text above.
# columns_to_drop.append('treatment_type')
# columns_to_drop.append('treatment_or_therapy')

**race, ethnicity**

In [116]:
# Frequency of each 'race' value.
clinical['race'].value_counts()

white                               832
black or african american           180
not reported                         94
asian                                57
american indian or alaska native      1
Name: race, dtype: int64

In [117]:
# Frequency of each 'ethnicity' value.
clinical['ethnicity'].value_counts()

not hispanic or latino    933
not reported              195
hispanic or latino         36
Name: ethnicity, dtype: int64

They both contain information about the ethnicity of the patient. I tried to see maybe we can combine both of them, and then to have one column with all of these categories, but it seems like it's not possible, for example:

patients who are both white and hispanic or latino:

In [118]:
# Count rows with race 'white' for each of the two reported ethnicity values.
is_white = df['race'] == 'white'
for ethnicity_value in ('hispanic or latino', 'not hispanic or latino'):
    matches = is_white & (df['ethnicity'] == ethnicity_value)
    print(int(matches.sum()))

31
712


We can see that there is an overlap on some of the values - for example, if someone is white and not hispanic or latino, what does it mean? Does it mean that she is not white and only hispanic? I don't know what to do with this. 

What I decided to do is to remove the ethnicity column and leave the race column in.

In [119]:
# Queue 'ethnicity' for removal ('race' is kept).
columns_to_drop += ['ethnicity']

**age_at_index**  

Contains information about the patient's age (in years) on the reference or anchor date used during date obfuscation. Data obfuscation (DO) is a form of data masking where data is purposely scrambled to prevent unauthorized access to sensitive materials, or in our case for privacy preservation.

I believe that we don't need this information, therefore I will remove the column:

In [120]:
# Queue 'age_at_index' for removal.
columns_to_drop += ['age_at_index']

### Drop it

In [123]:
# The drop list contains repeated names, so de-duplicate it before dropping from df.
unique_columns_to_drop = sorted(set(columns_to_drop))
df = df.drop(unique_columns_to_drop, axis=1)

In [124]:
# (rows, columns) of df after the queued columns were dropped.
df.shape

(1164, 58269)

## Label

In [24]:
# Frequency of each raw 'tumor_stage' value.
# NOTE(review): this reads from `clinical`, not from the cleaned df, and the cell's execution
# count is out of sequence with its neighbours — re-run on a fresh kernel to confirm the output.
clinical['tumor_stage'].value_counts()

stage iia       378
stage iib       280
stage iiia      167
stage i          99
stage ia         88
stage iiic       68
stage iiib       27
stage iv         21
stage x          12
not reported     12
stage ii          5
stage ib          5
stage iii         2
Name: tumor_stage, dtype: int64

`stage x` means that the tumor can't be assessed, which is basically the same as `not reported`.  
  
So let's merge it

In [126]:
# Fold 'not reported' into the existing 'stage x' category.
df['tumor_stage'] = df['tumor_stage'].replace('not reported', 'stage x')

In [128]:
# Frequency of each 'tumor_stage' value in df after the merge.
df['tumor_stage'].value_counts()

stage iia     378
stage iib     280
stage iiia    167
stage i        99
stage ia       88
stage iiic     68
stage iiib     27
stage x        24
stage iv       21
stage ib        5
stage ii        5
stage iii       2
Name: tumor_stage, dtype: int64

# Persist it

In [129]:
# Persist the cleaned frame. The row index is written too (pandas' default, made explicit
# here); it becomes an unnamed first column that read_csv reports as 'Unnamed: 0'.
df.to_csv('data/cleaned.csv', index=True)