In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
# Load the master table and the updated ITD calls, then inner-join them on `data_pd`.
df_master = pd.read_csv("../../../data/initial_dataset/Master_04_10_2019.csv", sep=",", low_memory=False)
# Drop the first two columns by position (presumably leftover index columns from
# an earlier CSV export -- TODO confirm). `columns=` is used because the positional
# `axis` argument of DataFrame.drop is no longer accepted by recent pandas.
df_master = df_master.drop(columns=df_master.columns[[0, 1]])
df_itd = pd.read_csv("../../../data/updated_dataset/ITD_merge_updated.csv", sep=",", low_memory=False)
# Same for the first column of the ITD table.
df_itd = df_itd.drop(columns=df_itd.columns[0])
# Inner join: only rows whose `data_pd` is present in both tables are kept.
df_merge = pd.merge(df_master, df_itd, on='data_pd', how='inner')

# RULES :

We start from the initial dataframe that was provided.

1) Replace FLT3_ITD with the new ITD column, which is built from the following rule: we have both NGS and clinical ITD calls, and in most cases they are concordant. When they disagree, we give priority to the clinical ITD call if it is 1; otherwise we look at the read counts (details at the bottom of the ITD.ipynb notebook).

2) Additions: sum the p and q arms for each chromosome and return 1 if the sum is >= 1. Example: add_10 = add_10p + add_10q (the code below also adds the whole-chromosome gain column, e.g. plus10).

3) Deletions: exactly the same. Example: del_10 = del_10p + del_10q.

4) Keep only the new addition and deletion columns with a frequency >= 2%.

5) Translocations: keep all translocations that appear at least twice. For translocations that appear only once, create a new column "other transloc" that sums them and returns 1 when the value is >= 1.

6) Inversions: there is only one inversion with a frequency greater than 2% (inv 16). We will add inv3 because it forms its own cluster with SF3B1 (we will not do this in this notebook).

7) Replace the complex column by 1 if the sum of aneuploidies (any additions and any deletions) is >= 3.

# Data Frame with all genomics and cytogenetic features

In [3]:
# Slice the three column groups out of the merged table (label-based, inclusive ranges).
genomic_features = df_merge.loc[:, 'ASXL1':'ZRSR2']
cyto_features = df_merge.loc[:, 't_v_11':'complex']
id_features = df_merge.loc[:, ['data_pd', 'sample_pd']]

print('number of genomic features: ' + str(genomic_features.shape[1]))
print('number of cyto features: ' + str(cyto_features.shape[1]))

# Genomic columns first, then cytogenetic columns.
genomic_and_cyto = genomic_features.join(cyto_features)
print('number of genomic + cyto features: ' + str(genomic_and_cyto.shape[1]))

# Put the two identifier columns in front of all features.
df_w_transloc = id_features.join(genomic_and_cyto)
print('number of genomic + cyto features + id features: ' + str(df_w_transloc.shape[1]))

number of genomic features: 83
number of cyto features: 355
number of genomic + cyto features: 438
number of genomic + cyto features + id features: 440


# 1 Replace FLT3_ITD by Merge_ITD_new_rules

In [4]:

# Overwrite the FLT3_ITD values with the merged ITD call and expose the column as `ITD`.
# Note: once the column has been renamed, running this cell a second time appends a
# new FLT3_ITD column and renames it too, producing a duplicate `ITD` column.
df_w_transloc = (
    df_w_transloc
    .assign(FLT3_ITD=df_merge['Merge_ITD_new_rules'])
    .rename(columns={'FLT3_ITD': 'ITD'})
)
print(df_w_transloc.shape)

(2427, 440)


Let's get rid of the rows that contain NaN values in the translocation features.

In [5]:
# Count the rows where t_13_19 is missing.
n_rows_missing_transloc = df_w_transloc['t_13_19'].isna().sum()
print('There are ' + str(n_rows_missing_transloc) + ' rows with Nan values in this translocation features')

There are 277 rows with Nan values in this translocation features


##### Let's get rid of them (unfortunately the size of the dataset is going to decrease by that number of rows)

##### The final dataset without those NaN values and with all genomics and translocation features is as follows: (we also save it)

In [6]:
# Keep only the rows where t_13_19 is present.
# (The rows removed here have NaNs for all translocations and lots of cyto events.)
has_transloc_data = df_w_transloc['t_13_19'].notna()
df_w_transloc = df_w_transloc[has_transloc_data]
df_w_transloc.shape

(2150, 440)

#  Rules for Cyto events

# 2 Additions

In [7]:
# Rule 2 (additions): build one combined gain column per chromosome
# (add_<chr> = add_<chr>p + add_<chr>q + plus<chr>), report its frequency, keep the
# columns reaching the frequency threshold and binarise them.
min_freq_pct = 2  # rule 4: keep events with frequency >= 2%

# `axis=1` is passed by keyword: the positional form is not accepted by recent pandas.
tmp_add = pd.concat(
    [df_w_transloc.loc[:, 'add_10p':'add_xq'], df_w_transloc.loc[:, 'plus1':'plusy']],
    axis=1,
)

to_keep = []
# Autosomes 1..22 and chromosome x all have p-arm, q-arm and whole-chromosome columns.
for chrom in [str(i) for i in range(1, 23)] + ['x']:
    col = 'add_' + chrom
    tmp_add[col] = tmp_add[col + 'p'] + tmp_add[col + 'q'] + tmp_add['plus' + chrom]
    # NOTE(review): the frequency is computed on the raw sum, before the values are
    # capped at 1, so a sample with several events on the same chromosome is counted
    # more than once -- confirm this is the intended definition of "frequency".
    # skipna=False keeps the NaN-propagating behaviour of the built-in sum().
    freq = 100 * tmp_add[col].sum(skipna=False) / tmp_add.shape[0]
    print('Chromosome : ' + chrom + ' with Freqency(%)', freq)
    if freq >= min_freq_pct:
        to_keep.append(col)

# Chromosome y only has the whole-chromosome gain column.
freq = 100 * tmp_add['plusy'].sum(skipna=False) / tmp_add.shape[0]
print('Chromosome : y with Freqency(%)', freq)
if freq >= min_freq_pct:
    to_keep.append('plusy')

print()
print('We keep : ' + str(to_keep))
# Binarise: any value above 1 becomes 1. clip() returns a new frame, which avoids
# the chained assignment on a column subset.
tmp_add = tmp_add[to_keep].clip(upper=1)


Chromosome : 1 with Freqency(%) 1.5348837209302326
Chromosome : 2 with Freqency(%) 0.9767441860465116
Chromosome : 3 with Freqency(%) 1.302325581395349
Chromosome : 4 with Freqency(%) 1.5813953488372092
Chromosome : 5 with Freqency(%) 0.8837209302325582
Chromosome : 6 with Freqency(%) 1.9534883720930232
Chromosome : 7 with Freqency(%) 1.3488372093023255
Chromosome : 8 with Freqency(%) 10.232558139534884
Chromosome : 9 with Freqency(%) 1.6744186046511629
Chromosome : 10 with Freqency(%) 0.9767441860465116
Chromosome : 11 with Freqency(%) 3.953488372093023
Chromosome : 12 with Freqency(%) 1.1627906976744187
Chromosome : 13 with Freqency(%) 2.2325581395348837
Chromosome : 14 with Freqency(%) 1.441860465116279
Chromosome : 15 with Freqency(%) 0.8372093023255814
Chromosome : 16 with Freqency(%) 1.069767441860465
Chromosome : 17 with Freqency(%) 1.6744186046511629
Chromosome : 18 with Freqency(%) 0.6976744186046512
Chromosome : 19 with Freqency(%) 1.8604651162790697
Chromosome : 20 with Freq

# 3 Deletions

In [9]:
# Rule 3 (deletions): build one combined loss column per chromosome
# (del_<chr> = del_<chr>p + del_<chr>q + minus<chr>, without the p arm where no such
# column is used), report its frequency, keep the columns reaching the threshold and
# binarise them.
min_freq_pct = 2  # rule 4: keep events with frequency >= 2%

# Keyword `axis=` / `columns=`: the positional forms are not accepted by recent pandas.
tmp_del = pd.concat(
    [df_w_transloc.loc[:, 'del_10p':'del_xq'], df_w_transloc.loc[:, 'minus1':'minusy']],
    axis=1,
)
tmp_del = tmp_del.drop(columns=['minus7.'])

list_p_missing = ['4', '14', '20', '21', '22']
list_with_p = ['1', '2', '3', '5', '6', '7', '8', '9', '10', '11', '12', '13', '15', '16', '17', '18', '19']

# (chromosome, arms to sum), in the order the columns are reported and kept.
arms_by_chrom = (
    [(chrom, ('q',)) for chrom in list_p_missing]
    + [(chrom, ('p', 'q')) for chrom in list_with_p]
    + [('x', ('q',))]
)

to_keep = []
for chrom, arms in arms_by_chrom:
    col = 'del_' + chrom
    source_cols = [col + arm for arm in arms] + ['minus' + chrom]
    # skipna=False: a NaN in any source column gives NaN, as with `+`.
    tmp_del[col] = tmp_del[source_cols].sum(axis=1, skipna=False)
    # NOTE(review): the frequency is computed on the raw sum, before the values are
    # capped at 1, so a sample with several events on the same chromosome is counted
    # more than once -- confirm this is the intended definition of "frequency".
    freq = 100 * tmp_del[col].sum(skipna=False) / tmp_del.shape[0]
    print('Chromosome : ' + chrom + ' with Freqency(%)', freq)
    if freq >= min_freq_pct:
        to_keep.append(col)

# Chromosome y only has the whole-chromosome loss column.
freq = 100 * tmp_del['minusy'].sum(skipna=False) / tmp_del.shape[0]
print('Chromosome : y with Freqency(%)', freq)
if freq >= min_freq_pct:
    to_keep.append('minusy')

print()
print('We keep : ' + str(to_keep))
# Binarise: any value above 1 becomes 1. clip() returns a new frame, which avoids
# the chained assignment on a column subset.
tmp_del = tmp_del[to_keep].clip(upper=1)

Chromosome : 4 with Freqency(%) 1.3953488372093024
Chromosome : 14 with Freqency(%) 0.9767441860465116
Chromosome : 20 with Freqency(%) 2.744186046511628
Chromosome : 21 with Freqency(%) 1.5813953488372092
Chromosome : 22 with Freqency(%) 1.2093023255813953
Chromosome : 1 with Freqency(%) 0.6046511627906976
Chromosome : 2 with Freqency(%) 0.9767441860465116
Chromosome : 3 with Freqency(%) 2.13953488372093
Chromosome : 5 with Freqency(%) 7.162790697674419
Chromosome : 6 with Freqency(%) 1.2093023255813953
Chromosome : 7 with Freqency(%) 8.883720930232558
Chromosome : 8 with Freqency(%) 0.5116279069767442
Chromosome : 9 with Freqency(%) 3.0232558139534884
Chromosome : 10 with Freqency(%) 0.6511627906976745
Chromosome : 11 with Freqency(%) 1.6744186046511629
Chromosome : 12 with Freqency(%) 2.7906976744186047
Chromosome : 13 with Freqency(%) 2.186046511627907
Chromosome : 15 with Freqency(%) 1.1162790697674418
Chromosome : 16 with Freqency(%) 2.3255813953488373
Chromosome : 17 with Freqen

# 5 Translocations

In [12]:
# Rule 5 (translocations): keep the translocation columns (and `complex`) seen at
# least twice, and collapse the translocations seen fewer than two times into a
# single binary `others_transloc` column.
# This cell must define `tmp_others`: the final assembly cell joins it, and with the
# logic commented out the notebook fails on a fresh kernel.
cyto_events = df_w_transloc.loc[:, 't_v_11':'complex']
event_counts = cyto_events.sum()

is_transloc = event_counts.index.str.contains("t_")
is_complex = event_counts.index == "complex"
transloc_keep = event_counts[(event_counts >= 2) & (is_transloc | is_complex)].index
transloc_to_sum = event_counts[(event_counts < 2) & is_transloc].index

# .copy() so that adding a column does not write into a slice of df_w_transloc.
tmp_others = cyto_events.loc[:, transloc_keep].copy()
# Per-row sum of the rare translocations, capped at 1.
tmp_others['others_transloc'] = cyto_events.loc[:, transloc_to_sum].sum(axis=1).clip(upper=1)

# Frame shown in the next cell: every cyto column plus a final 'count' row holding
# the column totals.
transloc = cyto_events.copy()
transloc.loc['count'] = event_counts

In [13]:
# Display the cyto/translocation frame built in the previous cell; its last row,
# labelled 'count', holds the per-column totals.
transloc

Unnamed: 0,t_v_11,add_10p,add_10q,add_11p,add_11q,add_12p,add_12q,add_13p,add_13q,add_14p,...,t_9_22,t_x_1,t_x_12,t_x_15,t_x_16,t_x_21,t_x_5,t_x_y,wholedup,complex
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 6 Check frequency of inversions

In [44]:
# Frequency (in % of rows) of every inversion column.
# The result is kept as its own Series instead of being written as an extra row into
# a slice of df_w_transloc, and the unrelated `transloc` frame is left untouched.
inv_cols = [col for col in df_w_transloc.columns if col.startswith('inv')]
inv_freq = (100 * df_w_transloc[inv_cols].sum() / df_w_transloc.shape[0]).rename('count')
inv_freq

inv_1     0.186047
inv_10    0.139535
inv_11    0.139535
inv_12    0.046512
inv_16    4.372093
inv_17    0.046512
inv_2     0.093023
inv_3     1.023256
inv_4     0.046512
inv_5     0.093023
inv_6     0.046512
inv_7     0.232558
inv_8     0.186047
inv_9     0.139535
inv_x     0.046512
Name: count, dtype: float64

# 4 All together in a dataframe with other features

In [45]:
# Assemble the final table step by step, printing the shape after each join.
# All joins align on the row index.
ids_and_genes = df_w_transloc.loc[:, 'data_pd':'ZRSR2']  # id columns + genomic features
print(ids_and_genes.shape)

with_additions = ids_and_genes.join(tmp_add)  # + addition features
print(with_additions.shape)

with_deletions = with_additions.join(tmp_del)  # + deletion features
print(with_deletions.shape)

df_modif_final = with_deletions.join(tmp_others)  # + translocation features
print('Final df for clustering dimensions :' + str(df_modif_final.shape))

(2150, 85)
(2150, 90)
(2150, 101)
Final df for clustering dimensions :(2150, 153)


#### Let's look at the types of the variables to make it uniform 

In [46]:
# Make the feature dtypes uniform: every column after the two leading id columns
# is cast to int.
print(df_modif_final.dtypes.unique())
# astype() with a column -> dtype mapping returns a frame whose columns really carry
# the new dtype. Assigning through `.iloc[:, 2:] = ...` writes the values into the
# existing columns on recent pandas, which leaves float columns as float64.
feature_cols = df_modif_final.columns[2:]
df_modif_final = df_modif_final.astype({col: int for col in feature_cols})
print(df_modif_final.dtypes.unique())

[dtype('O') dtype('int64') dtype('float64')]
[dtype('O') dtype('int64')]


## Create the csv file

In [226]:
# Export is disabled: uncomment the line below to write the final table to disk.
# As written, to_csv would also write the row index as the first CSV column.
# df_modif_final.to_csv("../../../data/updated_dataset/modif_final.csv",sep=",")