# **Skin Cancer 'HAM_10000' dataset metadata EDA**

In [37]:
# import the libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import ydata_profiling as yd

In [38]:
# load the dataset 
df = pd.read_csv('HAM10000_metadata.csv')

In [39]:
# EDA with the ydata_profiling library
profile = yd.ProfileReport(df)
profile.to_file('EDA_of_skinCancer_metadata.html')

Summarize dataset: 100%|██████████| 17/17 [00:00<00:00, 21.68it/s, Completed]                    
Generate report structure: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  2.52it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 151.41it/s]


In [40]:
# view the dataset
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
dtypes: float64(1), object(6)
memory usage: 547.8+ KB


In [42]:
# checking for the duplicate values
df[df.duplicated()]

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization


### There is no Duplicate value in this metadata

In [43]:
columns =df.columns
for i in columns:
    print(i)

lesion_id
image_id
dx
dx_type
age
sex
localization


We know that the null values only present in the age column so we will resolve this issue also

In [44]:
df['age'].isnull().sum()

57

In [45]:
df['age'].fillna(df['age'].mean(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace = True)


Now all the null values ara replace with the mean of this age column

In [46]:
df['age'].isnull().sum()

0

### Total values of the uniques values

In [47]:
df['lesion_id'].nunique()   # .nunique() return the number of unique values in the series

7470

In [None]:
df['image_id'].nunique() 

10015

In [49]:
df['dx'].unique()

array(['bkl', 'nv', 'df', 'mel', 'vasc', 'bcc', 'akiec'], dtype=object)

In [50]:
len(df[df['dx'] == 'bkl'])

1099

In [51]:
len(df[df['dx'] == 'nv'])

6705

In [52]:
df[df['dx'] == 'df'].shape[0]

115

In [62]:
df['dx'].value_counts()

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64

In [53]:
df[df['dx'] == 'mel']

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
1211,HAM_0000871,ISIC_0025964,mel,histo,40.0,female,chest
1212,HAM_0000871,ISIC_0030623,mel,histo,40.0,female,chest
1213,HAM_0000040,ISIC_0027190,mel,histo,80.0,male,upper extremity
1214,HAM_0005678,ISIC_0031023,mel,histo,60.0,male,chest
1215,HAM_0005678,ISIC_0028086,mel,histo,60.0,male,chest
...,...,...,...,...,...,...,...
2319,HAM_0001953,ISIC_0025611,mel,histo,65.0,male,back
6769,HAM_0002552,ISIC_0032985,mel,histo,25.0,male,upper extremity
8820,HAM_0002552,ISIC_0032936,mel,histo,25.0,male,upper extremity
8834,HAM_0002552,ISIC_0033232,mel,histo,25.0,male,upper extremity


In [54]:
df[df['dx'] == 'vasc']

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
2320,HAM_0006889,ISIC_0031197,vasc,histo,20.0,male,upper extremity
2321,HAM_0006889,ISIC_0031270,vasc,histo,20.0,male,upper extremity
2322,HAM_0001920,ISIC_0029486,vasc,histo,55.0,male,back
2323,HAM_0001920,ISIC_0031901,vasc,histo,55.0,male,back
2324,HAM_0005155,ISIC_0032076,vasc,histo,85.0,male,lower extremity
...,...,...,...,...,...,...,...
2457,HAM_0000415,ISIC_0025680,vasc,consensus,55.0,female,trunk
2458,HAM_0004413,ISIC_0026068,vasc,consensus,55.0,female,abdomen
2459,HAM_0004257,ISIC_0025452,vasc,consensus,55.0,female,abdomen
2460,HAM_0003829,ISIC_0026349,vasc,consensus,60.0,female,trunk


In [55]:
df[df['dx'] == 'bcc']

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
2462,HAM_0000781,ISIC_0028155,bcc,histo,50.0,male,back
2463,HAM_0003220,ISIC_0034093,bcc,histo,80.0,female,face
2464,HAM_0007141,ISIC_0029230,bcc,histo,50.0,male,trunk
2465,HAM_0007141,ISIC_0031513,bcc,histo,50.0,male,trunk
2466,HAM_0007009,ISIC_0032384,bcc,histo,50.0,male,back
...,...,...,...,...,...,...,...
2971,HAM_0001573,ISIC_0028542,bcc,histo,45.0,male,chest
2972,HAM_0001215,ISIC_0025260,bcc,histo,70.0,male,back
2973,HAM_0001215,ISIC_0031531,bcc,histo,70.0,male,back
2974,HAM_0005026,ISIC_0028978,bcc,histo,40.0,female,abdomen


In [56]:
df[df['dx'] == 'akiec']

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
9687,HAM_0002644,ISIC_0029417,akiec,histo,80.0,female,neck
9688,HAM_0006002,ISIC_0029915,akiec,histo,50.0,female,face
9689,HAM_0000549,ISIC_0029360,akiec,histo,70.0,male,upper extremity
9690,HAM_0000549,ISIC_0026152,akiec,histo,70.0,male,upper extremity
9691,HAM_0000673,ISIC_0029659,akiec,histo,70.0,female,face
...,...,...,...,...,...,...,...
10009,HAM_0005705,ISIC_0031430,akiec,histo,75.0,female,lower extremity
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen


In [57]:
df['dx_type'].value_counts()

dx_type
histo        5340
follow_up    3704
consensus     902
confocal       69
Name: count, dtype: int64

In [58]:
df['sex'].value_counts()

sex
male       5406
female     4552
unknown      57
Name: count, dtype: int64

In [59]:
df['localization'].value_counts()

localization
back               2192
lower extremity    2077
trunk              1404
upper extremity    1118
abdomen            1022
face                745
chest               407
foot                319
unknown             234
neck                168
scalp               128
hand                 90
ear                  56
genital              48
acral                 7
Name: count, dtype: int64

In [60]:
df.describe()

Unnamed: 0,age
count,10015.0
mean,51.863828
std,16.920252
min,0.0
25%,40.0
50%,50.0
75%,65.0
max,85.0
