In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import csv
from sklearn.preprocessing import LabelEncoder


Load the training and test datasets into DataFrames

In [3]:
# Read the train and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Preview the first five rows of the freshly loaded training data.
train_df.head(n=5)

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,ddi_scale
0,fd06d13de341cc75ad679916c5d7e6a6,4,4,prurigo-nodularis,benign-epidermal,benign,,34
1,a4bb4e5206c4e89a303f470576fc5253,1,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,,12
2,c94ce27e389f96bda998e7c3fa5c4a2e,5,5,keloid,inflammatory,non-neoplastic,1 Diagnostic,56
3,ebcf2b50dd943c700d4e2b586fcd4425,3,3,basal-cell-carcinoma,malignant-epidermal,malignant,,34
4,c77d6c895f05fea73a8f3704307036c0,1,1,prurigo-nodularis,benign-epidermal,benign,,12


In [5]:
# Summary statistics of the numeric columns. Leaving the DataFrame as the
# cell's last expression lets Jupyter render it as a table instead of
# printing its plain-text repr.
train_df.describe()

       fitzpatrick_scale  fitzpatrick_centaur    ddi_scale
count        2860.000000          2860.000000  2860.000000
mean            2.524476             2.095455    23.547552
std             1.474428             1.510942    15.530522
min            -1.000000            -1.000000    -1.000000
25%             2.000000             1.000000    12.000000
50%             2.000000             2.000000    12.000000
75%             3.000000             3.000000    34.000000
max             6.000000             6.000000    56.000000


In [6]:
# Count missing values per column. The bare expression is displayed as the
# cell output, so wrapping it in print() is unnecessary.
train_df.isnull().sum()

md5hash                     0
fitzpatrick_scale           0
fitzpatrick_centaur         0
label                       0
nine_partition_label        0
three_partition_label       0
qc                       2770
ddi_scale                   0
dtype: int64


Since we have the professional version of the Fitzpatrick skin type (FST) scale in `fitzpatrick_centaur`, we can use it to replace `fitzpatrick_scale`, whose values may be missing or not professionally annotated

In [7]:
# Overwrite every fitzpatrick_scale value with the corresponding
# fitzpatrick_centaur value, in both the training and test frames.
train_df = train_df.assign(fitzpatrick_scale=train_df['fitzpatrick_centaur'])
test_df = test_df.assign(fitzpatrick_scale=test_df['fitzpatrick_centaur'])

Drop the `ddi_scale` column from both datasets

In [8]:
# Remove the ddi_scale column from both splits.
train_df = train_df.drop('ddi_scale', axis=1)
test_df = test_df.drop('ddi_scale', axis=1)


As mentioned, some rows are flagged as wrongly labelled in the `qc` column, so I am going to remove those rows

In [9]:
# Keep every row whose qc flag is anything other than '3 Wrongly labelled'.
# Rows with a missing qc value compare as not-equal, so they are kept as well.
train_df = train_df.loc[train_df['qc'].ne('3 Wrongly labelled')]
test_df = test_df.loc[test_df['qc'].ne('3 Wrongly labelled')]

In [10]:
# Re-check the summary statistics after the column replacement and row
# filtering. The DataFrame is left as the last expression so Jupyter renders
# it as a table rather than printing its plain-text repr.
train_df.describe()

       fitzpatrick_scale  fitzpatrick_centaur
count        2856.000000          2856.000000
mean            2.093137             2.093137
std             1.509251             1.509251
min            -1.000000            -1.000000
25%             1.000000             1.000000
50%             2.000000             2.000000
75%             3.000000             3.000000
max             6.000000             6.000000


In [11]:
# Sanity check: list the distinct 'qc' values left in each split so we can
# confirm that '3 Wrongly labelled' no longer appears.
print("Unique 'qc' values in training data after removal:", pd.unique(train_df['qc']))
print("Unique 'qc' values in test data after removal:", pd.unique(test_df['qc']))

Unique 'qc' values in training data after removal: [nan '1 Diagnostic' '5 Potentially' '4 Other' '2 Characteristic']
Unique 'qc' values in test data after removal: [nan '1 Diagnostic' '5 Potentially' '2 Characteristic']


Convert the text labels to numerical values

In [12]:
# Learn the label -> integer mapping from the training labels, then store the
# encoded values in a new column alongside the original text label.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label'])
train_df['label_numerical'] = label_encoder.transform(train_df['label'])

In [13]:
# Preview the first five rows to inspect the new label_numerical column.
train_df.head(n=5)

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,label_numerical
0,fd06d13de341cc75ad679916c5d7e6a6,4,4,prurigo-nodularis,benign-epidermal,benign,,16
1,a4bb4e5206c4e89a303f470576fc5253,1,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,,4
2,c94ce27e389f96bda998e7c3fa5c4a2e,5,5,keloid,inflammatory,non-neoplastic,1 Diagnostic,12
3,ebcf2b50dd943c700d4e2b586fcd4425,3,3,basal-cell-carcinoma,malignant-epidermal,malignant,,3
4,c77d6c895f05fea73a8f3704307036c0,1,1,prurigo-nodularis,benign-epidermal,benign,,16


We will also convert the nine-partition and three-partition labels to numerical values

In [14]:
# Use a dedicated encoder for each partition column. Re-fitting the shared
# `label_encoder` here would overwrite its fitted classes, so it could no
# longer map `label_numerical` codes back to the original `label` strings.
# The integer codes produced are the same as before; only the encoder objects
# that remember each mapping differ.
nine_partition_encoder = LabelEncoder()
three_partition_encoder = LabelEncoder()
train_df['nine_partition_numerical'] = nine_partition_encoder.fit_transform(train_df['nine_partition_label'])
train_df['three_partition_numerical'] = three_partition_encoder.fit_transform(train_df['three_partition_label'])



In [15]:
# Count missing values per column after adding the encoded columns. The bare
# expression is displayed as the cell output, so print() is unnecessary.
train_df.isnull().sum()

md5hash                         0
fitzpatrick_scale               0
fitzpatrick_centaur             0
label                           0
nine_partition_label            0
three_partition_label           0
qc                           2770
label_numerical                 0
nine_partition_numerical        0
three_partition_numerical       0
dtype: int64


In [16]:
# Preview the first five rows to inspect the encoded partition columns.
train_df.head(n=5)

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,label,nine_partition_label,three_partition_label,qc,label_numerical,nine_partition_numerical,three_partition_numerical
0,fd06d13de341cc75ad679916c5d7e6a6,4,4,prurigo-nodularis,benign-epidermal,benign,,16,1,0
1,a4bb4e5206c4e89a303f470576fc5253,1,1,basal-cell-carcinoma-morpheiform,malignant-epidermal,malignant,,4,5,1
2,c94ce27e389f96bda998e7c3fa5c4a2e,5,5,keloid,inflammatory,non-neoplastic,1 Diagnostic,12,2,2
3,ebcf2b50dd943c700d4e2b586fcd4425,3,3,basal-cell-carcinoma,malignant-epidermal,malignant,,3,5,1
4,c77d6c895f05fea73a8f3704307036c0,1,1,prurigo-nodularis,benign-epidermal,benign,,16,1,0


Drop the original text columns that we label-encoded

In [17]:
# Remove the three text label columns from the training frame; their
# *_numerical counterparts remain.
train_df = train_df.drop(columns=['label', 'three_partition_label', 'nine_partition_label'])

In [18]:
# Preview the first five rows to confirm the text label columns are gone.
train_df.head(n=5)

Unnamed: 0,md5hash,fitzpatrick_scale,fitzpatrick_centaur,qc,label_numerical,nine_partition_numerical,three_partition_numerical
0,fd06d13de341cc75ad679916c5d7e6a6,4,4,,16,1,0
1,a4bb4e5206c4e89a303f470576fc5253,1,1,,4,5,1
2,c94ce27e389f96bda998e7c3fa5c4a2e,5,5,1 Diagnostic,12,2,2
3,ebcf2b50dd943c700d4e2b586fcd4425,3,3,,3,5,1
4,c77d6c895f05fea73a8f3704307036c0,1,1,,16,1,0
