In [38]:
import pandas as pd
import numpy as np

from os import listdir
from os.path import isfile, join

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

import tensorflow as tf, re, math
import tensorflow.keras.backend as K
import efficientnet.tfkeras as efn

import matplotlib.pyplot as plt
%matplotlib inline

In [39]:
import pandas as pd
import numpy as np
import itertools
import time

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings 
warnings.simplefilter('ignore')

In [40]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


In [41]:
# Images Example
train_images_dir = r'C:\Users\HIMANSHU\Downloads\Melanoma jpg\train'
train_images = [f for f in listdir(train_images_dir) if isfile(join(train_images_dir, f))]

test_images_dir = r'C:\Users\HIMANSHU\Downloads\Melanoma jpg\test'
test_images = [f for f in listdir(test_images_dir) if isfile(join(test_images_dir, f))]

print('5 Training images', train_images[:5]) # Print the first 5

5 Training images ['ISIC_0015719.jpg', 'ISIC_0052212.jpg', 'ISIC_0068279.jpg', 'ISIC_0074268.jpg', 'ISIC_0074311.jpg']


In [42]:
train = pd.read_csv(r'C:\Users\HIMANSHU\Downloads\Melanoma jpg\train.csv')
test = pd.read_csv(r'C:\Users\HIMANSHU\Downloads\Melanoma jpg\test.csv')

In [43]:
train.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053
3,ISIC_0068279,IP_6890425,female,45.0,head/neck,unknown,benign,0,0,1872,1053
4,ISIC_0074268,IP_8723313,female,55.0,upper extremity,unknown,benign,0,11,6000,4000


In [44]:
test.head()

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,width,height
0,ISIC_0052060,IP_3579794,male,70.0,,6000,4000
1,ISIC_0052349,IP_7782715,male,40.0,lower extremity,6000,4000
2,ISIC_0058510,IP_7960270,female,55.0,torso,6000,4000
3,ISIC_0073313,IP_6375035,female,50.0,torso,6000,4000
4,ISIC_0073502,IP_0589375,female,45.0,lower extremity,1920,1080


In [45]:
train.shape

(33126, 11)

In [47]:
# Filling missing anatom site values with 'unknown' tag:

for df in [train, test]:
    df['anatom_site_general_challenge'].fillna('unknown', inplace=True)

In [49]:
# Filling age and sex with appropriate values.

train['sex'].fillna(train['sex'].mode()[0], inplace=True)

train['age_approx'].fillna(train['age_approx'].median(), inplace=True)

In [50]:
# Checking missing value counts:

print(
    f'Train missing value count: {train.isnull().sum().sum()}\nTest missing value count: {train.isnull().sum().sum()}'
)

Train missing value count: 0
Test missing value count: 0


In [51]:
# getting dummy variables for gender on train set

sex_dummies = pd.get_dummies(train['sex'], prefix='sex')
train = pd.concat([train, sex_dummies], axis=1)

# getting dummy variables for gender on test set

sex_dummies = pd.get_dummies(test['sex'], prefix='sex')
test = pd.concat([test, sex_dummies], axis=1)

# dropping not useful columns

train.drop(['sex', 'image_name','patient_id','diagnosis','benign_malignant'], axis=1, inplace=True)
test.drop(['sex', 'image_name','patient_id'], axis=1, inplace=True)

In [52]:
# getting dummy variables for location on train set

anatom_dummies = pd.get_dummies(train['anatom_site_general_challenge'], prefix='anatom')
train = pd.concat([train, anatom_dummies], axis=1)

# getting dummy variables for location on test set

anatom_dummies = pd.get_dummies(test['anatom_site_general_challenge'], prefix='anatom')
test = pd.concat([test, anatom_dummies], axis=1)

# dropping not useful columns

train.drop('anatom_site_general_challenge', axis=1, inplace=True)
test.drop('anatom_site_general_challenge', axis=1, inplace=True)

In [53]:
train.head()

Unnamed: 0,age_approx,target,tfrecord,width,height,sex_female,sex_male,anatom_head/neck,anatom_lower extremity,anatom_oral/genital,anatom_palms/soles,anatom_torso,anatom_unknown,anatom_upper extremity
0,45.0,0,0,6000,4000,0,1,1,0,0,0,0,0,0
1,45.0,0,0,6000,4000,1,0,0,0,0,0,0,0,1
2,50.0,0,6,1872,1053,1,0,0,1,0,0,0,0,0
3,45.0,0,0,1872,1053,1,0,1,0,0,0,0,0,0
4,55.0,0,11,6000,4000,1,0,0,0,0,0,0,0,1


In [54]:
# Splitting of data for model
# X - Predictor variable
# y - Target variable

X = train.iloc[:,:]
y = train[['target']]
X.drop(columns=['target'], inplace=True)

In [55]:
X.head()

Unnamed: 0,age_approx,tfrecord,width,height,sex_female,sex_male,anatom_head/neck,anatom_lower extremity,anatom_oral/genital,anatom_palms/soles,anatom_torso,anatom_unknown,anatom_upper extremity
0,45.0,0,6000,4000,0,1,1,0,0,0,0,0,0
1,45.0,0,6000,4000,1,0,0,0,0,0,0,0,1
2,50.0,6,1872,1053,1,0,0,1,0,0,0,0,0
3,45.0,0,1872,1053,1,0,1,0,0,0,0,0,0
4,55.0,11,6000,4000,1,0,0,0,0,0,0,0,1


In [56]:
X.describe()

Unnamed: 0,age_approx,tfrecord,width,height,sex_female,sex_male,anatom_head/neck,anatom_lower extremity,anatom_oral/genital,anatom_palms/soles,anatom_torso,anatom_unknown,anatom_upper extremity
count,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0,33126.0
mean,48.872336,6.894826,4039.630381,2678.758256,0.482431,0.517569,0.055998,0.25409,0.003743,0.01132,0.508513,0.015909,0.150426
std,14.365684,4.390486,2109.14559,1437.617626,0.499699,0.499699,0.229922,0.435355,0.061069,0.105795,0.499935,0.125125,0.357494
min,0.0,-1.0,640.0,480.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,3.0,1872.0,1053.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,50.0,7.0,5184.0,3456.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,60.0,11.0,6000.0,4000.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
max,90.0,14.0,6000.0,6000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [57]:
# Splitting into train and test values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
print('X_train shape: {}'.format(X_train.shape))
print('X_test shape: {}'.format(X_test.shape))

X_train shape: (26500, 13)
X_test shape: (6626, 13)


In [58]:
# This is a classification task
# We will use the following models

classifiers = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVC': SVC(),
    'MLPClassifier': MLPClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier()
}

In [59]:
# Function to train model and do predictions
# We want a model with high recall as to detect outliers - Fraud transcations

precision = []
recall = []

for name, clf in classifiers.items():
    start = time.time()
    name = clf.fit(X_train, y_train)
    end = time.time()
    y_pred = name.predict(X_test)
    
    print('****************************************************************************************')
    print('\nModel: {}'.format(name))
    print('\nTime taken: {:.2f}min'.format((end-start)/60))
    print('\nTrainig Accuracy: {:.2f}%'.format(name.score(X_train, y_train)*100))
    print('\nTest Accuracy: {:.2f}%'.format(name.score(X_test, y_test)*100))
    #print('\nConfusion Matrix: \n')
    #print(confusion_matrix(y_test, y_pred))
    print('\nPrecision Score: {:.3f}'.format(precision_score(y_test, y_pred)))
    print('\nRecall Score: {:.3f}'.format(recall_score(y_test, y_pred)))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    print('\n')
    print('*****************************************************************************************')
    
# i = recall.index(max(recall))
# print('\nModel with best recall: {}'.format(classifiers[i]))
# print('\nRecall: {}, Precision: {}'.format(recall[i], precision[i]))

****************************************************************************************

Model: LogisticRegression()

Time taken: 0.01min

Trainig Accuracy: 98.23%

Test Accuracy: 98.19%

Precision Score: 0.000

Recall Score: 0.000


*****************************************************************************************
****************************************************************************************

Model: RandomForestClassifier()

Time taken: 0.02min

Trainig Accuracy: 98.92%

Test Accuracy: 97.93%

Precision Score: 0.235

Recall Score: 0.067


*****************************************************************************************
****************************************************************************************

Model: AdaBoostClassifier()

Time taken: 0.01min

Trainig Accuracy: 98.25%

Test Accuracy: 98.19%

Precision Score: 0.000

Recall Score: 0.000


*****************************************************************************************
*********************

In [60]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.1.0


In [61]:
len(train_images)

33126

In [62]:
len(test_images)

10982

In [63]:
import cv2

img = cv2.imread(r'C:\Users\HIMANSHU\Downloads\Melanoma jpg\train\%s.jpg'%k)
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
plt.subplot(2,5,i+1); plt.axis('off')
plt.imshow(img)
plt.show()

NameError: name 'k' is not defined