**SMOTE oversampling on unsegmented dermoscopic images**

In [None]:
import numpy as np
import cv2
import pandas as pd
import csv
from google.colab.patches import cv2_imshow
import math
from PIL import Image
import glob
import os
from sklearn.metrics import *
from zipfile import ZipFile
from collections import *
from imblearn.over_sampling import SMOTE

In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): later cells use `root_path`, which is not defined in any
# visible cell -- confirm where it is set before running on a fresh kernel.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Preparing X_train and Y_train

In [None]:
# Collect the names of all .jpg files in the processed-image folder, in
# sorted order, then return the working directory to the project root.
os.chdir(root_path+'/Unbalanced_Train_Processed')
images_pro = sorted(glob.glob('*.jpg'))
os.chdir(root_path)

In [None]:
# Sanity check: number of .jpg files found in the processed-image folder.
print(len(images_pro))

25331


**Preparing Y_train**

In [None]:
print("Preparing the Y_train....")

# Class columns of the ground-truth CSV, in the order they are checked.
CLASS_NAMES = ['MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC']

# Ground-truth table: an `image` column plus one indicator column per class.
df = pd.read_csv("ISIC_2019_Training_GroundTruth.csv")

# Build an image-name -> label map in a single pass so that each lookup below
# is a dict access rather than a linear list.index() scan per image.
label_by_name = {}
for name, *flags in zip(df['image'], *(df[cls] for cls in CLASS_NAMES)):
    if name in label_by_name:
        continue  # a repeated name keeps its first row, like list.index()
    # First class whose indicator equals 1, or None when no class is flagged.
    label_by_name[name] = next(
        (cls for cls, flag in zip(CLASS_NAMES, flags) if flag == 1), None
    )

Y_train = []
unlabelled = []
for file_name in images_pro:
    label = label_by_name.get(file_name[:-4])  # drop the '.jpg' extension
    if label is None:
        unlabelled.append(file_name)
    else:
        Y_train.append(label)

# X_train is later built from images_pro in the same order, so a skipped
# image would silently shift every following label. Fail loudly instead.
if unlabelled:
    raise ValueError(
        "%d image(s) have no label in the ground-truth CSV, e.g. %s"
        % (len(unlabelled), unlabelled[:5])
    )

Preparing the Y_train....


In [None]:
# Persist the labels as plain text, one label per line.
with open(root_path+'/Y_train.txt', mode="w") as label_file:
    for label in Y_train:
        label_file.write("%s\n" % label)

In [None]:
# Reload the labels from Y_train.txt, one label per line.
# rstrip('\n') removes only a trailing newline, so a final line without one
# is kept intact (slicing with [:-1] would cut off its last character).
with open(root_path+'/Y_train.txt', 'r') as filehandle:
    Y_train = [line.rstrip('\n') for line in filehandle]

**Preparing the X_train**

In [None]:
# Work inside the processed-image folder so that the bare file names in
# images_pro can be opened directly by the next cell.
os.chdir(root_path+'/Unbalanced_Train_Processed')

In [None]:
print("Preparing the X_train....")

# Read every image listed in images_pro (relative to the current working
# directory) and stack them into one array, in the same order as images_pro.
X = []
total = len(images_pro)
for i, file_name in enumerate(images_pro):
    image = cv2.imread(file_name)
    # cv2.imread signals failure by returning None instead of raising; stop
    # here rather than letting a None end up inside the training array.
    if image is None:
        raise OSError("Could not read image %r (index %d)" % (file_name, i))
    X.append(image)
    # Report progress occasionally instead of printing one line per image.
    if (i + 1) % 1000 == 0 or i + 1 == total:
        print("Loaded %d/%d images" % (i + 1, total))
X_train = np.array(X)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
(20331, 'ISIC_0065355.jpg')
(20332, 'ISIC_0065357.jpg')
(20333, 'ISIC_0065358.jpg')
(20334, 'ISIC_0065359.jpg')
(20335, 'ISIC_0065360.jpg')
(20336, 'ISIC_0065361.jpg')
(20337, 'ISIC_0065362.jpg')
(20338, 'ISIC_0065366.jpg')
(20339, 'ISIC_0065367.jpg')
(20340, 'ISIC_0065368.jpg')
(20341, 'ISIC_0065370.jpg')
(20342, 'ISIC_0065371.jpg')
(20343, 'ISIC_0065372.jpg')
(20344, 'ISIC_0065374.jpg')
(20345, 'ISIC_0065376.jpg')
(20346, 'ISIC_0065377.jpg')
(20347, 'ISIC_0065378.jpg')
(20348, 'ISIC_0065379.jpg')
(20349, 'ISIC_0065380.jpg')
(20350, 'ISIC_0065382.jpg')
(20351, 'ISIC_0065383.jpg')
(20352, 'ISIC_0065384.jpg')
(20353, 'ISIC_0065385.jpg')
(20354, 'ISIC_0065387.jpg')
(20355, 'ISIC_0065389.jpg')
(20356, 'ISIC_0065390.jpg')
(20357, 'ISIC_0065391.jpg')
(20358, 'ISIC_0065392.jpg')
(20359, 'ISIC_0065393.jpg')
(20360, 'ISIC_0065394.jpg')
(20361, 'ISIC_0065396.jpg')
(20362, 'ISIC_0065397.jpg')
(20363, 'ISIC_0065398.jpg')
(20364, 'IS

In [None]:
# Write the stacked image array to X_train.npy so it can be reloaded from disk.
np.save(root_path+'/X_train.npy', X_train)

In [None]:
# Reload the image array from X_train.npy.
X_train= np.load(root_path+'/X_train.npy')

In [None]:
# Flatten every image into one feature row (n_images, n_values_per_image),
# the 2-D layout that is passed to SMOTE. The row count is taken from the
# array itself and the row width is inferred, so nothing is hard-coded.
print("Reshaping X_train...")
ReX_train = X_train.reshape(len(X_train), -1)
ReX_train.shape

Reshaping X_train...


(25331, 196608)

# SMOTE Oversampling

In [None]:
# Target number of samples per class after oversampling. 'NV' is not listed,
# so it is not given a target here.
strategy = {
    'AK': 7000,
    'MEL': 7000,
    'BCC': 7000,
    'BKL': 7000,
    'DF': 7000,
    'VASC': 7000,
    'SCC': 7000,
}
print("Performing oversampling of the minority classes....")
# Fixed random_state so the synthetic samples are reproducible.
smote = SMOTE(random_state=0, sampling_strategy=strategy)
X_resampled, y_resampled = smote.fit_resample(ReX_train, Y_train)

# The message names the technique actually applied (plain SMOTE).
print("Number of samples in each class after balancing dataset using SMOTE: ")
print(sorted(Counter(y_resampled).items()))

Performing oversampling of the minority classes....
Number of samples in each class after balancing dataset using smotetomek: 
[('AK', 7000), ('BCC', 7000), ('BKL', 7000), ('DF', 7000), ('MEL', 7000), ('NV', 12875), ('SCC', 7000), ('VASC', 7000)]


In [None]:
# Undo the earlier flattening: turn each feature row back into a
# 256 x 256 x 3 image; the leading dimension is inferred.
X_resampled = X_resampled.reshape((-1, 256, 256, 3))

In [None]:
# Sanity check: shape of the resampled image array after un-flattening.
print(X_resampled.shape)

(61875, 256, 256, 3)


In [None]:
# Write the resampled image array to X_resampled.npy so it can be reloaded from disk.
np.save(root_path+'/X_resampled.npy', X_resampled)

In [None]:
# Reload the resampled image array from X_resampled.npy.
X_resampled= np.load(root_path+'/X_resampled.npy')

In [None]:
# Persist the resampled labels as plain text, one label per line.
with open(root_path+'/y_resampled.txt', mode="w") as label_file:
    for label in y_resampled:
        label_file.write("%s\n" % label)

In [None]:
# Reload the resampled labels from y_resampled.txt, one label per line.
# rstrip('\n') removes only a trailing newline, so a final line without one
# is kept intact (slicing with [:-1] would cut off its last character).
with open(root_path+'/y_resampled.txt', 'r') as filehandle:
    y_resampled = [line.rstrip('\n') for line in filehandle]

In [None]:
# Sanity check: number of labels read back from y_resampled.txt.
print(len(y_resampled))

61875


In [None]:
# Create the top-level output folder for the resampled images under root_path.
os.chdir(root_path)
print("Creating folder for resampled images....")

# exist_ok=True keeps this cell re-runnable: os.mkdir would raise
# FileExistsError if the folder was already created by an earlier run.
os.makedirs('training_balanced', exist_ok=True)


Creating folder for resampled images....


# Sampling images

**Creating the oversampled images**

**Separating the classes into different folders for using in the classification model**

In [None]:
# Create one sub-folder per class inside training_balanced; the resampled
# images are later written into the folder matching their label.

print("Creating folders for the new sampled class samples....")

os.chdir(root_path+'/training_balanced')

# exist_ok=True keeps this cell re-runnable when the folders already exist.
for class_name in ('MEL', 'NV', 'BCC', 'AK', 'BKL', 'DF', 'VASC', 'SCC'):
    os.makedirs(class_name, exist_ok=True)

Creating folders for the new sampled class samples....


In [None]:
# Switch into the folder that receives a flat copy of every resampled image
# (the next cell writes files relative to the working directory).
# NOTE(review): this path is relative, and the preceding cell leaves the
# working directory at root_path/training_balanced; no visible cell creates
# 'Balanced_train' -- confirm the intended location and that it exists.
os.chdir('Balanced_train')

In [None]:
print("Creating the newly sampled images.....")

# Write every resampled image twice: once into the current working directory
# and once into the sub-folder of training_balanced named after its label.
total = len(y_resampled)
for i in range(total):
    str_name = 'ICIS_resampled_'+str(i+1) +'.jpg'
    class_path = root_path+'/training_balanced/'+y_resampled[i]+'/'+str_name
    for target in (str_name, class_path):
        # cv2.imwrite reports failure through its boolean return value;
        # check it so that a failed write does not pass unnoticed.
        if not cv2.imwrite(target, X_resampled[i]):
            raise OSError("Failed to write image %r (index %d)" % (target, i))
    # Report progress occasionally instead of printing one line per image.
    if (i + 1) % 1000 == 0 or i + 1 == total:
        print("Wrote %d/%d images" % (i + 1, total))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
56875
56876
56877
56878
56879
56880
56881
56882
56883
56884
56885
56886
56887
56888
56889
56890
56891
56892
56893
56894
56895
56896
56897
56898
56899
56900
56901
56902
56903
56904
56905
56906
56907
56908
56909
56910
56911
56912
56913
56914
56915
56916
56917
56918
56919
56920
56921
56922
56923
56924
56925
56926
56927
56928
56929
56930
56931
56932
56933
56934
56935
56936
56937
56938
56939
56940
56941
56942
56943
56944
56945
56946
56947
56948
56949
56950
56951
56952
56953
56954
56955
56956
56957
56958
56959
56960
56961
56962
56963
56964
56965
56966
56967
56968
56969
56970
56971
56972
56973
56974
56975
56976
56977
56978
56979
56980
56981
56982
56983
56984
56985
56986
56987
56988
56989
56990
56991
56992
56993
56994
56995
56996
56997
56998
56999
57000
57001
57002
57003
57004
57005
57006
57007
57008
57009
57010
57011
57012
57013
57014
57015
57016
57017
57018
57019
57020
57021
57022
57023
57024
57025
57026
57027
57028
57029
57030