In [36]:
import pandas as pd, numpy as np, random
from imblearn.over_sampling import SMOTE, KMeansSMOTE, SVMSMOTE, BorderlineSMOTE, ADASYN

# Seed Python's built-in RNG; the averaging-based augmentation cells draw from it.
random.seed(0)

# --- Averaging-based augmentation (each flag gates exactly one cell) ---
OVERSAMPLE = False          # balance neoplasia against NDBE
AUGMENT = False             # grow all three classes
AUGMENT_NEOPLASIA = False   # double the neoplasia class only

# --- imbalanced-learn oversamplers (listed in the order they are run) ---
DO_SMOTE = True
DO_SMOTE_KMEAN = True
DO_SMOTE_SVM = True
DO_SMOTE_BORDER = True
DO_SMOTE_ADASYN = True


# Augmentation

Augment the dataset by generating new neoplasia samples, each one the average of two randomly chosen neoplasia samples. In my opinion this is better than scaling records (think of the effect scaling would have on the spectra), but this can be tested!

I am augmenting using only the training set, to avoid information leakage.

I am only augmenting neoplasia since, judging by the evaluation metrics and the spectra, squamous tissue is already successfully (and easily) identified.

In [25]:
# Read the training records (header-less CSV) and their integer class labels.
data = pd.read_csv('original_data/noExclusion_train_data.csv', header=None)
data = data.reset_index(drop=True)

label = pd.read_csv(
    'original_data/noExclusion_train_label.csv',
    header=None,
    names=['label'],
)
label = label.astype(int)

# Attach the label column to the records by row position (inner join on the index).
combined = data.merge(label, left_index=True, right_index=True)

# One frame per class, each re-indexed from 0.
# Class sizes in this split -> 1: 137, 2: 257, 3: 178
class_of = combined['label']
squamous = combined[class_of == 1].reset_index(drop=True)
ndbe = combined[class_of == 2].reset_index(drop=True)
neoplasia = combined[class_of == 3].reset_index(drop=True)

In [26]:
# Augment dataset (balance NDBE and neoplasia).
#
# Neoplasia is grown to the size of the NDBE class. Every synthetic record is
# the element-wise mean of two randomly drawn REAL neoplasia records.
#
# NOTE: this cell rebinds `neoplasia` to the enlarged frame, so any later cell
# that reads `neoplasia` sees the synthetic rows too. Enable only one
# augmentation flag per run.

if OVERSAMPLE:
    n_real = len(neoplasia)
    # On the original split: 257 NDBE - 178 neoplasia = 79 new records.
    n_new = max(len(ndbe) - n_real, 0)

    # random.randrange(n_real) excludes n_real, so every draw is a valid
    # position among the real records. (random.randint(0, n_real) includes
    # n_real, which is either out of bounds or an already-synthetic row.)
    # The means are collected first and concatenated once, rather than
    # growing the frame row by row.
    synthetic = [
        neoplasia.iloc[[random.randrange(n_real), random.randrange(n_real)]].mean()
        for _ in range(n_new)
    ]

    if synthetic:
        # ignore_index=True gives the enlarged frame a clean 0..n-1 index.
        neoplasia = pd.concat([neoplasia, pd.DataFrame(synthetic)], ignore_index=True)
    # Averaging turns the label column into floats; restore the integer class ids.
    neoplasia['label'] = neoplasia['label'].astype(int)

    augmented = pd.concat([squamous, ndbe, neoplasia]).reset_index(drop=True)

    x_train = augmented.drop('label', axis=1)
    y_train = augmented['label']

    x_train.to_csv('augmented_data/train_data.csv', index=False, header=False)
    y_train.to_csv('augmented_data/train_label.csv', index=False, header=False)

In [27]:
# Augment dataset: squamous to 150, NDBE and neoplasia to 300.
# Why? To create more training data whilst not having too much fake data:
#   - any more than 300 and neoplasia would have had more fake data than real
#   - 300 for NDBE to match neoplasia and balance it, since they have similar spectra
#   - 150 for squamous so it isn't too underrepresented - although it doesn't
#     really struggle, since it is so distinct from NDBE and neoplasia
#
# Every synthetic record is the element-wise mean of two randomly drawn REAL
# records of the same class.
#
# NOTE: this cell rebinds `neoplasia`, `ndbe` and `squamous` to the enlarged
# frames, so any later cell that reads them sees the synthetic rows too.
# Enable only one augmentation flag per run.

if AUGMENT:
    # (real records, target class size), in the order the classes are augmented.
    # On the original split this creates 122 / 43 / 13 new records.
    plan = ((neoplasia, 300), (ndbe, 300), (squamous, 150))

    enlarged_classes = []
    for real, target in plan:
        n_real = len(real)
        n_new = max(target - n_real, 0)

        # random.randrange(n_real) excludes n_real, so every draw is a valid
        # position among the real records. (random.randint(0, n_real) includes
        # n_real, which is either out of bounds or an already-synthetic row.)
        synthetic = [
            real.iloc[[random.randrange(n_real), random.randrange(n_real)]].mean()
            for _ in range(n_new)
        ]

        # Concatenate once per class instead of growing the frame row by row.
        if synthetic:
            enlarged = pd.concat([real, pd.DataFrame(synthetic)], ignore_index=True)
        else:
            enlarged = real
        enlarged_classes.append(enlarged)

    neoplasia, ndbe, squamous = enlarged_classes

    augmented = pd.concat([squamous, ndbe, neoplasia]).reset_index(drop=True)

    x_train = augmented.drop('label', axis=1)
    # Averaging turns the label column into floats; restore the integer class ids.
    y_train = augmented['label'].astype(int)

    x_train.to_csv('augmented_datav2/train_data.csv', index=False, header=False)
    y_train.to_csv('augmented_datav2/train_label.csv', index=False, header=False)

In [28]:
# Augment dataset (neoplasia only): create as many augmented records as there
# are real ones, to see if it improves recall.
#
# Every synthetic record is the element-wise mean of two randomly drawn REAL
# neoplasia records.
#
# NOTE: this cell rebinds `neoplasia` to the enlarged frame, so any later cell
# that reads `neoplasia` sees the synthetic rows too. Enable only one
# augmentation flag per run.

if AUGMENT_NEOPLASIA:
    n_real = len(neoplasia)  # 178 on the original split -> 178 new records

    # random.randrange(n_real) excludes n_real, so every draw is a valid
    # position among the real records. (random.randint(0, n_real) includes
    # n_real, which is either out of bounds or an already-synthetic row.)
    # The means are collected first and concatenated once, rather than
    # growing the frame row by row.
    synthetic = [
        neoplasia.iloc[[random.randrange(n_real), random.randrange(n_real)]].mean()
        for _ in range(n_real)
    ]

    if synthetic:
        neoplasia = pd.concat([neoplasia, pd.DataFrame(synthetic)], ignore_index=True)

    augmented = pd.concat([squamous, ndbe, neoplasia]).reset_index(drop=True)

    x_train = augmented.drop('label', axis=1)
    # Averaging turns the label column into floats; restore the integer class ids.
    y_train = augmented['label'].astype(int)

    x_train.to_csv('augmented_datav3/train_data.csv', index=False, header=False)
    y_train.to_csv('augmented_datav3/train_label.csv', index=False, header=False)

    # A bare expression nested inside `if` is never displayed by the notebook,
    # so print the resulting class balance explicitly.
    print(augmented['label'].value_counts())

# Data generation

In [38]:
# (Imblearn, 2014), (Brownlee, 2020)
# Try every SMOTE-family oversampler that isn't meant for categorical features.
# Each enabled sampler is fitted on the loaded training split (`data`, `label`),
# the class counts are printed before and after resampling, and the result is
# written to SMOTE/<prefix>_train_data.csv and SMOTE/<prefix>_train_label.csv.
smote_runs = [
    # (enabled, tag used in the printed lines, output file prefix, sampler class)
    (DO_SMOTE, "smote", "smote", SMOTE),
    (DO_SMOTE_KMEAN, "kmean smote", "kmeanssmote", KMeansSMOTE),
    (DO_SMOTE_SVM, "svm smote", "svmsmote", SVMSMOTE),
    (DO_SMOTE_BORDER, "border smote", "bordersmote", BorderlineSMOTE),
    (DO_SMOTE_ADASYN, "adasyn smote", "adasynsmote", ADASYN),
]

for enabled, tag, prefix, sampler_cls in smote_runs:
    if not enabled:
        continue

    # Same fixed random_state for every sampler so reruns are reproducible.
    smote = sampler_cls(random_state=1)
    print(f"pre {tag}:", label.value_counts())
    x, y = smote.fit_resample(data, label)
    print(f"post {tag}: {y.value_counts()}")
    x.to_csv(f'SMOTE/{prefix}_train_data.csv', index=False, header=False)
    y.to_csv(f'SMOTE/{prefix}_train_label.csv', index=False, header=False)


pre smote: label
2        257
3        178
1        137
dtype: int64
post smote: label
1        257
2        257
3        257
dtype: int64
pre kmean smote: label
2        257
3        178
1        137
dtype: int64




post kmean smote: label
3        260
1        258
2        257
dtype: int64
pre svm smote: label
2        257
3        178
1        137
dtype: int64
post svm smote: label
1        257
2        257
3        257
dtype: int64
pre border smote: label
2        257
3        178
1        137
dtype: int64
post border smote: label
1        257
2        257
3        257
dtype: int64
pre adasyn smote: label
2        257
3        178
1        137
dtype: int64
post adasyn smote: label
1        257
2        257
3        253
dtype: int64
