In [2]:
# imports
from pathlib import Path

import pandas as pd

In [2]:
# Column names "feature_1" ... "feature_433", one per feature column.
features = [f"feature_{number}" for number in range(1, 434)]
len(features)

433

In [3]:
# merge a data file with its label file
def merge(data, labels, feature_names=None):
    """Read a feature CSV and its label CSV and join them row by row.

    Rows are paired by position: row i of ``data`` gets the label on row i of
    ``labels``.

    Parameters
    ----------
    data : str, path object or file-like
        Headerless CSV holding the feature values, one record per row.
    labels : str, path object or file-like
        Headerless CSV holding a single label per row.
    feature_names : list of str, optional
        Column names to give the feature columns. When omitted, the
        notebook-level ``features`` list is used.

    Returns
    -------
    pandas.DataFrame
        The feature columns followed by an integer ``label`` column.

    Raises
    ------
    ValueError
        If the two files do not contain the same number of rows. The join
        below is an inner join on the row index, so without this check the
        surplus rows would be dropped silently.
    """
    if feature_names is None:
        feature_names = features

    dataDf = pd.read_csv(data, header=None, names=feature_names, float_precision='round_trip')
    labelsDf = pd.read_csv(labels, header=None, names=['label'])

    if len(dataDf) != len(labelsDf):
        raise ValueError(
            f"row count mismatch: {len(dataDf)} data rows vs {len(labelsDf)} label rows"
        )

    # turn labels into ints
    labelsDf['label'] = labelsDf['label'].astype(int)
    return pd.merge(dataDf, labelsDf, left_index=True, right_index=True)


In [None]:
# Join each original split with its label file.
test_merged = merge("original_data/noExclusion_test_data.csv","original_data/noExclusion_test_label.csv")
train_merged = merge("original_data/noExclusion_train_data.csv","original_data/noExclusion_train_label.csv")

# Stack the train rows on top of the test rows. Each frame keeps its own row
# index here, so index values repeat in `combined`.
combined = pd.concat([train_merged,test_merged])

# Save all three frames; the row index is written as the first CSV column.
combined.to_csv('combined.csv')
test_merged.to_csv('test_merged.csv')
train_merged.to_csv('train_merged.csv')

In [4]:
import random

NDBE_TOTAL = 320  # size of the range the positions are drawn from
NDBE_KEEP = 236   # how many NDBE records to sample

# Fixed seed so that the same positions are drawn on every run.
random.seed(1)
# to use for sampling 236 NDBE records (iCodeLikeImDrunk, 2012)
indexes = random.sample(range(NDBE_TOTAL), NDBE_KEEP)
len(indexes)

236

In [5]:
# make balanced dataset (balance NDBE and neoplasia)
combined = pd.read_csv('combined.csv', index_col=0).reset_index(drop=True)

# One frame per class label, each re-indexed from 0.
# Label counts in the original data: 1: 159, 2: 320, 3: 236
squamous = combined.loc[combined['label']==1].reset_index(drop=True)
ndbe = combined.loc[combined['label']==2].reset_index(drop=True)
neoplasia = combined.loc[combined['label']==3].reset_index(drop=True)

# `indexes` holds row positions that were drawn in an earlier cell; make sure
# every one of them actually exists in the NDBE frame before using them.
assert max(indexes) < len(ndbe), "sampled positions exceed the number of NDBE records"

# choose the 236 random records
balanced_ndbe = ndbe.iloc[indexes]
# NDBE records that were not sampled (LondonRob, 2013)
excluded = ndbe.loc[~ndbe.index.isin(indexes)]

# The point of the sampling is equal NDBE and neoplasia counts; fail loudly if
# the hard-coded sample size no longer matches the data.
assert len(balanced_ndbe) == len(neoplasia), "NDBE sample and neoplasia differ in size"

# merge back together
balanced = pd.concat([squamous, balanced_ndbe, neoplasia]).reset_index(drop=True)
balanced['label'].value_counts()

2    236
3    236
1    159
Name: label, dtype: int64

In [6]:
# (Hamzah Hafejee, 2022, COMP3611_Coursework_Assessment.ipynb, Comp 3611, University of Leeds)
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit

def stratified_split(df, feature, test_size=0.25, random_state=42):
    """Split ``df`` into train and test sets, stratified on one column.

    Parameters
    ----------
    df : pandas.DataFrame
        The records to split.
    feature : str
        Name of the column whose class proportions are preserved in both parts.
    test_size : float or int, optional
        Passed to ``StratifiedShuffleSplit``; defaults to 0.25.
    random_state : int, optional
        Seed passed to ``StratifiedShuffleSplit``; defaults to 42.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_set, test_set)``; both still contain the ``feature`` column.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)

    # split() yields arrays of row *positions*, so the rows must be fetched with
    # the positional .iloc. The label-based .loc only gives the same rows when
    # the frame happens to carry a default 0..n-1 index.
    # n_splits=1, so there is exactly one (train, test) pair to take.
    train_index, test_index = next(splitter.split(df, df[feature]))

    return df.iloc[train_index], df.iloc[test_index]

# Stratified train/test split of the balanced data on the class label.
train_set, test_set = stratified_split(balanced, 'label')

# Separate the features from the labels in each part.
training_set=train_set.drop("label",axis=1)
training_labels=train_set["label"].copy()

testing = test_set.drop("label",axis=1)
testing_labels= test_set["label"].copy()

# to_csv does not create missing directories, so make sure the target exists.
balanced_dir = Path('balanced_data')
balanced_dir.mkdir(parents=True, exist_ok=True)

# Written without index or header row.
training_set.to_csv(balanced_dir / 'train_data.csv', index=False, header=False)
training_labels.to_csv(balanced_dir / 'train_label.csv', index=False, header=False)
testing.to_csv(balanced_dir / 'test_data.csv', index=False, header=False)
testing_labels.to_csv(balanced_dir / 'test_label.csv', index=False, header=False)


Note: the data was then fixed manually so that the NDBE and neoplasia classes are exactly equal in size.