In [7]:
# python file to characterise the dataset

# import libraries
import pandas as pd, numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [2]:
# Load the combined dataset; the first CSV column becomes the row index.
file = "combined.csv"
df = pd.read_csv(filepath_or_buffer=file, index_col=0)

In [4]:
# Characterise the dataset and collect every summary figure in `results`.
#
# Contents:
#   - count of each class (shows the imbalanced dataset; models overfit to
#     the majority class in an imbalanced dataset)
#   - count of rows and columns (wavelengths)
#   - range of numbers per column (10 widest and 10 narrowest)
#   - number of 0s, nulls and negative numbers
#   - data types and how many columns have each
#   - number of distinct values
#   - TODO: data format, data precision, modal value
#
# The dataset is too big to show all columns, so per-column figures are
# ordered and only the "top 10" are reported.

results = {}

# no. rows (tissue samples) & columns (wavelengths)
n_rows, n_columns = df.shape
results['n_rows'] = n_rows
results['n_columns'] = n_columns

# column names, abbreviated to the first 3 and the last 3
columns = df.columns.tolist()
column_names = columns[:3] + ['...'] + columns[-3:]
results['column_names'] = str(column_names)

# example row: first 3 and last 3 values of the first row; the very last
# value is cast to int before being shown
first_row = df.iloc[0]
row_eg = [first_row.iloc[position] for position in (0, 1, 2)]
row_eg.append("...")
row_eg.extend([first_row.iloc[-3], first_row.iloc[-2], int(first_row.iloc[-1])])
results['row_example'] = str(row_eg)

# no. values in the df
n_values = n_rows * n_columns
results['n_values'] = n_values

# count of each class (show imbalanced dataset)
n_classes = df.groupby('label').size()  # no. times each class occurs
class_labels = {'n_squamous': 1, 'n_ndbe': 2, 'n_neoplasia': 3}
for result_key, class_label in class_labels.items():
    results[result_key] = n_classes[class_label]

# number of 0s in entire df
n_zeros = (df == 0).sum().sum()  # (James, 2020)
results['n_zeroes'] = n_zeros

# number of nulls
n_nulls = df.isna().sum().sum()  # (Naveen, 2023)
results['n_nulls'] = n_nulls

# number of values less than 0
n_negatives = df.lt(0).sum().sum()  # (Sanchit Aluna, 2016)
results['n_negatives'] = n_negatives

# data types present in the df
dtypes = df.dtypes.unique().tolist()
results['dtypes'] = str(dtypes)

# number of columns of each type
for dtype in dtypes:
    results['n_' + str(dtype)] = len(df.select_dtypes(include=[dtype]).columns)

# TODO: data format

# TODO: data precision

# number of distinct values across the whole df
n_unique = df.stack().dropna().nunique()
results['n_unique'] = n_unique

# TODO: modal value

# range (max - min) of every column; the per-column value is deliberately not
# stored in a variable called `range`, which would shadow the builtin
ranges = {column: df[column].max() - df[column].min() for column in columns}

# 10 widest ranges
widest_first = sorted(ranges.items(), key=lambda item: item[1], reverse=True)  # (Gern Blanston, 2009)
sorted_ranges = dict(widest_first[:10])
results['top10_ranges'] = str(sorted_ranges)

# 10 narrowest ranges
narrowest_first = sorted(ranges.items(), key=lambda item: item[1])  # (Gern Blanston, 2009)
sorted_ranges = dict(narrowest_first[:10])
results['bottom10_ranges'] = str(sorted_ranges)

results

{'n_rows': 715,
 'n_columns': 434,
 'column_names': "['feature_1', 'feature_2', 'feature_3', '...', 'feature_432', 'feature_433', 'label']",
 'row_example': "[1.3113, 1.3093, 1.2225, '...', 0.084531, 0.042246, 3]",
 'n_values': 310310,
 'n_squamous': 159,
 'n_ndbe': 320,
 'n_neoplasia': 236,
 'n_zeroes': 715,
 'n_nulls': 0,
 'n_negatives': 57,
 'dtypes': "[dtype('float64'), dtype('int64')]",
 'n_float64': 433,
 'n_int64': 1,
 'n_unique': 145211,
 'top10_ranges': "{'feature_127': 5.10342, 'feature_128': 5.0939499999999995, 'feature_123': 5.01679, 'feature_129': 4.96903, 'feature_131': 4.956600000000001, 'feature_125': 4.9508399999999995, 'feature_126': 4.921209999999999, 'feature_130': 4.91711, 'feature_120': 4.877619999999999, 'feature_119': 4.86939}",
 'bottom10_ranges': "{'feature_355': 0.17462, 'feature_357': 0.17762, 'feature_353': 0.17836, 'feature_358': 0.17855, 'feature_356': 0.17867943, 'feature_354': 0.18157572000000002, 'feature_359': 0.18625, 'feature_360': 0.18784, 'feature

In [5]:
# Wrap the collected results in a one-row table (row label 0) and save as CSV
characterisation = pd.DataFrame(data=results, index=[0])
characterisation.to_csv(path_or_buf='characterisation.csv')

# Save the same table as JSON
characterisation.to_json(path_or_buf='characterisation.json')

In [15]:
# Work on a copy of the dataset with an empty leading 'statistics' column
df_characterised = df.copy()
df_characterised.insert(loc=0, column='statistics', value=None)


  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,425,426,427,428,429,430,431,432,433,434
count,0.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,...,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0,715.0
mean,,-1.01861e-16,9.192336000000001e-17,-2.708013e-16,8.695453e-18,-1.714246e-16,-1.739091e-17,9.440778000000001e-17,-1.515493e-16,-2.981298e-16,...,-3.975064e-17,1.590026e-16,-1.4906490000000002e-17,7.453245e-17,-4.9688300000000004e-17,-1.4906490000000002e-17,6.956362e-17,1.093143e-16,-4.96883e-18,-7.950129000000001e-17
std,,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,...,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007,1.0007
min,,-2.104524,-2.079225,-2.080658,-2.082018,-2.094461,-2.079963,-2.082524,-2.072513,-2.064484,...,-1.387715,-1.409693,-1.395794,-1.351689,-1.354029,-1.353309,-1.337838,-1.304503,-1.273486,-1.506193
25%,,-0.8806269,-0.8730957,-0.868781,-0.8591011,-0.8672757,-0.8667819,-0.8657599,-0.8833805,-0.8808429,...,-0.8210808,-0.805373,-0.8262203,-0.8175081,-0.780559,-0.757213,-0.7786715,-0.7937304,-0.7623114,-0.1464355
50%,,0.2417813,0.2166949,0.2256128,0.2155955,0.2251554,0.2271979,0.2129071,0.2088136,0.2009772,...,-0.144427,-0.1273305,-0.1230994,-0.1163329,-0.1153379,-0.1357646,-0.1437587,-0.1345441,-0.09578144,-0.1464355
75%,,0.8334537,0.8356462,0.8435776,0.8528852,0.8535309,0.8573048,0.8625507,0.8718559,0.8547681,...,0.5855651,0.6373067,0.6218642,0.6208171,0.598193,0.5782258,0.5691615,0.5784073,0.5713943,1.213323
max,,3.587333,3.610147,3.579669,3.613506,3.665029,3.549126,3.643301,3.825928,3.719187,...,5.175459,4.073943,4.06382,4.467565,4.571869,4.597466,4.665563,6.109151,6.921962,1.213323


In [13]:
# Show the last 10 rows of the working copy
print(df_characterised.tail(10))

    statistics  feature_1  feature_2  feature_3  feature_4  feature_5  \
133       None    1.73220    1.71710    1.72550    1.71550    1.71080   
134       None    1.65110    1.61760    1.64330    1.61640    1.60590   
135       None    0.26549    0.25595    0.23619    0.23982    0.24419   
136       None    0.84373    0.84120    0.83687    0.83452    0.83787   
137       None    1.14250    1.13270    1.12760    1.12600    1.12340   
138       None    1.21470    1.21280    1.20860    1.21630    1.20410   
139       None    0.69003    0.69312    0.71059    0.70677    0.72841   
140       None    0.60519    0.61038    0.61824    0.62226    0.62701   
141       None    0.62265    0.62775    0.62462    0.62315    0.61800   
142       None    1.53240    1.53240    1.53230    1.52430    1.52340   

     feature_6  feature_7  feature_8  feature_9  ...  feature_425  \
133    1.71050    1.69930    1.69890    1.69020  ...     0.029018   
134    1.62400    1.62240    1.60290    1.59030  ...     0

In [14]:
# Calculate statistical values (mean, median, range, standard deviation) for
# each column of `df_characterised`. Each result is a Series with one entry
# per column, tagged so it can later be used as a row of a statistics table.


def _labelled_statistic(values, name):
    """Tag a per-column statistic with its name.

    Args:
        values: Series holding one statistic per column of `df_characterised`.
        name: Text written into the first entry (the 'statistics' column,
            which is inserted at position 0 of `df_characterised`).

    Returns:
        An object-dtype copy of `values` whose first entry is `name` and
        whose last entry is NaN.
    """
    # Cast up front: writing a string into a float Series is a deprecated
    # implicit upcast.
    labelled = values.astype(object)
    # Use .iloc for the positional writes. Plain [0] / [-1] on a Series with
    # string labels relies on a deprecated positional fallback and would add
    # new entries labelled 0 and -1 once that fallback is removed.
    labelled.iloc[0] = name
    labelled.iloc[-1] = np.nan
    return labelled


# mean
mean = _labelled_statistic(df_characterised.mean(), 'Mean')

# mode: TODO

# median
median = _labelled_statistic(df_characterised.median(), 'Median')

# range (the name shadows the builtin, but the cell that builds the
# statistics table reads this variable, so it is kept)
range = _labelled_statistic(df_characterised.max() - df_characterised.min(), 'Range')

# standard deviation
sd = _labelled_statistic(df_characterised.std(), 'Std')

range


statistics        Range
feature_1      3.345818
feature_2      3.338058
feature_3      3.313953
feature_4      3.337115
                 ...   
feature_430     0.33589
feature_431     0.34651
feature_432     0.44173
feature_433     0.50764
label               NaN
Length: 435, dtype: object

In [5]:
# Put the statistics side by side (one column each), then flip the table so
# that every statistic becomes a row, and save it
stats = pd.concat([mean, median, range, sd], axis=1).T
stats.to_csv(path_or_buf='statistics.csv')
stats


Unnamed: 0,statistics,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_425,feature_426,feature_427,feature_428,feature_429,feature_430,feature_431,feature_432,feature_433,label
0,Mean,1.206475,1.202361,1.19951,1.196979,1.189087,1.188905,1.185028,1.179817,1.170785,...,0.069739,0.071132,0.07197,0.073042,0.075101,0.076387,0.077219,0.077727,0.078882,
1,Median,1.3486,1.3295,1.3316,1.3233,1.3198,1.321,1.3085,1.3002,1.2852,...,0.062481,0.064707,0.065623,0.066756,0.068704,0.068724,0.068921,0.06971,0.072949,
2,Range,3.345818,3.338058,3.313953,3.337115,3.343641,3.272805,3.320597,3.400506,3.292616,...,0.32983,0.2767,0.28151,0.31446,0.32868,0.33589,0.34651,0.44173,0.50764,
3,Std,0.588237,0.587129,0.58588,0.586329,0.580951,0.581816,0.580339,0.576913,0.569694,...,0.05029,0.050495,0.051598,0.054076,0.055504,0.056484,0.057759,0.059625,0.061985,


In [11]:
# Stack the statistic rows underneath the sample rows and save the result
df_characterised_merge = pd.concat(objs=[df_characterised, stats], axis=0)
df_characterised_merge.to_csv(path_or_buf='combined_stats.csv')
df_characterised_merge

Unnamed: 0,statistics,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_425,feature_426,feature_427,feature_428,feature_429,feature_430,feature_431,feature_432,feature_433,label
0,,1.3113,1.3093,1.2225,1.2478,1.1472,1.2208,1.2676,1.2763,1.2903,...,0.068851,0.065492,0.045312,0.067299,0.065363,0.058188,0.085106,0.084531,0.042246,3
1,,1.875,1.8717,1.9102,1.8683,1.8878,1.8514,1.8527,1.8292,1.8479,...,0.018279,0.023191,0.019388,0.01921,0.015802,0.021584,0.020229,0.014808,0.02426,3
2,,1.3782,1.3666,1.3652,1.3719,1.3559,1.3428,1.3263,1.3238,1.3126,...,0.036983,0.035556,0.037337,0.035494,0.041724,0.050014,0.045198,0.05327,0.043443,2
3,,2.1714,2.151,2.095,2.0307,1.9952,2.0172,2.0675,2.0585,2.0178,...,0.088781,0.099672,0.13051,0.16132,0.16018,0.18109,0.2101,0.25304,0.29712,2
4,,0.73976,0.72715,0.71311,0.69704,0.67125,0.68409,0.68809,0.66799,0.6674,...,0.099238,0.084927,0.082649,0.096438,0.10441,0.09825,0.099586,0.092213,0.087415,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,,1.5324,1.5324,1.5323,1.5243,1.5234,1.5292,1.5192,1.511,1.5066,...,0.03316,0.03051,0.030018,0.034682,0.035465,0.033294,0.034401,0.037511,0.032529,3
0,Mean,1.206475,1.202361,1.19951,1.196979,1.189087,1.188905,1.185028,1.179817,1.170785,...,0.069739,0.071132,0.07197,0.073042,0.075101,0.076387,0.077219,0.077727,0.078882,
1,Median,1.3486,1.3295,1.3316,1.3233,1.3198,1.321,1.3085,1.3002,1.2852,...,0.062481,0.064707,0.065623,0.066756,0.068704,0.068724,0.068921,0.06971,0.072949,
2,Range,3.345818,3.338058,3.313953,3.337115,3.343641,3.272805,3.320597,3.400506,3.292616,...,0.32983,0.2767,0.28151,0.31446,0.32868,0.33589,0.34651,0.44173,0.50764,


In [42]:
# Build a standardised copy of the dataset to compare against the original
# (the original was therefore not scaled / standardised).
scaler = StandardScaler()
labels = df['label']
unlabeled_df = df.drop(columns='label')
features = unlabeled_df.columns.tolist()
scaled_data = scaler.fit_transform(unlabeled_df)

# Rebuild a frame around the scaled values, restoring the feature names and
# the row index of the source data.
describe = pd.DataFrame(scaled_data, index=unlabeled_df.index, columns=features)

# Attach the labels row-for-row. An index-based merge is wrong here: `df`
# keeps the index read from the CSV, and the recorded outputs show that the
# last rows of the 715-row frame are indexed 133-142, so that index is not
# 0..n-1. Joining a freshly numbered frame against it pairs features with
# the wrong labels and duplicates / drops rows.
describe['label'] = labels.to_numpy()
assert len(describe) == len(df), "scaled frame must keep one row per sample"

describe.to_csv('scaled_combined.csv')