In [1]:
# python file to characterise the dataset

# import libraries
import pandas as pd, numpy as np
import os


In [2]:
# csv file, read it in and output some statistics 
file = "combined.csv"
df = pd.read_csv(file, index_col=0)

In [3]:
df.dtypes

feature_1      float64
feature_2      float64
feature_3      float64
feature_4      float64
feature_5      float64
                ...   
feature_430    float64
feature_431    float64
feature_432    float64
feature_433    float64
label            int64
Length: 434, dtype: object

In [4]:
# include:
    # count of each class (show imbalanced dataset) - show models overfit to majority class in imbalanced dataset
    # count of rows and columns (wavelengths)
    # range of numbers (sort by top 10 columns)
    # number of 0s 
    # number of nulls
    # any negative numbers?
    # data types
    # data format
    # data precision
    # number of distinct values
    # modal value?

    # order columns to get results for "top 10" columns for each 
    # categorisation task - dataset is too big to show all columns


# print(f"{file}: \nrows: {df.shape[0]} \ncols: {df.shape[1]}")
results = {}

# no. rows (tissue samples) & columns(wavelengths)
n_rows = df.shape[0]
results['n_rows'] = n_rows
n_columns = df.shape[1]
results['n_columns'] = n_columns
# no. values in the df
n_values = n_rows * n_columns
results['n_values'] = n_values

# count of each class (show imbalanced dataset) - show models overfit to majority class in imbalanced dataset
n_classes = df.groupby('label').size() # no. times each class
results['n_squamous'] = n_classes[1]
results['n_ndbe'] = n_classes[2]
results['n_neoplasia'] = n_classes[3]

# number of 0s in entire df
n_zeros = (df ==0).sum().sum() # (James, 2020)
results['n_zeroes'] = n_zeros

# number of nulls
n_nulls = df.isna().sum().sum() # (Naveen, 2023)
results['n_nulls'] = n_nulls

# any negative numbers?
# count number of values less than 0
n_negatives = df.lt(0).sum().sum() # (Sanchit Aluna, 2016)
negatives = df.lt(0)
results['n_negatives'] = n_negatives

# data types
dtypes = df.dtypes.unique().tolist()
results['dtypes'] = dtypes

# number of columns of each type
for dtype in dtypes:
    # get len of columns array after filtering by each dtype
    count = len(df.select_dtypes(include=[dtype]).columns)
    # print(count)
    results['n_'+str(dtype)] = count

# data format


# data precision


# number of distinct values
n_unique = df.stack().dropna().nunique()
results['n_unique'] = n_unique

# modal value?

# range of numbers (sort by top 10 columns)
ranges = {}
for column in df.columns.tolist():
    range = df[column].max() - df[column].min()
    ranges[column] = range
# sort by top 10 widest ranges
count = 0
sorted_ranges = {}
for element in sorted(ranges.items(), key=lambda x: x[1], reverse=True): # (Gern Blanston, 2009)
    sorted_ranges[element[0]] = element[1] 
    if count==9:
        break
    count+=1
results['top10_ranges'] = sorted_ranges
# 10 lowest ranges
count = 0
sorted_ranges = {}
for element in sorted(ranges.items(), key=lambda x: x[1], reverse=False): # (Gern Blanston, 2009)
    sorted_ranges[element[0]] = element[1] 
    if count==9:
        break
    count+=1
results['bottom10_ranges'] = sorted_ranges

results

{'n_rows': 715,
 'n_columns': 434,
 'n_values': 310310,
 'n_squamous': 159,
 'n_ndbe': 320,
 'n_neoplasia': 236,
 'n_zeroes': 715,
 'n_nulls': 0,
 'n_negatives': 57,
 'dtypes': [dtype('float64'), dtype('int64')],
 'n_float64': 433,
 'n_int64': 1,
 'n_unique': 145211,
 'top10_ranges': {'feature_127': 5.10342,
  'feature_128': 5.0939499999999995,
  'feature_123': 5.01679,
  'feature_129': 4.96903,
  'feature_131': 4.956600000000001,
  'feature_125': 4.9508399999999995,
  'feature_126': 4.921209999999999,
  'feature_130': 4.91711,
  'feature_120': 4.877619999999999,
  'feature_119': 4.86939},
 'bottom10_ranges': {'feature_355': 0.17462,
  'feature_357': 0.17762,
  'feature_353': 0.17836,
  'feature_358': 0.17855,
  'feature_356': 0.17867943,
  'feature_354': 0.18157572000000002,
  'feature_359': 0.18625,
  'feature_360': 0.18784,
  'feature_344': 0.1892599999999999,
  'feature_345': 0.18952}}

In [5]:
df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_425,feature_426,feature_427,feature_428,feature_429,feature_430,feature_431,feature_432,feature_433,label
0,1.31130,1.30930,1.22250,1.24780,1.14720,1.22080,1.26760,1.27630,1.29030,1.27570,...,0.068851,0.065492,0.045312,0.067299,0.065363,0.058188,0.085106,0.084531,0.042246,3
1,1.87500,1.87170,1.91020,1.86830,1.88780,1.85140,1.85270,1.82920,1.84790,1.88730,...,0.018279,0.023191,0.019388,0.019210,0.015802,0.021584,0.020229,0.014808,0.024260,3
2,1.37820,1.36660,1.36520,1.37190,1.35590,1.34280,1.32630,1.32380,1.31260,1.30360,...,0.036983,0.035556,0.037337,0.035494,0.041724,0.050014,0.045198,0.053270,0.043443,2
3,2.17140,2.15100,2.09500,2.03070,1.99520,2.01720,2.06750,2.05850,2.01780,2.00700,...,0.088781,0.099672,0.130510,0.161320,0.160180,0.181090,0.210100,0.253040,0.297120,2
4,0.73976,0.72715,0.71311,0.69704,0.67125,0.68409,0.68809,0.66799,0.66740,0.67134,...,0.099238,0.084927,0.082649,0.096438,0.104410,0.098250,0.099586,0.092213,0.087415,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138,1.21470,1.21280,1.20860,1.21630,1.20410,1.20450,1.19220,1.17560,1.17550,1.16340,...,0.033310,0.047268,0.052854,0.055493,0.061956,0.065242,0.063146,0.054809,0.069272,3
139,0.69003,0.69312,0.71059,0.70677,0.72841,0.72722,0.70740,0.69638,0.68104,0.71926,...,0.192170,0.199210,0.228460,0.238530,0.244610,0.233950,0.249360,0.261970,0.252180,3
140,0.60519,0.61038,0.61824,0.62226,0.62701,0.62702,0.63458,0.62804,0.62511,0.63702,...,0.170130,0.168460,0.179930,0.199300,0.210660,0.206620,0.217060,0.225780,0.250500,2
141,0.62265,0.62775,0.62462,0.62315,0.61800,0.62042,0.62425,0.63137,0.62590,0.62140,...,0.064621,0.066401,0.060681,0.061961,0.064319,0.075752,0.078947,0.077760,0.091303,1


In [25]:
df_characterised = df.copy()
df_characterised.insert(0, 'statistics', None)

In [20]:
print(df_characterised[-10:])

    statistics  feature_1  feature_2  feature_3  feature_4  feature_5  \
133       None    1.73220    1.71710    1.72550    1.71550    1.71080   
134       None    1.65110    1.61760    1.64330    1.61640    1.60590   
135       None    0.26549    0.25595    0.23619    0.23982    0.24419   
136       None    0.84373    0.84120    0.83687    0.83452    0.83787   
137       None    1.14250    1.13270    1.12760    1.12600    1.12340   
138       None    1.21470    1.21280    1.20860    1.21630    1.20410   
139       None    0.69003    0.69312    0.71059    0.70677    0.72841   
140       None    0.60519    0.61038    0.61824    0.62226    0.62701   
141       None    0.62265    0.62775    0.62462    0.62315    0.61800   
142       None    1.53240    1.53240    1.53230    1.52430    1.52340   

     feature_6  feature_7  feature_8  feature_9  ...  feature_425  \
133    1.71050    1.69930    1.69890    1.69020  ...     0.029018   
134    1.62400    1.62240    1.60290    1.59030  ...     0

In [23]:
# calculate statistical values such as mean, mode, median for each column
# mean
mean = df_characterised.mean()
mean[0] = 'Mean'
mean[-1] = np.nan


# mode
mode = df_characterised.mode()
mode[0] = 'Mode'
mode[-1] = np.nan

# median
median = df_characterised.median()
median[0] = 'Median'
median[-1] = np.nan

# range


# standard deviation
sd = df_characterised.std()
sd[0] = 'std'
sd[-1] = np.nan


df_characterised.iloc[-10:]
mean


  mean = df_characterised.mean()
  mode = df_characterised.mode()
  median = df_characterised.median()
  sd = df_characterised.std()


statistics         Mean
feature_1      1.206475
feature_2      1.202361
feature_3       1.19951
feature_4      1.196979
                 ...   
feature_430    0.076387
feature_431    0.077219
feature_432    0.077727
feature_433    0.078882
label               NaN
Length: 435, dtype: object

In [26]:
# df_characterised = df_characterised.append(mean, ignore_index=True)
df_characterised = df_characterised.append(mode, ignore_index=True)
# df_characterised = df_characterised.append(median, ignore_index=True)
# df_characterised = df_characterised.append(sd, ignore_index=True)
df_characterised.iloc[-10:]

  df_characterised = df_characterised.append(mode, ignore_index=True)


Unnamed: 0,statistics,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_427,feature_428,feature_429,feature_430,feature_431,feature_432,feature_433,label,0,-1
1420,,,,,,,,,,,...,,,,,,,,,Mode,
1421,,,,,,,,,,,...,,,,,,,,,Mode,
1422,,,,,,,,,,,...,,,,,,,,,Mode,
1423,,,,,,,,,,,...,,,,,,,,,Mode,
1424,,,,,,,,,,,...,,,,,,,,,Mode,
1425,,,,,,,,,,,...,,,,,,,,,Mode,
1426,,,,,,,,,,,...,,,,,,,,,Mode,
1427,,,,,,,,,,,...,,,,,,,,,Mode,
1428,,,,,,,,,,,...,,,,,,,,,Mode,
1429,,,,,,,,,,,...,,,,,,,,,Mode,
