In [2]:
# python file to characterise the dataset

# import libraries
import pandas as pd
import os


In [23]:
# Location of the combined CSV, resolved relative to the working directory.
file = "combined.csv"
# Read the CSV into a DataFrame, using its first column as the row index.
df = pd.read_csv(filepath_or_buffer=file, index_col=0)

In [29]:
# Show the dtype of every column in df; pandas elides the middle rows of a
# long listing, so only the first and last few columns appear in the output.
df.dtypes

feature_1      float64
feature_2      float64
feature_3      float64
feature_4      float64
feature_5      float64
                ...   
feature_430    float64
feature_431    float64
feature_432    float64
feature_433    float64
label            int64
Length: 434, dtype: object

In [70]:
# include:
    # count of each class (show imbalanced dataset) - show models overfit to majority class in imbalanced dataset
    # count of rows and columns (wavelengths)
    # range of numbers (sort by top 10 columns)
    # number of 0s 
    # number of nulls
    # any negative numbers?
    # data types
    # data format
    # data precision
    # number of distinct values
    # modal value?

    # order columns to get results for "top 10" columns for each 
    # categorisation task - dataset is too big to show all columns


# Gather headline statistics about `df` into a single dict, `results`, which is
# displayed as the output of this cell. All whole-frame counts below (zeros,
# nulls, negatives, distinct values, ranges) run over every column of df, so
# they include the 'label' column as well as the feature columns.
results = {}

# no. rows (tissue samples) & columns (wavelengths); the column count covers
# every column of df, so it includes 'label'
n_rows, n_columns = df.shape
results['n_rows'] = n_rows
results['n_columns'] = n_columns
# no. values in the df
n_values = n_rows * n_columns
results['n_values'] = n_values

# count of each class (show imbalanced dataset) - show models overfit to majority class in imbalanced dataset
class_names = {1: 'squamous', 2: 'ndbe', 3: 'neoplasia'}  # label value -> class name
n_classes = df.groupby('label').size() # no. times each class
for class_id, class_name in class_names.items():
    # .loc is a label lookup: it raises KeyError if a class has no rows at all
    results[f'n_{class_name}'] = n_classes.loc[class_id]

# number of 0s in entire df
n_zeros = (df == 0).sum().sum() # (James, 2020)
results['n_zeroes'] = n_zeros

# number of nulls
n_nulls = df.isna().sum().sum() # (Naveen, 2023)
results['n_nulls'] = n_nulls

# any negative numbers?
# boolean mask of the entries less than 0, built once and then counted
negatives = df.lt(0) # (Sanchit Aluna, 2016)
n_negatives = negatives.sum().sum()
results['n_negatives'] = n_negatives

# data types
dtypes = df.dtypes.unique().tolist()
results['dtypes'] = dtypes

# number of columns of each type
for dtype in dtypes:
    # width of the frame after keeping only the columns of this dtype
    results[f'n_{dtype}'] = df.select_dtypes(include=[dtype]).shape[1]

# data format
# TODO: not implemented yet

# data precision
# TODO: not implemented yet

# number of distinct values across the whole frame (nulls excluded)
n_unique = df.stack().dropna().nunique()
results['n_unique'] = n_unique

# modal value?
# TODO: not implemented yet

# range of numbers (sort by top 10 columns)
# max - min per column; named `ranges`/`top_n` so the builtin `range` is not
# overwritten for the rest of the kernel session
top_n = 10
ranges = {column: df[column].max() - df[column].min() for column in df.columns}
# the top_n widest ranges, widest first (all columns if there are fewer than top_n)
results['top10_ranges'] = dict(
    sorted(ranges.items(), key=lambda item: item[1], reverse=True)[:top_n] # (Gern Blanston, 2009)
)
# the top_n narrowest ranges, narrowest first
results['bottom10_ranges'] = dict(
    sorted(ranges.items(), key=lambda item: item[1], reverse=False)[:top_n] # (Gern Blanston, 2009)
)

results

{'n_rows': 715,
 'n_columns': 434,
 'n_values': 310310,
 'n_squamous': 159,
 'n_ndbe': 320,
 'n_neoplasia': 236,
 'n_zeroes': 715,
 'n_nulls': 0,
 'n_negatives': 57,
 'dtypes': [dtype('float64'), dtype('int64')],
 'n_float64': 433,
 'n_int64': 1,
 'n_unique': 145211,
 'top10_ranges': {'feature_127': 5.10342,
  'feature_128': 5.0939499999999995,
  'feature_123': 5.01679,
  'feature_129': 4.96903,
  'feature_131': 4.956600000000001,
  'feature_125': 4.9508399999999995,
  'feature_126': 4.921209999999999,
  'feature_130': 4.91711,
  'feature_120': 4.877619999999999,
  'feature_119': 4.86939},
 'bottom10_ranges': {'feature_355': 0.17462,
  'feature_357': 0.17762,
  'feature_353': 0.17836,
  'feature_358': 0.17855,
  'feature_356': 0.17867943,
  'feature_354': 0.18157572000000002,
  'feature_359': 0.18625,
  'feature_360': 0.18784,
  'feature_344': 0.1892599999999999,
  'feature_345': 0.18952}}

In [None]:
# calculate statistical values such as mean, mode, median for each column