In [1]:
def dataset_minmax(dataset):
    """Find the minimum and maximum value of every column.

    Args:
        dataset: List of rows, each row a list of comparable values. The
            number of columns is taken from the first row.

    Returns:
        A list holding one ``[min, max]`` pair per column, in column order.
        An empty dataset yields an empty list.
    """
    # With no rows there is no first row to take the column count from.
    if len(dataset) == 0:
        return []
    minmax = []
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        minmax.append([min(col_values), max(col_values)])
    return minmax

In [2]:
def normalize_dataset(dataset, minmax):
    """Rescale every column of ``dataset`` to the range 0-1, in place.

    Args:
        dataset: List of rows (lists of numbers). Each value is overwritten
            with its rescaled value; nothing is returned.
        minmax: One ``[min, max]`` pair per column, indexed like the row
            values (the layout returned by ``dataset_minmax``).

    Note:
        A column whose min equals its max has no spread to rescale, so its
        values are set to 0.0 instead of dividing by zero.
    """
    for row in dataset:
        for i, value in enumerate(row):
            col_min = minmax[i][0]
            col_range = minmax[i][1] - col_min
            if col_range == 0:
                # Constant column: (value - min) / 0 is undefined.
                row[i] = 0.0
            else:
                row[i] = (value - col_min) / col_range

In [3]:
# Hand-made dataset of three rows and two columns for the normalization demo
dataset = [
    [50, 30],
    [10, 90],
    [60, 20],
]
print(dataset)

[[50, 30], [10, 90], [60, 20]]


In [4]:
# Find the [min, max] pair of each column of the contrived dataset
minmax = dataset_minmax(dataset)
print(minmax)

[[10, 60], [20, 90]]


In [5]:
# Rescale the dataset to 0-1. normalize_dataset overwrites the rows in place,
# so re-running this cell would rescale the already-normalized values again.
normalize_dataset(dataset, minmax)
print(dataset)

[[0.8, 0.14285714285714285], [0.0, 1.0], [1.0, 0.0]]


In [6]:
# Example of standardizing a contrived dataset
from math import sqrt

def column_means(dataset):
    """Calculate the arithmetic mean of every column.

    Args:
        dataset: List of rows, each row a list of numbers. The number of
            columns is taken from the first row.

    Returns:
        A list with one mean per column, in column order. An empty dataset
        yields an empty list.
    """
    row_count = len(dataset)
    # With no rows there is no first row to take the column count from,
    # and no values to average.
    if row_count == 0:
        return []
    return [
        sum(row[i] for row in dataset) / float(row_count)
        for i in range(len(dataset[0]))
    ]

In [7]:
def column_stdevs(dataset, means):
    """Calculate the sample standard deviation of every column.

    Args:
        dataset: List of rows, each row a list of numbers. The number of
            columns is taken from the first row.
        means: One mean per column, indexed like the row values.

    Returns:
        A list with one standard deviation per column, computed with the
        ``n - 1`` divisor.

    Raises:
        ZeroDivisionError: If the dataset has exactly one row, because the
            ``n - 1`` divisor is then zero.
    """
    n_columns = len(dataset[0])
    divisor = float(len(dataset) - 1)
    result = []
    for col in range(n_columns):
        # Sum of squared deviations from the column mean.
        squared_total = sum((row[col] - means[col]) ** 2 for row in dataset)
        result.append(sqrt(squared_total / divisor))
    return result

In [8]:
def standardize_dataset(dataset, means, stdevs):
    """Standardize every column of ``dataset`` in place.

    Each value becomes ``(value - mean) / stdev`` using the statistics of
    its column.

    Args:
        dataset: List of rows (lists of numbers). Each value is overwritten
            with its standardized value; nothing is returned.
        means: One mean per column, indexed like the row values.
        stdevs: One standard deviation per column, indexed like the row
            values.

    Note:
        A column whose standard deviation is zero has no spread to scale by,
        so its values are set to 0.0 instead of dividing by zero.
    """
    for row in dataset:
        for i, value in enumerate(row):
            if stdevs[i] == 0:
                # Constant column: (value - mean) / 0 is undefined.
                row[i] = 0.0
            else:
                row[i] = (value - means[i]) / stdevs[i]

In [9]:
# Hand-made dataset of three rows and two columns for the standardization demo
dataset = [
    [50, 30],
    [20, 90],
    [30, 50],
]
print(dataset)

[[50, 30], [20, 90], [30, 50]]


In [10]:
# Estimate the mean and sample standard deviation of each column
# (column_stdevs takes the means as input, so they are computed first)
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)
print(means)
print(stdevs)

[33.333333333333336, 56.666666666666664]
[15.275252316519467, 30.550504633038933]


In [11]:
# Standardize the dataset with the estimated means and standard deviations.
# standardize_dataset overwrites the rows in place, so re-running this cell
# would standardize the already-standardized values again.
standardize_dataset(dataset, means, stdevs)
print(dataset)

[[1.0910894511799618, -0.8728715609439694], [-0.8728715609439697, 1.091089451179962], [-0.21821789023599253, -0.2182178902359923]]


In [12]:
from csv import reader

def load_csv(filename):
    """Load a CSV file into a list of rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of the string fields as parsed by
        ``csv.reader``. Rows that parse as empty (blank lines) are skipped.
    """
    # newline='' passes line endings to the csv module untranslated, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are not interpreted correctly.
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

In [13]:
def str_column_to_float(dataset, column):
    """Convert one column of ``dataset`` to float, in place.

    Args:
        dataset: List of rows (lists). The value at index ``column`` of each
            row is overwritten with its float conversion; nothing is
            returned.
        column: Index of the column to convert.

    Raises:
        ValueError: If a string value cannot be parsed as a float.
    """
    for row in dataset:
        value = row[column]
        # Only strings have surrounding whitespace to strip. Other values go
        # straight to float(), so converting an already-converted column
        # again is a no-op rather than an AttributeError.
        if isinstance(value, str):
            value = value.strip()
        row[column] = float(value)

In [14]:
import os
# Load the pima-indians-diabetes dataset. The path is relative, so it is
# resolved against the current working directory. This rebinds `dataset`;
# every field is still a string at this point.
filename = os.path.join('data', 'pima-indians-diabetes.csv')
dataset = load_csv(filename)

In [15]:
# Report the number of rows and the number of columns (taken from the first row)
print(f'Loaded data file {filename} with {len(dataset)} rows and {len(dataset[0])} columns')

Loaded data file data\pima-indians-diabetes.csv with 768 rows and 9 columns


In [16]:
# Convert every column from strings to floats, one column at a time
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# Spot-check the conversion on rows 0, 1 and 767 (index 767 is the last row
# when the file has the 768 rows reported on load)
print(dataset[0])
print(dataset[1])
print(dataset[767])

[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0, 0.0]
[1.0, 93.0, 70.0, 31.0, 0.0, 30.4, 0.315, 23.0, 0.0]


In [17]:
# Estimate the mean and sample standard deviation of every column
# (column_stdevs takes the means as input, so they are computed first).
# NOTE(review): the last column looks like a 0/1 class label; it is included
# in these statistics along with the other columns - confirm that is intended.
means = column_means(dataset)
stdevs = column_stdevs(dataset, means)

In [18]:
# Show the per-column means, then the per-column standard deviations
print(means)
print(stdevs)

[3.8450520833333335, 120.89453125, 69.10546875, 20.536458333333332, 79.79947916666667, 31.992578124999977, 0.4718763020833327, 33.240885416666664, 0.3489583333333333]
[3.3695780626988623, 31.97261819513622, 19.355807170644777, 15.952217567727677, 115.24400235133837, 7.8841603203754405, 0.33132859501277484, 11.76023154067868, 0.4769513772427971]


In [19]:
# Standardize the dataset with the estimated means and standard deviations.
# standardize_dataset overwrites the rows in place, so re-running this cell
# would standardize the already-standardized values again.
standardize_dataset(dataset, means, stdevs)

# Spot-check the result on the same rows printed after the float conversion
print(dataset[0])
print(dataset[1])
print(dataset[767])

[0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067]
[-0.8443348188985389, -1.1226647449053888, -0.16044119073007646, 0.5305558070991293, -0.692439324724129, -0.6839762138098154, -0.3648230303776593, -0.19054772934660633, -0.7316434126904563]
[-0.8443348188985389, -0.8724506413504606, 0.04621513544300315, 0.6559302255151698, -0.692439324724129, -0.2019971766535742, -0.47347649567428407, -0.8708064446897502, -0.7316434126904563]
