# 1. Loading and preparing data

In [5]:
from csv import reader
from math import sqrt

### Define functions
* Loading CSV files
* Converting strings to floats or ints
* Finding min/max, means and stdevs of dataset columns
* Normalizing and Standardizing a dataset

In [6]:
# Load a CSV file, skipping empty rows
# returns list of lists, but all values are strings
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of lists, one inner list per non-empty row. Every value is
        still a string; numeric conversion is left to the caller. A file
        with no non-empty rows yields an empty list.
    """
    dataset = list()
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(filename, 'r', newline='') as file:
        for row in reader(file):
            if not row:
                # reader yields an empty list for a blank line; skip it
                continue
            dataset.append(row)
    # Guard the column count: indexing dataset[0] on an empty file would
    # raise IndexError before the (valid, empty) result could be returned.
    n_columns = len(dataset[0]) if dataset else 0
    print('Loaded data file {0} with {1} rows and {2} columns.'.format(filename, len(dataset), n_columns))
    return dataset

# Convert column to floating point values
def str_column_to_float(dataset, column):
    """Convert one column of string values to floats, in place.

    Args:
        dataset: List of rows (lists); the value at index ``column`` of
            every row must be a string that ``float`` can parse.
        column: Index of the column to convert.
    """
    for record in dataset:
        # Surrounding whitespace is stripped before the value is parsed
        cleaned = record[column].strip()
        record[column] = float(cleaned)
        
# Some algorithms prefer all values to be numeric, including the outcome or predicted value:
# Use set() to get the unique strings, and enumerate() to give each an index int
# Store in dictionary, and replace strings in dataset with integers
def str_column_to_int(dataset, column):
    """Replace the values of one column with integer codes, in place.

    Each distinct value in the column gets an integer index, assigned in
    order of first appearance (0 for the first distinct value seen, 1 for
    the next, and so on). Enumerating a ``set`` instead would make the
    mapping depend on hash ordering, which for strings can differ from one
    interpreter run to the next.

    Args:
        dataset: List of rows (lists); modified in place.
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to its integer code, so that
        downstream code can convert predictions back to the original values.
    """
    lookup = dict()
    for row in dataset:
        # setdefault stores the next free index only the first time a value
        # is seen; either way it returns the index assigned to that value.
        row[column] = lookup.setdefault(row[column], len(lookup))
    return lookup

# Calculating minimum and maximum value in each dataset column
def dataset_minmax(dataset):
    """Find the minimum and maximum value of every column.

    The number of columns is taken from the first row, so that row must be
    complete. Values within a column must be mutually comparable.

    Args:
        dataset: Non-empty list of rows (lists).

    Returns:
        A list with one ``[min, max]`` pair per column; the position in the
        list is the column index.
    """
    width = len(dataset[0])
    # Lazily build each column's values, one column at a time
    columns = ([record[idx] for record in dataset] for idx in range(width))
    return [[min(values), max(values)] for values in columns]

# Normalize dataset, i.e. rescale columns to the range 0-1
def normalize_dataset(dataset, minmax):
    """Rescale every column of the dataset to the range 0-1, in place.

    Each value becomes ``(value - min) / (max - min)`` using that column's
    entry in ``minmax``. A column whose min equals its max carries no
    spread, so its values are set to 0.0 instead of dividing by zero.

    Args:
        dataset: List of rows (lists) of numeric values; modified in place.
        minmax: List of ``[min, max]`` pairs, one per column, in column
            order (the format returned by ``dataset_minmax``).
    """
    for row in dataset:
        for i in range(len(row)):
            low = minmax[i][0]
            span = minmax[i][1] - low
            # Zero span means a constant column: avoid ZeroDivisionError
            row[i] = (row[i] - low) / span if span else 0.0
            
# Calculate the mean of each dataset column
def column_means(dataset):
    """Compute the arithmetic mean of every column.

    The number of columns is taken from the first row.

    Args:
        dataset: Non-empty list of rows (lists) of numeric values.

    Returns:
        A list of column means; the position in the list is the column index.
    """
    averages = []
    for idx in range(len(dataset[0])):
        # Gather this column's value from every row
        column = [record[idx] for record in dataset]
        # float() on the count keeps the division from truncating
        averages.append(sum(column) / float(len(column)))
    return averages

# Calculate the standard deviation of each dataset column, assuming means already calculated
# Returns list of column standard deviations, index = column number
def column_stdev(dataset, means):
    """Compute the sample standard deviation of every column.

    Uses the ``n - 1`` denominator, so a dataset with a single row raises
    ZeroDivisionError.

    Args:
        dataset: Non-empty list of rows (lists) of numeric values.
        means: Precomputed column means, indexed by column number.

    Returns:
        A list of column standard deviations; the position in the list is
        the column index.
    """
    # Sum of squared deviations from the mean, one total per column
    squared_sums = [
        sum([pow(record[idx] - means[idx], 2) for record in dataset])
        for idx in range(len(dataset[0]))
    ]
    degrees_of_freedom = float(len(dataset) - 1)
    # Divide by n - 1 and take the square root to get each standard deviation
    return [sqrt(total / degrees_of_freedom) for total in squared_sums]

# Standardize dataset
def standardize_dataset(dataset, means, stdevs):
    """Standardize every column of the dataset, in place.

    Each value becomes ``(value - mean) / stdev`` using that column's mean
    and standard deviation. A column whose standard deviation is zero has
    no spread, so its values are set to 0.0 instead of dividing by zero.

    Args:
        dataset: List of rows (lists) of numeric values; modified in place.
        means: Column means, indexed by column number.
        stdevs: Column standard deviations, indexed by column number.
    """
    for row in dataset:
        for i in range(len(row)):
            spread = stdevs[i]
            # Zero spread means a constant column: avoid ZeroDivisionError
            row[i] = (row[i] - means[i]) / spread if spread else 0.0

### Testing normalization

In [7]:
# Sanity-check normalization on a tiny hand-made dataset before using real data
dataset = [[50, 30], [20, 90]]  
# Two rows of two columns each: [x, y], [x, y]
print("Dataset:", dataset)
# Per-column [min, max] pairs, in column order
minmax = dataset_minmax(dataset)
print("minmax:", minmax)
# Rescale each column to the range 0-1; normalize_dataset modifies the rows in place,
# so `dataset` itself holds the normalized values afterwards
normalize_dataset(dataset, minmax)
print("Normalized:", dataset, "\n")

# Repeat the same steps on the pima-indians-diabetes dataset loaded from CSV
filename = 'data/pima-indians-diabetes.csv'
dataset = load_csv(filename)
# load_csv returns every value as a string, so convert each column to float
# (column count is taken from the first row)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print("Pima as floats:", dataset[0])
# Calculate min and max for each column
minmax = dataset_minmax(dataset)
# Normalize columns to 0-1 (in place); only the first row is printed as a spot check
normalize_dataset(dataset, minmax)
print("Pima normalized:", dataset[0], "\n")

Dataset: [[50, 30], [20, 90]]
minmax: [[20, 50], [30, 90]]
Normalized: [[1.0, 0.0], [0.0, 1.0]] 

Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
Pima as floats: [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
Pima normalized: [0.35294117647058826, 0.7437185929648241, 0.5901639344262295, 0.35353535353535354, 0.0, 0.5007451564828614, 0.23441502988898377, 0.48333333333333334, 1.0] 



### Testing standardization

In [8]:
# Sanity-check standardization on a tiny hand-made dataset: list of 3 rows, each with two values
dataset = [[50, 30], [20, 90], [30, 50]]
print("Dataset:",dataset)
# Estimate per-column mean and standard deviation;
# column_stdev needs the means computed first
means = column_means(dataset)
stdevs = column_stdev(dataset, means)
print("Means:", means)
print("St.devs.:", stdevs)
# Standardize dataset; standardize_dataset modifies the rows in place,
# so `dataset` itself holds the standardized values afterwards
standardize_dataset(dataset, means, stdevs)
print("Standardized:", dataset, "\n")

# Repeat the same steps on the pima-indians-diabetes dataset, reloaded from CSV
dataset = load_csv('data/pima-indians-diabetes.csv')
# load_csv returns every value as a string, so convert each column to float
# (column count is taken from the first row)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)
print("Pima as floats:", dataset[0])
# Calculate column means and st. devs.
means = column_means(dataset)
stdevs = column_stdev(dataset, means)
# Standardize dataset (in place); only the first row is printed as a spot check
standardize_dataset(dataset, means, stdevs)
print("Pima standardized:", dataset[0], "\n")

Dataset: [[50, 30], [20, 90], [30, 50]]
Means: [33.333333333333336, 56.666666666666664]
St.devs.: [15.275252316519467, 30.550504633038933]
Standardized: [[1.0910894511799618, -0.8728715609439694], [-0.8728715609439697, 1.091089451179962], [-0.21821789023599253, -0.2182178902359923]] 

Loaded data file data/pima-indians-diabetes.csv with 768 rows and 9 columns.
Pima as floats: [6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0]
Pima standardized: [0.6395304921176576, 0.8477713205896718, 0.14954329852954296, 0.9066790623472505, -0.692439324724129, 0.2038799072674717, 0.468186870229798, 1.4250667195933604, 1.3650063669598067] 

