In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

For many machine learning algorithms it's important to scale, or normalize, the data before using it. Many algorithms compute the Euclidean distance between two observations, and if one of the features has vastly larger values than another, the distance will be dominated by that particular feature. To standardize the data, subtract the feature's mean from each value and then divide by the feature's standard deviation; alternatively, normalize the data by rescaling each feature to the [0, 1] range using its minimum and maximum.

In [3]:
# Load the glass dataset from a CSV file; the path is relative to the working directory.
glass = pd.read_csv('dati/glass.data.csv')

In [4]:
# Preview the first two rows to check which columns were loaded.
glass.head(2)

Unnamed: 0,Id number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0,0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0,0,1


In [5]:
# Remove the 'Id number' column (a row identifier, not a measurement).
# The result is rebound instead of using inplace=True, and errors='ignore'
# makes the cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the already-dropped column.
glass = glass.drop(columns='Id number', errors='ignore')

In [11]:
def standardize(columns, data):
    """
    Standardize the given columns of a dataframe (z-score scaling).

    Each listed column is replaced by ``(x - mean) / std``, where ``std`` is
    computed with ``np.std`` and its default settings (population standard
    deviation). Columns that are not listed are left untouched.

    A column whose values are all equal has no spread to divide by; it is
    mapped to 0.0 instead of producing NaN/inf from a division by zero.

    Parameters
    ----------
    columns : iterable of column labels to standardize.
    data : the dataframe to rescale; it is modified in place.

    Returns
    -------
    The same ``data`` object that was passed in, so the call can also be used
    in an assignment or chained.
    """
    for col in columns:
        values = data[col]
        # Detect a constant column by comparing min and max rather than
        # testing std == 0: rounding in the mean can leave a tiny non-zero
        # std for a column of identical floats.
        if np.min(values) == np.max(values):
            # x - x gives zeros while keeping missing values missing;
            # the multiplication forces a float result like the other columns.
            data[col] = (values - values) * 1.0
        else:
            data[col] = (values - np.mean(values)) / np.std(values)
    return data

In [13]:
# Standardize the nine feature columns of `glass`; the 'Type of glass'
# column is not listed and is therefore left as it is.
# The trailing semicolon keeps the cell from echoing any value.
standardize(
    ['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe'],
    glass,
);

In [14]:
# Preview the first two rows again to inspect the rescaled feature values.
glass.head(2)

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type of glass
0,0.872868,0.284953,1.254639,-0.692442,-1.127082,-0.671705,-0.145766,-0.352877,-0.586451,1
1,-0.249333,0.591817,0.636168,-0.17046,0.102319,-0.026213,-0.793734,-0.352877,-0.586451,1


In [1]:
def normalize(columns, data):
    """
    Normalize the given columns of a dataframe to the [0, 1] range (min-max scaling).

    Each listed column is replaced by ``(x - min) / (max - min)``. Columns
    that are not listed are left untouched.

    A column whose values are all equal has a zero range; it is mapped to 0.0
    instead of producing NaN from a 0/0 division.

    Parameters
    ----------
    columns : iterable of column labels to normalize.
    data : the dataframe to rescale; it is modified in place.

    Returns
    -------
    The same ``data`` object that was passed in, so the call can also be used
    in an assignment or chained.
    """
    for col in columns:
        values = data[col]
        # Compute the extremes once instead of calling np.min twice.
        low = np.min(values)
        span = np.max(values) - low
        if span == 0:
            # x - x gives zeros while keeping missing values missing;
            # the multiplication forces a float result like the other columns.
            data[col] = (values - values) * 1.0
        else:
            data[col] = (values - low) / span
    return data