In [3]:
import itertools
#from ggplot import ggplot, aes, geom_point
import numpy as np
import pandas as pd

dating_test_set = 'Tornadoes_SPC_1950to2015.csv'

column_names = [
    'Number of frequent flyer miles earned per year',
    'Percentage of time spent playing video games',
    'Liters of ice cream consumed per week',
    'Category'
]


def load_file(filepath):
    """
    Loads data in tab-separated format
    Args:
        filepath: Location of data file
    Returns:
        Data and labels extracted from text file
    """
    data = []
    labels = []
    with open(filepath) as infile:
        for line in infile:
            row = line.strip().split('\t')
            data.append(row[:-1])
            labels.append(row[-1])
    return data, labels


def normalize(df):
    """
    Normalizes data to give equal weight to each features.
    General formula:
        norm_value = (value - min_value) / (max_value - min_value)
    Args:
        df: Pandas data frame with unnormalized data
    Returns:
        Normalized dataframe, range of values, min values
    """
    min_values = df.min()
    max_values = df.max()
    range_values = max_values - min_values
    norm_df = (df - min_values) / range_values
    return norm_df, range_values, min_values


def classify(input_data, training_set, labels, k=1):
    """
    Uses kNN algorithm to classify input data given a set of
    known data.
    Args:
        input_data: Pandas Series of input data
        training_set: Pandas Data frame of training data
        labels: Pandas Series of classifications for training set
        k: Number of neighbors to use
    Returns:
        Predicted classification for given input data
    """
    distance_diff = training_set - input_data
    distance_squared = distance_diff**2
    distance = distance_squared.sum(axis=1)**0.5
    distance_df = pd.concat([distance, labels], axis=1)
    distance_df.sort(columns=[0], inplace=True)
    top_knn = distance_df[:k]
    return top_knn[1].value_counts().index.values[0]


def plot(df, x, y, color):
    """
    Scatter plot with two of the features (x, y) grouped by classification (color)
    Args:
        df: Dataframe of data
        x: Feature to plot on x axis
        y: Feature to plot on y axis
        color: Group by this column
    """
    print(ggplot(df, aes(x=x, y=y, color=color)) + geom_point())


def main():

    # Load data
    raw_data, raw_labels = load_file(dating_test_set)

    # Convert data to Pandas data structures
    labels = pd.Series(raw_labels, name=column_names[3])
    df = pd.DataFrame.from_records(np.array(raw_data, np.float32), columns=column_names[:3])
    df[column_names[3]] = labels

    plot(df, column_names[1], column_names[2], column_names[3])

    """
    # Normalize data since ranges of values are different
    norm_df, range_values, min_values = normalize(df)
    # Use first 10% of data for testing
    num_test_rows = int(norm_df.shape[0] * .1)
    # 90% training data
    training_df = norm_df[num_test_rows:]
    training_labels = labels[num_test_rows:]
    # 10% training data
    test_df = norm_df[:num_test_rows]
    test_labels = labels[:num_test_rows]
    # Apply kNN algorithm to all test data
    result_df = test_df.apply(lambda row: classify(row, training_df, training_labels, k=3), axis=1)
    # Calculate the number of correct predictions
    error_df = result_df == test_labels
    print error_df.value_counts()
    """

if __name__ == '__main__':
    main()

AssertionError: 3 columns passed, passed data had 0 columns