In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.datasets import make_classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# The Curse of Dimensionality

In [None]:
# Sample counts to benchmark: 20 log-spaced values between 10**1 and 10**4.
# The values are floats; consumers cast them with int() before use.
n_vals = np.logspace(start=1, stop=4, num=20)

In [None]:
# Fixed seed so that every Restart & Run All benchmarks the same synthetic data
# instead of a fresh random draw per run.
DATASET_SEED = 0

# One synthetic classification problem per sample count, keyed by int(n).
# Each value is whatever make_classification returns; the timing helpers unpack
# it as the positional arguments of fit() and use element 0 for predict().
datasets = {
    int(n): make_classification(int(n), random_state=DATASET_SEED)
    for n in n_vals
}

# Estimators under comparison, keyed by the short label that ends up in the
# result rows and plot legends. All use their default hyper-parameters.
models = dict(
    knn=KNeighborsClassifier(),
    lr=LogisticRegression(),
    dt=DecisionTreeClassifier(),
    svm=SVC(),
)

In [None]:
from time import time

In [None]:
def time_fit(data, model):
    """Return how many seconds ``model.fit(*data)`` takes.

    Args:
        data: Sequence unpacked into the positional arguments of ``model.fit``
            (presumably an ``(X, y)`` pair -- confirm against the caller).
        model: Object exposing a ``fit`` method; the method is called once.

    Returns:
        Elapsed time in seconds as a float (never negative).
    """
    # perf_counter is monotonic and uses the highest-resolution clock available,
    # unlike time.time(), which can jump when the system clock is adjusted and
    # may be too coarse for very short fits. Imported before the timer starts
    # so the import itself is not measured.
    from time import perf_counter

    start = perf_counter()
    model.fit(*data)
    return perf_counter() - start

def time_predict(data, model):
    """Return how many seconds ``model.predict(data[0])`` takes.

    Args:
        data: Indexable whose first element is passed to ``model.predict``
            (presumably the feature matrix of an ``(X, y)`` pair -- confirm
            against the caller).
        model: Object exposing a ``predict`` method; its return value is
            discarded, only the duration is reported.

    Returns:
        Elapsed time in seconds as a float (never negative).
    """
    # perf_counter is monotonic and uses the highest-resolution clock available,
    # unlike time.time(), which can jump when the system clock is adjusted and
    # may be too coarse for very short predictions. Imported before the timer
    # starts so the import itself is not measured.
    from time import perf_counter

    start = perf_counter()
    model.predict(data[0])
    return perf_counter() - start

def fit_predict_model_on_n(model_name, n):
    """Time fitting and predicting for one model on one dataset.

    Looks up the estimator in the module-level ``models`` dict and the data in
    the module-level ``datasets`` dict, fits first, then predicts with the
    fitted estimator on the same data.

    Args:
        model_name: Key into ``models``.
        n: Key into ``datasets``.

    Returns:
        A dict with the keys ``model_name``, ``n``, ``fit_time`` and
        ``predict_time`` (in that order).
    """
    estimator = models[model_name]
    samples = datasets[n]

    timings = dict(model_name=model_name, n=n)
    # Fit must run before predict: predict is timed on the estimator just fitted.
    timings['fit_time'] = time_fit(samples, estimator)
    timings['predict_time'] = time_predict(samples, estimator)
    return timings

In [None]:
# Benchmark every (dataset size, model) pair; one timing record per pair,
# ordered by dataset size first and model second.
results = []
for n in datasets:
    for model in models:
        timing_row = fit_predict_model_on_n(model, n)
        results.append(timing_row)

In [None]:
# Tabulate the timing records: one row per (model, n) pair, one column per dict key.
results_df = pd.DataFrame(data=results)

In [None]:
# Peek at the first rows to sanity-check the columns.
results_df.head(n=5)

In [None]:
# Fit time (left) and prediction time (right) against dataset size, linear x-axis.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# One line per model on each panel; sort=False keeps the models in order of
# first appearance, matching results_df.model_name.unique().
for model_name, model_results_df in results_df.groupby('model_name', sort=False):
    ax1.plot(model_results_df.n, model_results_df.fit_time, label=model_name)
    ax2.plot(model_results_df.n, model_results_df.predict_time, label=model_name)

# Titles, axis labels and legends do not depend on the model, so they are set
# once per panel rather than on every loop iteration.
for ax, title in ((ax1, 'Fit Time'), (ax2, 'Prediction Time')):
    ax.set_title(title)
    ax.set_xlabel('n (dataset size)')
    ax.set_ylabel('time (seconds)')
    ax.legend()

In [None]:
# Same comparison as the linear plot, but with a logarithmic x-axis so the
# log-spaced dataset sizes are evenly spread.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# One line per model on each panel; sort=False keeps the models in order of
# first appearance, matching results_df.model_name.unique().
for model_name, model_results_df in results_df.groupby('model_name', sort=False):
    ax1.plot(model_results_df.n, model_results_df.fit_time, label=model_name)
    ax2.plot(model_results_df.n, model_results_df.predict_time, label=model_name)

# Titles, scale, axis labels and legends do not depend on the model, so they
# are set once per panel rather than on every loop iteration.
for ax, title in ((ax1, 'Fit Time'), (ax2, 'Prediction Time')):
    ax.set_title(title)
    ax.set_xscale('log')
    ax.set_xlabel('n (dataset size, log scale)')
    ax.set_ylabel('time (seconds)')
    ax.legend()