# Selecting the right bandwidth

In this problem we will try out band-width selection in two steps. 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KernelDensity

In [9]:
def read_bh_masses(fname='../../../Datafiles/joint-bh-mass-table.csv'):
    """
    Read in black hole masses from CSV file

    Parameters
    ----------
    fname : str, optional
        Path of the table file to read. The default is a relative path,
        so you might need to pass a different one depending on where
        this is run from.

    Returns
    -------
    astropy.table.Table
        The table produced by ``Table.read(fname)``.
    """
    # ``read`` is called on the class itself; there is no need to build
    # an empty Table instance first.
    return Table.read(fname)

In [10]:
# Load the black hole mass table into t_bh.
t_bh = read_bh_masses()

In [11]:
def show_one_bandwidth(t, bw, what='MBH', xmin=0, xmax=70):
    """
    Show the KDE estimate of one table column for a single bandwidth.

    A Gaussian kernel density estimate is fit to ``t[what]`` and plotted
    on top of a density-normalised histogram of the same values using
    the pyplot interface.

    Parameters
    ----------
    t : table-like
        Object for which ``t[what]`` yields a 1-D array of values
        (presumably the black hole mass table; confirm with the caller).
    bw : float
        Bandwidth passed to ``KernelDensity``.
    what : str, optional
        Name of the column to use.
    xmin, xmax : float, optional
        End points of the 200-point grid on which the KDE is evaluated.

    Returns
    -------
    KernelDensity
        The fitted estimator.
    """
    # scikit-learn expects a 2-D (n_samples, n_features) array.
    X = t[what][:, np.newaxis]
    kde = KernelDensity(bandwidth=bw, kernel='gaussian').fit(X)

    xgrid = np.linspace(xmin, xmax, 200)
    # score_samples returns log-densities, so exponentiate for plotting.
    p_bh = np.exp(kde.score_samples(xgrid[:, np.newaxis]))

    # ``density=True`` normalises the histogram to unit area. It replaces
    # the old ``normed`` keyword, which current Matplotlib rejects.
    plt.hist(t[what], density=True)
    plt.plot(xgrid, p_bh)
    plt.title("Bandwidth={0}".format(bw))
    plt.xlabel(r'$M_{BH}$')
    plt.ylabel(r'$P(M_{BH})$')

    return kde

def score_one_bandwidth(t, bw, kernel='gaussian', what='MBH'):
    """
    Fit a KDE and get its score on the fitted sample.

    The score is evaluated on the very same values the KDE was fit to,
    not on held-out data.

    Parameters
    ----------
    t : table-like
        Object for which ``t[what]`` yields a 1-D array of values.
    bw : float
        Bandwidth passed to ``KernelDensity``.
    kernel : str, optional
        Kernel name passed to ``KernelDensity``.
    what : str, optional
        Name of the column to fit, matching the ``what`` argument of
        ``show_one_bandwidth``.

    Returns
    -------
    float
        The log likelihood of the data, as given by ``kde.score``.
    """
    # scikit-learn expects a 2-D (n_samples, n_features) array.
    X = t[what][:, np.newaxis]
    kde = KernelDensity(bandwidth=bw, kernel=kernel).fit(X)

    return kde.score(X)

def score_many_bandwidths(t, kernel='gaussian', bws=None):
    """
    Fit many bandwidths and score each one.

    Parameters
    ----------
    t : table-like
        Data passed through unchanged to ``score_one_bandwidth``.
    kernel : str, optional
        Kernel name passed through to ``score_one_bandwidth``.
    bws : sequence of float, optional
        Bandwidths to try. When omitted, 100 evenly spaced values
        between 1 and 7 are used.

    Returns
    -------
    bws
        The bandwidths that were scored (the input, or the default grid).
    scores : numpy.ndarray
        Float array with one score per bandwidth, in the same order.
    """
    if bws is None:
        bws = np.linspace(1, 7.0, 100)

    # Force a float result: an array created with np.zeros_like(bws)
    # would inherit an integer dtype from integer bandwidths and
    # silently truncate the scores stored in it.
    scores = np.array(
        [score_one_bandwidth(t, bw, kernel=kernel) for bw in bws],
        dtype=float,
    )

    return bws, scores
    

## Implement k-fold CV next.