In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import math
from scipy import stats
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
import matplotlib.pyplot as pylab

In [None]:
def knn_test(predictors, response, data, test):
    """Pick k for a KNN regressor by 5-fold cross-validation and score it on ``test``.

    Every k from 1 to 50 is scored with the mean negative MSE over 5 folds of
    ``data``; the best k (the larger one on ties) is refit on all of ``data``
    and evaluated on ``test``. The chosen k is printed.

    Parameters
    ----------
    predictors : column label(s) selecting the feature columns of ``data``/``test``.
    response : column label selecting the target column of ``data``/``test``.
    data : training table, indexed as ``data[predictors]`` / ``data[response]``.
    test : hold-out table, indexed the same way.

    Returns
    -------
    tuple
        ``(test_rmse, cv_rmse)`` - RMSE on ``test`` and the cross-validated
        RMSE of the selected k.
    """
    train_X = data[predictors]
    train_y = data[response]

    top_score = -np.inf
    for n_neighbours in np.arange(1, 51):
        candidate = KNeighborsRegressor(n_neighbors=n_neighbours)
        fold_scores = cross_val_score(
            candidate, train_X, train_y, cv=5, scoring='neg_mean_squared_error'
        )
        # scores are negated MSEs, so larger is better
        mean_score = np.mean(fold_scores)
        if mean_score >= top_score:
            top_score, winner = mean_score, candidate

    # re-estimate the selected model on the complete training set
    winner.fit(train_X, train_y)
    # evaluate the refitted model on the hold-out data
    test_predictions = winner.predict(test[predictors])
    holdout_rmse = np.sqrt(mean_squared_error(test[response], test_predictions))
    crossval_rmse = np.sqrt(-top_score)
    print('Chosen K: {}'.format(winner.n_neighbors))

    return holdout_rmse, crossval_rmse

In [None]:
def knn_test_mahalanobis(predictors, response, data, test):
    """Select k for a Mahalanobis-distance KNN regressor by 5-fold CV and test it.

    The covariance matrix passed to the metric (``V``) is computed once from
    ``data[predictors]``. Every k from 1 to 50 is scored with the mean
    negative MSE over 5 folds of ``data``; the best k (the larger one on
    ties) is refit on all of ``data`` and evaluated on ``test``.

    Parameters
    ----------
    predictors : column label(s) selecting the feature columns of ``data``/``test``.
    response : column label selecting the target column of ``data``/``test``.
    data : training table, indexed as ``data[predictors]`` / ``data[response]``.
    test : hold-out table, indexed the same way.

    Returns
    -------
    tuple
        ``(test_rmse, cv_rmse, k)`` - RMSE on ``test``, cross-validated RMSE
        of the selected model, and the selected number of neighbours.
    """
    neighbours = np.arange(1, 51)
    best_score = -np.inf

    X_train = data[predictors]
    y_train = data[response]
    # The covariance does not depend on k, so compute it once rather than
    # once per candidate.
    covariance = X_train.cov()

    for k in neighbours:
        knn = KNeighborsRegressor(n_neighbors=k, metric='mahalanobis', metric_params={'V': covariance})
        scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        # average of the negated MSE across the 5 folds (larger is better)
        cv_score = np.mean(scores)
        # use the cv score for model selection
        if cv_score >= best_score:
            best_score = cv_score
            best_knn = knn

    knn = best_knn
    # train the selected model with the whole train set
    knn.fit(X_train, y_train)
    # predict the test data with the selected and re-estimated model
    predictions = knn.predict(test[predictors])
    test_rmse = np.sqrt(mean_squared_error(test[response], predictions))
    cv_rmse = np.sqrt(-best_score)

    return test_rmse, cv_rmse, knn.n_neighbors

In [None]:
def knn_test_standardised(predictors, response, data, test):
    """Select k for a KNN regressor on standardised predictors and test it.

    Predictors are centred and scaled with the mean and standard deviation of
    ``data[predictors]``; the same statistics are applied to ``test``. Every
    k from 1 to 30 is scored with the mean negative MSE over 5 folds of the
    standardised training data; the best k (the larger one on ties) is refit
    on the whole standardised training set and evaluated on the standardised
    test set.

    Parameters
    ----------
    predictors : column label(s) selecting the feature columns of ``data``/``test``.
    response : column label selecting the target column of ``data``/``test``.
    data : training table, indexed as ``data[predictors]`` / ``data[response]``.
    test : hold-out table, indexed the same way.

    Returns
    -------
    tuple
        ``(test_rmse, cv_rmse, k)`` - RMSE on ``test``, cross-validated RMSE
        of the selected model, and the selected number of neighbours.
    """
    neighbours = np.arange(1, 31)
    y_train = data[response]
    y_test = test[response]

    # Scale both sets with the training statistics only.
    # NOTE: a predictor that is constant in ``data`` has sigma == 0 and the
    # division below then produces NaN/inf for that column.
    mu = data[predictors].mean()
    sigma = data[predictors].std()
    standardised_knn_train = (data[predictors] - mu) / sigma
    standardised_knn_test = (test[predictors] - mu) / sigma

    # Start from -inf so the first finite score is always accepted. A finite
    # floor such as -100 leaves ``best_knn`` unbound whenever every candidate
    # has a cross-validated MSE above 100.
    best_score = -np.inf
    for k in neighbours:
        knn = KNeighborsRegressor(n_neighbors=k)
        scores = cross_val_score(knn, standardised_knn_train, y_train, cv=5, scoring='neg_mean_squared_error')
        # average of the negated MSE across the 5 folds (larger is better)
        cv_score = np.mean(scores)
        # use the cv score for model selection
        if cv_score >= best_score:
            best_score = cv_score
            best_knn = knn

    knn = best_knn
    # train the selected model with the whole (standardised) train set
    knn.fit(standardised_knn_train, y_train)
    # predict the test data with the selected and re-estimated model
    predictions = knn.predict(standardised_knn_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))
    cv_rmse = np.sqrt(-best_score)

    return test_rmse, cv_rmse, knn.n_neighbors

In [26]:
def Gradient_Ascent_Algo(X, y, beta, alpha, numIterations):
    """Run a fixed number of batch gradient-ascent steps for a linear model.

    The objective recorded at each step (called the likelihood in this
    notebook) is ``L = -(1/(2N)) * sum((y - X @ beta)**2)`` and the update is
    ``beta <- beta + alpha * (1/N) * X.T @ (y - X @ beta)``.

    Parameters
    ----------
    X : array-like, N rows by ``len(beta)`` columns.
    y : array-like of N responses.
    beta : array-like of starting coefficients.
    alpha : learning rate (step size).
    numIterations : number of update steps to perform.

    Returns
    -------
    beta : coefficients after the last update.
    likelihood_values : array of shape ``(numIterations, 1)``; row ``i`` is
        the objective evaluated *before* update ``i``.
    beta_values : array of shape ``(numIterations + 1, len(beta))``; row 0 is
        the starting ``beta`` and row ``i + 1`` is the value after update ``i``.
    """
    # Work on ndarrays so lists and pandas objects behave the same way.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    beta = np.asarray(beta, dtype=float)

    # sample size
    N = len(X)
    XTrans = X.transpose()
    # storage for the objective at every iteration and for the parameter path
    likelihood_values = np.zeros((numIterations, 1))
    beta_values = np.zeros((numIterations + 1, len(beta)))
    beta_values[0, :] = beta

    for i in range(numIterations):
        # residuals of the current fit
        diff = y - np.dot(X, beta)
        # np.sum keeps the reduction vectorised (the builtin sum loops in Python)
        likelihood_values[i] = -(1 / (2 * N)) * np.sum(np.square(diff))
        # gradient of the objective in matrix form
        grad = (1 / N) * np.matmul(XTrans, diff)
        # update all parameters simultaneously with learning rate alpha
        beta = beta + alpha * grad
        # record the new estimate
        beta_values[i + 1, :] = beta.transpose()

    return beta, likelihood_values, beta_values

In [46]:
def Gradient_Ascent_Algo_Stop(X, y, beta, alpha, numIterations, precision):
    """Batch gradient ascent for a linear model with an early-stopping rule.

    The objective recorded at each step (called the likelihood in this
    notebook) is ``L = -(1/(2N)) * sum((y - X @ beta)**2)`` and the update is
    ``beta <- beta + alpha * (1/N) * X.T @ (y - X @ beta)``. Iteration stops
    early as soon as the Euclidean norm of the gradient drops below
    ``precision``.

    Parameters
    ----------
    X : array-like, N rows by ``len(beta)`` columns.
    y : array-like of N responses.
    beta : array-like of starting coefficients.
    alpha : learning rate (step size).
    numIterations : maximum number of update steps.
    precision : gradient-norm threshold below which iteration stops.

    Returns
    -------
    beta : the final coefficients.
    likelihood_values : one row per objective evaluation actually performed.
    beta_values : the parameter path; row 0 is the starting ``beta`` and the
        last row always equals the returned ``beta``.
    """
    # Work on ndarrays so lists and pandas objects behave the same way.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    beta = np.asarray(beta, dtype=float)

    # sample size
    N = len(X)
    XTrans = X.transpose()
    # storage for the objective at every iteration and for the parameter path
    likelihood_values = np.zeros((numIterations, 1))
    beta_values = np.zeros((numIterations + 1, len(beta)))
    beta_values[0, :] = beta

    # Counters kept separately from the loop index so the returned slices are
    # right when the loop never runs and when it runs to completion.
    n_evaluated = 0
    n_updates = 0

    for i in range(numIterations):
        # residuals of the current fit
        diff = y - np.dot(X, beta)
        likelihood_values[i] = -(1 / (2 * N)) * np.sum(np.square(diff))
        n_evaluated = i + 1

        # gradient of the objective in matrix form
        grad = (1 / N) * np.matmul(XTrans, diff)
        # Test the gradient's magnitude: summing the components lets entries
        # of opposite sign cancel and would stop far from the optimum.
        if np.linalg.norm(grad) < precision:
            break

        # update all parameters simultaneously with learning rate alpha
        beta = beta + alpha * grad
        # record the new estimate
        beta_values[i + 1, :] = beta.transpose()
        n_updates = i + 1

    return beta, likelihood_values[:n_evaluated], beta_values[:n_updates + 1]