### Problem 3 - RANSAC
##### When data is scattered, the least squares method to fit a curve might not be the best choice. In this problem, you are given data for health insurance costs based on the person’s age. There are other fields as well, but you have to fit a line only for age and insurance cost data.

###### The data is stored in the file - 'dataset.csv'

>> Read and plot csv file

In [None]:
import csv
import numpy as np
# Load the two columns of interest from the CSV; every other field is ignored.
age = []
cost = []
# newline='' hands newline handling to the csv module, as its documentation
# requires; otherwise quoted fields containing line breaks can be misparsed.
with open('dataset.csv', 'r', newline='') as theFile:
    reader = csv.DictReader(theFile)
    for line in reader:
        # 'age' and 'charges' are the column headers read from the file.
        age.append(float(line['age']))
        cost.append(float(line['charges']))


##### 1. Compute the covariance matrix (from scratch) and find its eigenvalues and eigenvectors. 
###### Plot the eigenvectors on the same graph as the data.

In [None]:
# First moments: arithmetic mean of each variable.
mean_age = sum(age) / len(age)
mean_cost = sum(cost) / len(cost)

# Accumulators for the second moments about the means.
var_age = 0.0
var_cost = 0.0
covar_ = 0.0
# NOTE(review): covar_cost and tls_dem are initialised but never updated in
# the visible code; kept in case a later cell reads them.
covar_cost = 0.0
tls_dem = 0.0
for i, age_i in enumerate(age):
    d_age = age_i - mean_age
    d_cost = cost[i] - mean_cost
    var_age += d_age**2      # also the denominator of the least-squares slope
    var_cost += d_cost**2
    covar_ += d_age*d_cost   # also the numerator of the least-squares slope

# Normalise by the sample count N (population form, not N - 1).
n_samples = len(age)
var_age /= n_samples
var_cost /= n_samples
covar_ /= n_samples

print('Mean age: ', mean_age, ', Mean cost: ', mean_cost, ', Variance age: ',
      var_age, ', Variance cost: ', var_cost, ', Covariance: ', covar_)


#### Covariance Matrix:

In [None]:
import matplotlib.pyplot as plot
from numpy import linalg as LA

# 2x2 covariance matrix of (age, cost) assembled from the hand-computed
# moments. A plain ndarray is used because NumPy no longer recommends
# np.matrix; it also makes the column slices below flat (2,) vectors.
covariance_mat = np.array([[var_age, covar_], [covar_, var_cost]])
print('Covariance Matrix', covariance_mat)

# LA.eig returns the eigenvectors as COLUMNS: eigen_vector[:, i] belongs to
# eigen_values[i].
eigen_values, eigen_vector = LA.eig(covariance_mat)
print('Eigen Values :\n', eigen_values, '\nEigen Vector :\n', eigen_vector)

# Raw samples stacked as a 2 x N array (row 0 = age, row 1 = cost).
# np.cov(cov_mat, bias=True) can be used to cross-check covariance_mat.
cov_mat = np.stack((age, cost), axis=0)


fig1 = plot.figure(figsize=(40, 15))
plot.subplot(121)


# Plotting Eigen vectors
# Anchor point of both arrows, given in data coordinates (age, cost).
origin = [40, 30000]

# Copy each eigenvector column; each unpacks to its (x, y) components.
eig_vec1 = np.array(eigen_vector[:, 0])
eig_vec2 = np.array(eigen_vector[:, 1])


print(eig_vec1)
print(eig_vec2)

# quiver(X, Y, U, V): one arrow per eigenvector, both starting at `origin`.
plot.quiver(*origin, *eig_vec1, color=['r'], scale=15)
plot.quiver(*origin, *eig_vec2, color=['b'], scale=21)

plot.xlabel('Age')
plot.ylabel('Cost')
plot.scatter(age, cost, c="pink")
plot.title('Age vs Insurance plot with eigen vectors')


#### 2.1 Fit a line to the data using the linear least squares method

In [None]:
# Ordinary least squares for cost = m*age + c:
# slope = cov(age, cost) / var(age); the intercept makes the line pass
# through the point of means.
m = covar_/var_age
c = mean_cost - m*mean_age

Y_pred = m*np.array(age) + c

# Evaluate the fitted line at the two extreme ages. Pairing min(Y_pred) with
# min(age) is only correct when the slope is non-negative; evaluating the
# model at the end points is right for either sign.
x_line = np.array([min(age), max(age)])
y_line = m*x_line + c

fig1 = plot.figure(figsize=(40, 15))
plot.subplot(121)
plot.xlabel('Age')
plot.ylabel('Cost')
plot.scatter(age, cost, c="pink", label="CSV data")
plot.plot(x_line, y_line,
          color='black', label='Linear least square')  # predicted
plot.legend()

# Uncomment and run to validate
# A = np.vstack([np.array(age), np.ones(len(age))]).T
# m, c = np.linalg.lstsq(A, cost, rcond=None)[0]
# q = plot.plot(np.array(age), m*np.array(age) + c, 'r', label='Fitted line')


# Only one title is set: an earlier 'Dataset' title was always overwritten
# by this call before the figure was saved.
plot.title('Linear least square')
plot.savefig('LLS.png')


#### Line fitting using total least squares (aka orthogonal linear regression)

In [None]:
def tls(x,y):
    """Fit a line to (x, y) from the scatter matrix of the rescaled data.

    Both inputs are min-max rescaled to [0, 1] and mean-centred, the 2x2
    product ``A.T @ A`` of the centred data is formed and passed to ``SVD``.

    NOTE(review): ``SVD`` is not defined in this block; it is assumed to be
    a helper defined elsewhere that returns four values, the first being a
    matrix whose columns are eigen/singular vectors - TODO confirm.
    NOTE(review): a later ``def tls`` in this file rebinds the name.

    Parameters
    ----------
    x, y : array-like supporting elementwise arithmetic
        Sample coordinates. A constant input makes the rescaling divide by
        zero.

    Returns
    -------
    (a, b, c)
        ``a, b`` are the entries of the last column of the first value
        returned by ``SVD``; ``c = a*mean(x) + b*mean(y)`` using the rescaled
        means. Presumably the fitted line is ``a*x + b*y = c`` in the
        rescaled coordinates - confirm against ``SVD``'s ordering.
    """
    # Min-max rescale each coordinate.
    x_lo = np.min(x)
    y_lo = np.min(y)
    x = (x - x_lo)/(np.max(x) - x_lo)
    y = (y - y_lo)/(np.max(y) - y_lo)

    x_bar = sum(x)/len(x)
    y_bar = sum(y)/len(y)

    # N x 2 matrix of centred samples and its 2 x 2 scatter matrix.
    centred = np.vstack((x - x_bar, y - y_bar)).T
    scatter = np.dot(centred.T, centred)

    # Only the first of the four returned values is used.
    V, _U, _S, _H = SVD(scatter)
    a, b = V[:, -1]
    return a, b, a*x_bar + b*y_bar


In [None]:
import random
from scipy.odr import *


def tls(X, y):
    """Solve the total least squares problem ``y ~ X @ a`` via the SVD.

    Errors are allowed in both ``X`` and ``y``: the augmented matrix
    ``Z = [X | y]`` is corrected by the smallest (Frobenius-norm) amount that
    makes it rank deficient. The model has NO intercept term, so centre the
    data first if the fitted line should not be forced through the origin.

    Parameters
    ----------
    X : array-like, shape (m,) or (m, n)
        Explanatory variable(s); a 1-D input is treated as a single column.
    y : array-like, shape (m,)
        Response values.

    Returns
    -------
    y_tls : ndarray, shape (m, 1)
        Fitted responses ``(X + Xt) @ a_tls``.
    X_tls : ndarray, shape (m, n)
        Corrected explanatory variables ``X + Xt``.
    a_tls : ndarray, shape (n, 1)
        Total least squares coefficients.
    fro_norm : float
        Frobenius norm of the correction applied to ``[X | y]``.

    Notes
    -----
    If the ``Vyy`` block is zero the division below yields inf/nan; no
    exception is raised.
    """
    # Accept plain lists as well as arrays (the original required ndarrays).
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim == 1:
        X = X.reshape(len(X), 1)
    n = X.shape[1]  # the number of variable of X

    Z = np.vstack((X.T, y)).T
    # Only Vt is needed. The reduced SVD already yields the complete
    # (n+1) x (n+1) Vt whenever there are at least as many rows as columns,
    # and avoids building the unused m x m U matrix; fall back to the full
    # decomposition otherwise so Vt keeps its full shape.
    _, _, Vt = LA.svd(Z, full_matrices=Z.shape[0] < Z.shape[1])

    V = Vt.T
    Vxy = V[:n, n:]
    Vyy = V[n:, n:]
    a_tls = - Vxy / Vyy  # total least squares soln

    # Correction to [X | y]: minus the projection of Z onto the trailing
    # right singular direction(s).
    Xtyt = - Z.dot(V[:, n:]).dot(V[:, n:].T)
    Xt = Xtyt[:, :n]  # X error
    y_tls = (X+Xt).dot(a_tls)

    fro_norm = LA.norm(Xtyt, 'fro')  # Frobenius norm

    return y_tls, X + Xt, a_tls, fro_norm


# tls() fits y ~ X @ a with no intercept term, so calling it on the raw data
# forces the line through (0, 0). Fit on mean-centred data instead and shift
# the projected points back so the line passes through the data centroid.
age_arr = np.array(age)
cost_arr = np.array(cost)
age_centre = age_arr.mean()
cost_centre = cost_arr.mean()
Y_, X_, a_, norm = tls(age_arr - age_centre, cost_arr - cost_centre)
X_ = X_ + age_centre
Y_ = Y_ + cost_centre


# Model function handed to the fitter: a straight line (linear, not quadratic).
def linear_func(p, x):
    """Evaluate the line ``m*x + c`` where ``p`` unpacks to ``(m, c)``."""
    slope, intercept = p
    return slope*x + intercept


# Wrap linear_func (p = (m, c), returns m*x + c) as the model to fit.
linear_model = Model(linear_func)

# Bundle the observations: age is the explanatory variable, cost the response.
data = RealData(age, cost)

# Set up ODR with the model and data.
# beta0 is the starting guess in linear_func's parameter order: m = 0, c = 1.
odr = ODR(data, linear_model, beta0=[0., 1.])

# Run the regression.
out = odr.run()
# out.beta follows the same (m, c) order, so this evaluates the fitted line
# at every observed age.
Y_inbuild = out.beta[0]*np.array(age) + out.beta[1]

# Compare the raw data, the hand-written tls() result (X_, Y_) and the
# scipy.odr line on one set of axes.
fig1 = plot.figure(figsize=(40, 15))
plot.subplot(121)
plot.xlabel('Age')
plot.ylabel('Cost')
plot.scatter(age, cost, c="pink", label="CSV data")
plot.scatter(X_, Y_, c="black", label="TLS")
plot.scatter(age, Y_inbuild, c="blue", label="TLS_I")
plot.legend()
plot.title('Dataset')
