In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#### Reading Data:

In [2]:
X_train = np.load('Data/Train/X.npy')
X_prime_train = np.load('Data/Train/X_prime.npy')
feature_info_train = np.load('Data/Train/feature_information.npy')

In [3]:
X_test = np.load('Data/Test/X.npy')
X_prime_test = np.load('Data/Test/X_prime.npy')
feature_info_test = np.load('Data/Test/feature_information.npy')

In [4]:
isFeatureReal_train = pd.DataFrame(feature_info_train)
isFeatureReal_test = pd.DataFrame(feature_info_test)

In [5]:
isFeatureReal_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,1,1,0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,0,1,0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [6]:
isFeatureReal_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,96,97,98,99,100,101,102,103,104,105
0,1,0,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,1,0,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,0,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,0,0


#### Naive Least Square Regression

In [7]:
def least_square_regression(X, Y):
    m, k = X.shape[0], X.shape[1]
    z = np.ones((m,1))
    X = np.append(X, z, axis=1)
    mat = np.linalg.inv(np.dot(np.transpose(X), X))
    w = np.dot(mat, np.dot(np.transpose(X), Y))
    return w

In [8]:
def predict(w, isFeatureReal, X_prime):
    X_prime = pd.DataFrame(X_prime)
    num_features = w.shape[0]
    
    # Predicting column wise missing data
    for k in range(num_features-1):
        col_to_predict = k
        # Weight for column k
        w_k = w[k]
        new_w = np.delete(w_k, col_to_predict)
        # Filling where data is missing
        for idx, _ in isFeatureReal.loc[isFeatureReal[col_to_predict] == 0].iterrows():
            row = X_prime.iloc[idx]
            new_row = row.drop(col_to_predict)
            # x_i = (w1*x1 + w2*x2 + ...+ wk*xk)/w_i
            pred = - (np.dot(new_w, new_row) + w_k[-1])/w_k[col_to_predict]
            X_prime[col_to_predict].iloc[idx] = pred    
    return X_prime

In [9]:
# Root mean square error
def calculateError(X_prime, X):
    predictedData = X_prime.to_numpy()
    trueData = X

    diff = (trueData - predictedData)**2
    sumDiff = np.sum(diff, axis = 1)
    err = np.sqrt(np.mean(sumDiff))

    return err

In [10]:
w = least_square_regression(X_prime_train, X_train)

# Training Error
X_pred_train = predict(w, isFeatureReal_train, X_prime_train)
errTrain = calculateError(X_pred_train, X_train)
print("Training Error:", errTrain)

# Testing Error
X_pred_test = predict(w, isFeatureReal_test, X_prime_test)
errTest = calculateError(X_pred_test, X_test)
print("Testing Error:", errTest)

Training Error: 0.3713595158154271
Testing Error: 0.36347680846333613


In [11]:
np.save('Data/weightLinearReg.npy', w)

####  Ridge regression:

In [12]:
def ridge_regression(X, Y, l = 0.01):
    m, k = X.shape[0], X.shape[1]
    z = np.ones((m,1))
    X = np.append(X, z, axis=1)
    mat = np.linalg.inv(np.dot(np.transpose(X), X) + l*np.identity(k+1))
    w = np.dot(mat, np.dot(np.transpose(X), Y))
    return w

In [13]:
w_ridge_regression = ridge_regression(X_prime_train, X_train, l = 0.01)
print(w_ridge_regression.shape)

# Training Error
X_pred_train_ridge = predict(w_ridge_regression, isFeatureReal_train, X_prime_train)
errTrainRidge = calculateError(X_pred_train_ridge, X_train)
print("Training Error:", errTrainRidge)

# Testing Error

X_pred_test_ridge = predict(w_ridge_regression, isFeatureReal_test, X_prime_test)
errTestRidge = calculateError(X_pred_test_ridge, X_test)
print("Testing Error:", errTestRidge)

(107, 106)
Training Error: 0.3723908291437859
Testing Error: 0.36451064501095454


In [None]:
_lambda = [i/100 for i in range(0,11)]
errTrain = []
errTest = []

for _l in _lambda:
    w_ridge_regression = ridge_regression(X_prime_train, X_train, l = _l)
    X_pred_train_ridge = predict(w_ridge_regression, isFeatureReal_train, X_prime_train)
    errTrainRidge = calculateError(X_pred_train_ridge, X_train)
    errTrain.append(errTrainRidge)
    
    X_pred_test_ridge = predict(w_ridge_regression, isFeatureReal_test, X_prime_test)
    errTestRidge = calculateError(X_pred_test_ridge, X_test)
    errTest.append(errTestRidge)

In [None]:
plt.figure(figsize=(8,6))
plt.plot(_lambda, errTrain, marker='x', label='Training error')
plt.plot(_lambda, errTest, marker='x', label='Testing error')
plt.title("Lambda values v/s Error in Ridge regression")
plt.xlabel("Lambda")
plt.ylabel("Error")
plt.legend()
plt.show()