# Unit Test for the MLEV Python Package Project
## Part I: Simulate two input datasets
The following linear model is assumed:<br>
$y = X \beta + \epsilon$ <br>
$\epsilon$ is i.i.d. Normal $(0,\sigma^2)$<br>
$X$ is an $n$ by $p$ design matrix<br>
$\beta$ is a $p$ by $1$ parameter vector
- **Dataset1:** low dimensional dataset ($n=100, p=10$)
- **Dataset2:** high dimensional dataset ($n=1000, p=10000$)

In [8]:
import numpy as np
import pandas as pd

# dataset1: low-dimensional case (n=100 observations, p=10 predictors), plain numpy arrays.
# Coefficients have variance 10/p and the noise has variance 10.
X1 = np.random.rand(100, 10)
beta1 = np.random.normal(0, (10.0 / 10) ** 0.5, 10)
error1 = np.random.normal(0, 10 ** 0.5, 100)
y1 = X1.dot(beta1) + error1
dataset1 = dict(y=y1, X=X1)
print('True variance of error1 is ' + str(error1.var()))

# dataset2: high-dimensional case (n=1000 observations, p=10000 predictors), pandas containers.
# Same variance layout as dataset1: coefficients 10/p, noise 10.
X2 = pd.DataFrame(np.random.randn(1000, 10000))
beta2 = np.random.normal(0, (10 / 1e4) ** 0.5, 10000)
error2 = np.random.normal(0, 10 ** 0.5, 1000)
y2 = pd.Series(X2.values.dot(beta2) + error2)
dataset2 = dict(y=y2, X=X2)
print('True variance of error2 is ' + str(error2.var()))

True variance of error1 is 10.7475478753


True variance of error2 is 9.2305705296


## Part II: Compute variance estimates using the MLEV package
### Step 1: Load the MLEV package

In [4]:
import numpy as np
import pandas as pd
import scipy.optimize as optimx

class MLEV:
    """
    Maximum likelihood estimator of the error/noise variance in a linear model.

    It works in both low dimensional and high dimensional scenarios.
    To instantiate, only the numpy.ndarray and pandas.DataFrame/Series input data types are allowed.
    Once the instantiation is finished, run the getMLEV() function to estimate the error variance.

    Attributes set by the constructor:
        X (numpy.ndarray): design matrix as float64, always 2-D with shape (n, p).
        y (numpy.ndarray): response as float64, always 1-D with shape (n,).
        n (float): number of rows of X.
        p (float): number of columns of X.
        theta2_init (float): stored starting value 1.0 (getTheta2() takes its own argument).
    """

    def __init__(self, X=None, y=None):
        """
        The constructor function to instantiate MLEV class.

        A 1-D design matrix is treated as a single predictor, i.e. reshaped to (n, 1), and a
        response of shape (n, 1) is flattened to (n,), so every accepted input shape leads to
        the same computation.

        :param X: Design matrix (Input Dtype: numpy.ndarray, pandas.DataFrame or pandas.Series)
        :param y: Response variable (Input Dtype: numpy.ndarray or pandas.Series)
        :raises TypeError: if X or y has an unsupported type, a wrong number of dimensions,
                           or if their numbers of rows differ.
        :raises ValueError: if X or y contains missing values, or if X is rank deficient.
        """
        x_type_ok = isinstance(X, (np.ndarray, pd.DataFrame, pd.Series))
        y_type_ok = isinstance(y, (np.ndarray, pd.Series))
        if not (x_type_ok and y_type_ok):
            raise TypeError(
                'Design matrix (X) must be numpy.ndarray, pandas.Dataframe or pandas.Series and response variable (y) must be numpy.ndarray or pandas.Series')
        elif not (X.ndim == 1 or X.ndim == 2):
            raise TypeError('The dimension of design matrix (X) must be 1 or 2')
        elif not (y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1)):
            raise TypeError('The dimension of response variable (y) is not 1')
        elif X.shape[0] != y.shape[0]:
            raise TypeError('The number of rows in design matrix (X.shape[0]) does not match with y.shape[0]')

        self.X = np.asarray(X, dtype=np.float64)
        if self.X.ndim == 1:
            # A 1-D X is one predictor column; without this the rank check below would
            # compare rank 1 against n and X.shape[1] would not exist.
            self.X = self.X.reshape(-1, 1)
        # Flatten an (n, 1) response: left as a column it would broadcast against the
        # eigenvalue vector into an (n, n) matrix and silently corrupt the estimate.
        self.y = np.asarray(y, dtype=np.float64).reshape(-1)

        if np.sum(np.isnan(self.y)) != 0:
            raise ValueError('Missing values are not allowed in response variable (y)')
        elif np.sum(np.isnan(self.X)) != 0:
            raise ValueError('Missing values are not allowed in design matrix (X)')
        elif np.linalg.matrix_rank(self.X.T) < np.min(self.X.shape):
            raise ValueError(
                'Design matrix (X) is rank deficient, please remove linearly dependent rows and/or columns')
        self.n = float(self.X.shape[0])
        self.p = float(self.X.shape[1])
        self.theta2_init = float(1)
        print('Instantiation completed!')

    def describe(self):
        """
        Print the dimension (rows, columns) of the stored design matrix, X.
        :return: None.
        """
        print('The dimension of design matrix is (' + str(int(self.n)) + ', ' + str(int(self.p)) + ')')

    def eigen(self):
        """
        This function performs the eigendecomposition of the symmetric matrix X X^T.
        Designed for internal use only.
        :return: self, with eigenvalues in self.lbd, eigenvectors in self.vec, the rotated
                 response (vec^T y) in self.yTilde and its element-wise square in self.yTildeSq.
        """
        self.XXt = np.dot(self.X, self.X.T)
        self.lbd, self.vec = np.linalg.eigh(self.XXt)
        # X X^T is positive semi-definite, so negative eigenvalues can only be round-off
        # (typical when p < n leaves n - p exact zeros); clip them to zero.
        self.lbd = np.maximum(self.lbd, 0.0)
        self.yTilde = np.dot(self.vec.T, self.y)
        self.yTildeSq = self.yTilde ** 2
        return (self)

    def mlevObj(self, theta2):
        """
        This is the objective function for theta2 (signal-to-noise ratio). Designed for internal use only.
        Requires eigen() to have been run first (reads self.lbd and self.yTildeSq).
        :param theta2: Signal-to-noise ratio.
        :return: Value of objective function (negative log-likelihood value).
        """
        scaled = theta2 / self.p * self.lbd + 1.0
        out1 = np.log(np.sum(self.yTildeSq / scaled)) + 1.0 / self.n * np.sum(np.log(scaled))
        return (out1)

    def getTheta2(self, theta2_init=float(1)):
        """
        Numerically minimizes the mlevObj() function over theta2 (signal-to-noise ratio). Designed for internal use only.
        The search uses L-BFGS-B with a numerically approximated gradient and the bound theta2 >= 0.
        :param theta2_init: The initial value of theta2 for numerical optimization, default is 1.0.
        :return: self, with the full optimizer output in self.theta2_est and the maximum likelihood
                 estimate of theta2 (a length-1 array) in self.theta2_hat.
        """
        self.theta2_est = optimx.fmin_l_bfgs_b(func=self.mlevObj, x0=np.array([theta2_init]), bounds=[(0, None)],
                                               approx_grad=True)
        self.theta2_hat = self.theta2_est[0]
        return (self)

    def getTheta1(self):
        """
        Take the maximum likelihood estimates of theta2 and solve theta1 (error variance). Designed for internal use only.
        Requires eigen() and getTheta2() to have been run first.
        :return: The maximum likelihood estimates of theta1 (error variance).
        """
        self.theta1_hat = (1.0 / self.n) * np.sum(self.yTildeSq / (self.theta2_hat / self.p * self.lbd + 1.0))
        return (self.theta1_hat)

    def getMLEV(self):
        """
        This is the function for users to get Maximum Likelihood Estimates of Variances.
        Runs eigen(), getTheta2() and getTheta1() in sequence and stores the result in self.mlev_hat.
        :return: The maximum likelihood estimates of variances.
        """
        self.mlev_hat = self.eigen().getTheta2().getTheta1()
        return (self.mlev_hat)

### Step 2: Compute the MLEV variance estimates
- Dataset1: low dimension, numpy.ndarray
- Dataset2: high dimension, pandas.Series/DataFrame

In [9]:
# Case 1: low-dimensional data supplied as plain numpy.ndarray objects.
my_X, my_y = dataset1['X'], dataset1['y']
print(type(my_X), type(my_y))
my_Data = MLEV(X=my_X, y=my_y)
print(my_Data.getMLEV())

# Case 2: high-dimensional data supplied as pandas.DataFrame (X) and pandas.Series (y).
my_X, my_y = dataset2['X'], dataset2['y']
print(type(my_X), type(my_y))
my_Data = MLEV(X=my_X, y=my_y)
print(my_Data.getMLEV())

(<type 'numpy.ndarray'>, <type 'numpy.ndarray'>)
Instantiation completed!
10.6689611621
(<class 'pandas.core.frame.DataFrame'>, <class 'pandas.core.series.Series'>)


Instantiation completed!


6.09676513391


## Part III: return errors for invalid input datasets
The MLEV class only takes the following datatypes:
- Design matrix, X: *numpy.ndarray, pandas.DataFrame* or *pandas.Series*;
- Response, y: *numpy.ndarray* or *pandas.Series*;<br>

In addition, MLEV will generate errors if:
- The design matrix, X, has more than two dimensions;
- The response, y, has more than one dimension;
- The shapes of X and y do not match;
- Design matrix, X, is rank-deficient;<br>

In [10]:
# Unsupported input types (a dict for X, a DataFrame for y); the constructor is expected to raise
# TypeError: Design matrix (X) must be numpy.ndarray, pandas.Dataframe or pandas.Series and response variable (y) must be numpy.ndarray or pandas.Series
wrong_X = dict(x1=[1, 2, 3, 4], x2=[4, 3, 2, 1])
wrong_y = pd.DataFrame([1, 2, 3, 4])
print(type(wrong_X), type(wrong_y))  # (<type 'dict'>, <class 'pandas.core.frame.DataFrame'>)
wrong_Data = MLEV(X=wrong_X, y=wrong_y)

(<type 'dict'>, <class 'pandas.core.frame.DataFrame'>)


TypeError: Design matrix (X) must be numpy.ndarray, pandas.Dataframe or pandas.Series and response variable (y) must be numpy.ndarray or pandas.Series

In [11]:
# ValueError: Missing values are not allowed in design matrix (X)
# Copy the array first: pd.DataFrame may wrap a 2-D ndarray without copying, so writing
# NaN through .iloc could otherwise put the NaN into dataset1['X'] itself and break any
# later cell that reuses dataset1.
wrong_X = pd.DataFrame(dataset1['X'].copy())
wrong_X.iloc[0, 0] = np.nan  # np.nan, because the np.NaN alias was removed in NumPy 2.0
my_y = dataset1['y']
wrong_Data = MLEV(wrong_X, my_y)

ValueError: Missing values are not allowed in design matrix (X)

In [13]:
# A 5 x 4 design matrix whose rank is below 4 (row 2 is twice row 1, and row 4 equals
# row 3 plus four times row 1); the constructor is expected to raise
# ValueError: Design matrix (X) is rank deficient, please remove linearly dependent rows and/or columns
wrong_X = np.array([
    [1, 1, 1, 1],
    [2, 2, 2, 2],
    [3, 4, 5, 6],
    [7, 8, 9, 10],
    [11, 12, 9, 10],
])
my_y = np.arange(1, 6)
wrong_Data = MLEV(X=wrong_X, y=my_y)

ValueError: Design matrix (X) is rank deficient, please remove linearly dependent rows and/or columns