# Test our algorithm for guessing missing values

In [1]:
#def test_missing_values_guess(N_OBS,N_FEATURES):
"""
hello!
This is a script that tests our algorithm for imputing missing values (NAs). I
generate data from distributions where the Features
have some relationship to each other (such as multivariate normal), which makes
it worthwhile to do something more intelligent than just using the mean value
for each Feature.

Steps:
1. Create distribution (for multivariate normal, create means & covariances)
2. Generate data from that distribution
3. Convert a randomly selected subset of the data to NAs
4. Apply the algorithm to impute the NAs
5. Calculate how close the algorithm got to the true values
"""
import numpy as np
import random
import math
from __future__ import division

In [2]:
# How big a dataset do we want to test on
N_OBS = 100;
N_FEATURES = 3;

## Test case 1: Multivariate normal distribution

In [3]:
# Case 1: Single Multivariate Normal distribution
# randomly generate correlation matrix for the distribution. All correlations must
# be between -1 and 1. 'random' uses the uniform distribution.
C = np.random.random((N_FEATURES,N_FEATURES))*2. - 1.
for i in range(N_FEATURES):
    C[i,i] = 1.
# randomly generate volatilities. All volatilities must be positive.
v = np.random.rand(N_FEATURES,1)*10.
# covariance matrix. Convolve volatilites and correlations. (v*v').*(C)
V = np.dot(v,v.T)
# alternative way:
V = np.outer(v,v)
cov = V * C
# ensure symmetry
cov = (cov + cov.T)/2.
# randomly generate means
mu = np.random.rand(N_FEATURES,)*2. - 1.
# randomly generate data
data = np.random.multivariate_normal(mu,cov,N_OBS)
print data

[[  3.60926792e+00  -3.58986806e-01  -2.50852197e+00]
 [ -5.50267312e+00  -3.88657137e+00   4.28605289e+00]
 [  2.61083069e+00   4.50628680e+00  -5.93036463e+00]
 [  1.17814541e+01  -5.37141827e-01   6.42202402e+00]
 [  1.25044894e+00  -1.25424032e+00  -4.37350875e-01]
 [  7.04339858e+00   1.47752689e-01   2.39328236e+00]
 [  3.74592304e+00   2.90100656e+00  -5.36980736e+00]
 [  7.60547875e+00  -2.24639044e-01  -9.18198712e+00]
 [ -3.85530758e+00   2.95064533e+00   7.52376085e+00]
 [  6.00863935e+00   6.75862934e+00   8.63318584e+00]
 [ -2.93278025e+00  -5.87426566e+00  -2.12040179e+00]
 [  7.87261923e+00  -1.63841554e+00  -1.74265166e+00]
 [ -5.50590623e+00  -4.26619889e+00   1.97950701e+00]
 [ -3.30363860e+00  -4.77039461e+00  -1.15644978e+01]
 [  8.03762245e+00  -7.64257691e-02   9.31540241e-01]
 [  2.07911839e+01   5.64396160e+00  -3.43745876e+00]
 [  6.53549494e+00  -4.04606099e-01   2.08676229e+00]
 [  3.28676109e+00  -7.48490839e-01   9.73360191e-01]
 [ -4.68257603e+00  -4.62730

In [4]:
# choose random fraction of data to convert to NAs
NA_FRACTION = 0.19

# create copy of data. This copy will have data converted to NAs
datan = data
# create a random index for the subsample that will be created to NAs
random_index = sorted(random.sample(range(datan.size),int(math.floor(datan.size*NA_FRACTION))))
# heres a better way to do it! convert to 1D array, convert subset of data
# randomly selected by the index to NA, then return to original shape
datan.shape = (N_OBS*N_FEATURES,)
datan[random_index] = np.nan
datan.shape = (N_OBS,N_FEATURES)
np.isnan(datan).sum() / np.size(datan)

0.19

### Guess the missing data

In [5]:
# apply algorithm to compute NAs
# !!! this is your code !!
# data_imputed = impute_NAs_using_KNN(datan)

### Compute the RMSE of the guesses

In [6]:
# RMSE of difference between imputed and original data
rmse_guess = np.sqrt(((data_imputed[np.isnan(datan)] - data[np.isnan(datan)])**2).sum())
rmse_guess = np.sqrt(((data_imputed[np.isnan(datan)] - data[np.isnan(datan)])**2).sum())

# next: do this many times to get a statistical picture of the method's accuracy


NameError: name 'data_imputed' is not defined

# Gibbs Sampler

1. Seeding:
    * Get the sample distribution of each variable
    * Draw randomly from this distribution to fill in the missing values
2. In parallel:
    * forecast one variable using all the others
    --> recombine forecasts and compare similarity of new matrix to old
    * forecasting can use any appropriate machine learning technique.
      for example, SGD (to replace linear regression), or decision tree.
3. Iterate 5-10 times

In [7]:
from scipy import stats
import multiprocessing as mp

pool = mp.Pool(processes=N_FEATURES)
results = pool.map(cube, range(1,7))
print(results)

NameError: name 'cube' is not defined