# Hands on Session

In this hands-on session we will explore our ability to predict the price of a wine based on its characteristics.

## The Data

We will use the (famous) wine dataset, which gathers data on a set of red Bordeaux wines from the 1950s through the 1970s.

The data available is:

* **Year**: the year that the grapes were collected;
* **Price**: the (log-scaled) price;
* **WinterRain**: the amount of  winter rain for the given year;
* **AGST**: the average growing season temperature for the given year;
* **HarvestRain**: the amount of rain during the harvest;
* **Age**: the age of the wine;
* **FrancePop**: the population of France;

## The activities

* Build and try several (linear) models for the given dataset;
* Calculate the error measures for each model;
* Choose the best model for the given dataset:
    * <font color="red">Not all variables are always needed!</font>;
    * Think about the trade-off between using fewer variables and losing the capacity to explain variation;
* Try to predict the price using the test data;
* Try building the model in a distributed way;

# Helper Functions and Imports

In [11]:
from daal.data_management import AOSNumericTable
from daal.data_management import SOANumericTable
from daal.data_management import BlockDescriptor_Intc
from daal.data_management import BlockDescriptor
from daal.data_management import BlockDescriptor_Float64
from daal.data_management import readOnly
from daal.data_management import readWrite
from daal.data_management import data_feature_utils
from daal.data_management import HomogenNumericTable
from daal.data_management import NumericTableIface
from daal.data_management import MergedNumericTable
from daal.data_management import FileDataSource
from daal.data_management import StringDataSource
from daal.data_management import DataSourceIface
from daal.data_management import packed_mask

from daal.algorithms.linear_regression import training
from daal.algorithms.linear_regression import prediction

import daal.algorithms.normalization.zscore as zscore

from daal import step1Local
from daal import step2Master


import numpy as np

import math

from daal.algorithms import covariance 



In [4]:
def printNumericTable(data_table, message='', num_printed_rows=0, num_printed_cols=0,
                      interval=10):
    """Print the contents of a numeric table to standard output.

    Args:
        data_table: table exposing getNumberOfRows(), getNumberOfColumns(),
            getDataLayout() and the block-access methods used below.
        message: header passed through to the printing helper.
        num_printed_rows: maximum number of rows to print; 0 prints all rows.
        num_printed_cols: maximum number of columns to print; 0 prints all
            columns.
        interval: column width passed through to the printing helper.
    """
    total_rows = data_table.getNumberOfRows()
    total_cols = data_table.getNumberOfColumns()
    table_layout = data_table.getDataLayout()

    # A limit of 0 means "no limit"; any other value is clamped to the table size.
    rows_to_show = total_rows if num_printed_rows == 0 else min(total_rows, num_printed_rows)
    cols_to_show = total_cols if num_printed_cols == 0 else min(total_cols, num_printed_cols)

    descriptor = BlockDescriptor()

    # Packed layouts (other than CSR) are read as a flat array of doubles and
    # handed to the triangular printers.
    if not isFull(table_layout) and table_layout != NumericTableIface.csrArray:
        packed_values = data_table.getBlockOfRowsAsDouble(0, total_rows)
        if isLower(table_layout):
            printLowerArray(packed_values, rows_to_show, message, interval)
        elif isUpper(table_layout):
            printUpperArray(packed_values, cols_to_show, rows_to_show,
                            total_cols, message, interval)
        return

    # Full and CSR layouts: read every row through the block descriptor, print,
    # then release the block.
    data_table.getBlockOfRows(0, total_rows, readOnly, descriptor)
    printArray(descriptor.getArray(), cols_to_show, rows_to_show,
               total_cols, message, interval)
    data_table.releaseBlockOfRows(descriptor)
        
def isFull(layout):
    """Return True when `layout` has none of the `packed_mask` bits set."""
    return not (packed_mask & int(layout))


def printArray(array, num_printed_cols, num_printed_rows, num_cols, message,
               interval=10, flt64=True):
    """Print the top-left corner of a row-major array as left-aligned columns.

    Args:
        array: object with a flatten() method returning an indexable
            sequence of numbers.
        num_printed_cols: number of columns to print from each row.
        num_printed_rows: number of rows to print.
        num_cols: row stride, i.e. number of columns in the full array.
        message: header line printed before the values.
        interval: minimum width of each printed column.
        flt64: print three decimals when true, none otherwise.
    """
    print(message)
    values = array.flatten()

    # Build the format spec once, e.g. "<10.3f": left-aligned, fixed-point.
    precision = '3' if flt64 else '0'
    cell_spec = "<{}.{}f".format(interval, precision)

    for row in range(num_printed_rows):
        row_offset = row * num_cols
        for col in range(num_printed_cols):
            print(format(values[row_offset + col], cell_spec), end='')
        print()
    print()

    
def getNPArray(data_table):
    """Return all rows of a numeric table as the array exposed by its block.

    The previous version first allocated a placeholder with
    ``np.ones(..., dtype=np.int)``. That alias was removed from NumPy, so the
    call raised AttributeError, and the placeholder was overwritten anyway.
    The array now comes straight from the block descriptor.

    Args:
        data_table: table exposing getNumberOfRows(), getBlockOfRows() and
            releaseBlockOfRows().

    Returns:
        Whatever ``BlockDescriptor.getArray()`` yields for the full row range.
    """
    num_rows = data_table.getNumberOfRows()
    block = BlockDescriptor()
    data_table.getBlockOfRows(0, num_rows, readOnly, block)
    # NOTE(review): the array is fetched before the block is released, as in
    # the original; confirm it stays valid after release, or copy it here.
    np_array = block.getArray()
    data_table.releaseBlockOfRows(block)
    return np_array

# Covariance/Correlation

In [12]:
# Open the wine CSV as a DAAL data source and load its rows.
dataSource = FileDataSource(
    "wine.csv",
    DataSourceIface.doAllocateNumericTable,
    DataSourceIface.doDictionaryFromContext,
)
dataSource.loadDataBlock()

# Batch-mode covariance algorithm, fed with the table loaded above.
algorithm = covariance.Batch()
algorithm.input.set(covariance.data, dataSource.getNumericTable())

# Run the computation.
res = algorithm.compute()

# Report the variance-covariance matrix, then the per-column means.
covariance_table = res.get(covariance.covariance)
printNumericTable(covariance_table, "Covariance matrix:")
mean_table = res.get(covariance.mean)
printNumericTable(mean_table, "Mean vector:")

Covariance matrix:
59.167    -2.240    17.267    -1.283    16.033    -59.167   28037.729 
-2.240    0.423     11.755    0.290     -27.264   2.240     -1112.846 
17.267    11.755    17497.460 -28.686   -2711.455 -17.267   -786.221  
-1.283    0.290     -28.686   0.456     -3.242    1.283     -641.559  
16.033    -27.264   -2711.455 -3.242    5538.257  -16.033   11255.584 
-59.167   2.240     -17.267   1.283     -16.033   59.167    -28037.729
28037.729 -1112.846 -786.221  -641.559  11255.584 -28037.72913434205.952

Mean vector:
1965.800  7.067     605.280   16.509    148.560   17.200    49694.437 

