In [19]:
# Load the rpy2 IPython extension so that the %%R cell magic used by the
# R comparison cell in this notebook is available.
%load_ext rpy2.ipython

In [1]:
from daal.data_management import AOSNumericTable
from daal.data_management import SOANumericTable
from daal.data_management import BlockDescriptor_Intc
from daal.data_management import BlockDescriptor
from daal.data_management import BlockDescriptor_Float64
from daal.data_management import readOnly
from daal.data_management import readWrite
from daal.data_management import data_feature_utils
from daal.data_management import HomogenNumericTable
from daal.data_management import NumericTableIface
from daal.data_management import MergedNumericTable
from daal.data_management import FileDataSource
from daal.data_management import StringDataSource
from daal.data_management import DataSourceIface
from daal.data_management import packed_mask

from daal.algorithms.linear_regression import training
from daal.algorithms.linear_regression import prediction

import daal.algorithms.normalization.zscore as zscore

from daal import step1Local
from daal import step2Master


import numpy as np

import math

In [4]:
def printNumericTable(data_table, message='', num_printed_rows=0, num_printed_cols=0,
                      interval=10):
    """Print the contents of a numeric table to stdout.

    Args:
        data_table: table object exposing getNumberOfRows, getNumberOfColumns,
            getDataLayout, getBlockOfRows and releaseBlockOfRows.
        message: header line printed before the values.
        num_printed_rows: maximum number of rows to print; 0 prints all rows.
        num_printed_cols: maximum number of columns to print; 0 prints all
            columns.
        interval: field width used for every printed value.

    Full (non-packed) and CSR layouts are printed through printArray. Packed
    layouts are dispatched to the lower/upper packed printers.
    """
    num_rows = data_table.getNumberOfRows()
    num_cols = data_table.getNumberOfColumns()
    layout = data_table.getDataLayout()

    # A limit of 0 means "no limit"; any other value is clamped to the table size.
    num_printed_rows = min(num_rows, num_printed_rows) if num_printed_rows != 0 else num_rows
    num_printed_cols = min(num_cols, num_printed_cols) if num_printed_cols != 0 else num_cols

    if isFull(layout) or layout == NumericTableIface.csrArray:
        block = BlockDescriptor()
        data_table.getBlockOfRows(0, num_rows, readOnly, block)
        try:
            printArray(block.getArray(), num_printed_cols, num_printed_rows,
                       num_cols, message, interval)
        finally:
            # Always hand the block back to the table, even if printing fails.
            data_table.releaseBlockOfRows(block)
    else:
        # NOTE(review): isLower, isUpper, printLowerArray and printUpperArray are
        # not defined in the notebook cells visible here; unless they are
        # provided elsewhere this branch raises NameError for packed layouts.
        packed_table = data_table.getBlockOfRowsAsDouble(0, num_rows)
        if isLower(layout):
            printLowerArray(packed_table, num_printed_rows, message, interval)
        elif isUpper(layout):
            printUpperArray(packed_table, num_printed_cols, num_printed_rows,
                            num_cols, message, interval)
        
def isFull(layout):
    """Return True when `layout` has none of the `packed_mask` bits set."""
    has_packed_bits = packed_mask & int(layout)
    return not has_packed_bits


def printArray(array, num_printed_cols, num_printed_rows, num_cols, message,
               interval=10, flt64=True):
    """Print the leading rows/columns of a row-major array as aligned text.

    Args:
        array: array object with a flatten() method; it is read as a flat
            buffer with `num_cols` values per row.
        num_printed_cols: number of leading columns printed per row.
        num_printed_rows: number of leading rows printed.
        num_cols: row stride, i.e. the number of values per row in `array`.
        message: header line printed first.
        interval: field width of each left-aligned value.
        flt64: when True values are shown with 3 decimals, otherwise with 0.
    """
    print(message)
    values = array.flatten()

    if flt64:
        precision = '3'
    else:
        precision = '0'

    cell_template = "{:<{width}.{dec}f}"
    for row in range(num_printed_rows):
        row_offset = row * num_cols
        for col in range(num_printed_cols):
            cell = cell_template.format(values[row_offset + col],
                                        width=interval, dec=precision)
            print(cell, end='')
        # Terminate the current row.
        print()
    # Blank line after the table.
    print()

    
def getNPArray(data_table):
    """Return all rows of a numeric table as a NumPy array.

    Args:
        data_table: table object exposing getNumberOfRows, getBlockOfRows and
            releaseBlockOfRows.

    Returns:
        A NumPy array holding a copy of the values obtained from the row block
        covering every row of `data_table`.
    """
    num_rows = data_table.getNumberOfRows()
    block = BlockDescriptor()
    data_table.getBlockOfRows(0, num_rows, readOnly, block)
    try:
        # Copy before releasing so the result never refers to the block's
        # buffer once the block has been handed back to the table.
        np_array = np.array(block.getArray(), copy=True)
    finally:
        data_table.releaseBlockOfRows(block)
    return np_array

In [13]:
# Column counts of the three sub-tables the CSV rows are split into.
# Note the order in which they are merged below: the 1-column
# trainData_sub2 comes first, then the outcome, then the 5-column
# trainData_sub1.
nFeatures1 = 5
nFeatures2 = 1
nOutcomes = 1

trainDatasetFileName = 'wine.csv'

# The data source is told not to allocate its own numeric table; rows are
# loaded into the merged table passed to loadDataBlock() below.
trainDataSource = FileDataSource(trainDatasetFileName, 
                                 DataSourceIface.notAllocateNumericTable,
                                 DataSourceIface.doDictionaryFromContext)

# Feature tables are declared with 0 rows and the notAllocate flag.
trainData_sub1 = HomogenNumericTable(nFeatures1, 0, NumericTableIface.notAllocate)
trainData_sub2 = HomogenNumericTable(nFeatures2, 0, NumericTableIface.notAllocate)

# Dependent variable table, declared the same way.
trainOutcome = HomogenNumericTable(nOutcomes, 0, NumericTableIface.notAllocate)

# Presumably the merged table lays its columns out in argument order, i.e.
# [trainData_sub2 | trainOutcome] here -- TODO confirm against the DAAL docs.
mergedData1 = MergedNumericTable(trainData_sub2, trainOutcome)

# All 7 columns: [trainData_sub2 | trainOutcome | trainData_sub1].
# NOTE(review): the R cell in this notebook labels the CSV columns
# Year, Price, WinterRain, AGST, HarvestRain, Age, FrancePop; if that is
# right, trainData_sub2 is Year and trainOutcome is Price.
mergedData2 = MergedNumericTable(mergedData1, trainData_sub1)

# Load the file into the merged table; the return value is reported below
# as the number of observations read.
nObservations = trainDataSource.loadDataBlock(mergedData2)

# Feature-only view (6 columns) used for training and prediction; the
# outcome column is left out.
trainData = MergedNumericTable(trainData_sub2,trainData_sub1)

print("Observations read: {}".format(nObservations))

# Prints every row and column of the loaded table (no print limits passed).
printNumericTable(mergedData2)


Observations read: 25

1952.000  7.495     600.000   17.117    160.000   31.000    43183.569 
1953.000  8.039     690.000   16.733    80.000    30.000    43495.030 
1955.000  7.686     502.000   17.150    130.000   28.000    44217.857 
1957.000  6.984     420.000   16.133    110.000   26.000    45152.252 
1958.000  6.777     582.000   16.417    187.000   25.000    45653.805 
1959.000  8.076     485.000   17.483    187.000   24.000    46128.638 
1960.000  6.519     763.000   16.417    290.000   23.000    46583.995 
1961.000  8.494     830.000   17.333    38.000    22.000    47128.005 
1962.000  7.388     697.000   16.300    52.000    21.000    48088.673 
1963.000  6.713     608.000   15.717    155.000   20.000    48798.990 
1964.000  7.309     402.000   17.267    96.000    19.000    49356.943 
1965.000  6.252     602.000   15.367    267.000   18.000    49801.821 
1966.000  7.744     819.000   16.533    86.000    17.000    50254.966 
1967.000  6.840     714.000   16.233    118.000   16.0

In [17]:
# Build the linear-regression training algorithm.
algorithm = training.Batch_Float64NormEqDense()

# Inputs: the feature table and the dependent-variable table.
algorithm.input.set(training.data, trainData)
algorithm.input.set(training.dependentVariables, trainOutcome)

trainingResult = algorithm.compute()

# Show the coefficients of the fitted model.
trainedModel = trainingResult.get(training.model)
printNumericTable(trainedModel.getBeta(), "Linear Regression coefficients:")


Linear Regression coefficients:
-392.622  0.198     0.001     0.601     -0.004    0.198     -0.000    



In [18]:
# Score the training rows with the model fitted in the training cell.
algorithm = prediction.Batch()
trainedModel = trainingResult.get(training.model)
algorithm.input.setTable(prediction.data, trainData)
algorithm.input.setModel(prediction.model, trainedModel)

predictionResult = algorithm.compute()

# Pull predictions and observed outcomes out of the DAAL tables.
prediction_result = getNPArray(predictionResult.get(prediction.prediction))
real_values = getNPArray(trainOutcome)

# R^2 computed as explained sum of squares over total sum of squares,
# both measured around the mean of the observed outcomes.
real_values_mean = np.sum(real_values) / len(real_values)
ssreg = np.sum(np.square(prediction_result - real_values_mean))
sstot = np.sum(np.square(real_values - real_values_mean))
r_squared = ssreg / sstot

print(r_squared)

0.829359222337


In [31]:
%%R
# Reference fit in R on the same CSV file.
# NOTE(review): this model regresses Price on Year only, while the DAAL
# model in this notebook is trained on six feature columns, so the two
# R-squared values are not directly comparable -- confirm this is intended.
dataset <- read.csv("wine.csv")

colnames(dataset) <- c("Year", "Price", "WinterRain",
                       "AGST", "HarvestRain", "Age", "FrancePop")

print(colnames(dataset))

model <- lm(Price ~ Year, data = dataset)

# Coefficient of determination of the fitted model.
model_summary <- summary(model)
print(model_summary$r.squared)


[1] "Year"        "Price"       "WinterRain"  "AGST"        "HarvestRain"
[6] "Age"         "FrancePop"  
[1] 0.2004961
