This notebook shows:
- Install SystemML Python package and jar file
  - pip
  - SystemML 'Hello World'
- Example 1: Matrix Multiplication
- Load diabetes dataset from scikit-learn
- Example 2: Implement three different algorithms to train linear regression model
  - Algorithm 1: Linear Regression - Direct Solve (no regularization)
  - Algorithm 2: Linear Regression - Batch Gradient Descent (no regularization)
  - Algorithm 3: Linear Regression - Conjugate Gradient (no regularization)
- Example 3: Invoke existing SystemML algorithm script LinearRegDS.dml using MLContext API
- Example 4: Invoke existing SystemML algorithm using scikit-learn/SparkML pipeline like API
- Example 5: Invoking a Keras model with SystemML

# Install SystemML Python package and jar file

In [None]:
!pip install --user systemml==1.2.0

In [None]:
!pip install https://sparktc.ibmcloud.com/repo/latest/systemml-1.2.0-SNAPSHOT-python.tar.gz

In [22]:
!pip install https://github.com/apache/systemml.git

Collecting https://github.com/apache/systemml.git
  Downloading https://github.com/apache/systemml.git


  Cannot unpack file C:\Users\ming\AppData\Local\Temp\pip-unpack-5uxml6sa\systemml.git (downloaded from C:\Users\ming\AppData\Local\Temp\pip-req-build-aql94_kc, content-type: text/html; charset=utf-8); cannot detect archive format
Cannot determine archive format of C:\Users\ming\AppData\Local\Temp\pip-req-build-aql94_kc


In [None]:
help(systemml)

In [None]:
from systemml import MLContext, dml, dmlFromResource, dmlFromFile, dmlFromUrl

In [1]:
import systemml 
from systemml import mlcontext
from systemml.mllearn import 

In [2]:
help(systemml)

Help on package systemml:

NAME
    systemml

DESCRIPTION
    #-------------------------------------------------------------
    #
    # Licensed to the Apache Software Foundation (ASF) under one
    # or more contributor license agreements.  See the NOTICE file
    # distributed with this work for additional information
    # regarding copyright ownership.  The ASF licenses this file
    # to you under the Apache License, Version 2.0 (the
    # "License"); you may not use this file except in compliance
    # with the License.  You may obtain a copy of the License at
    #
    #   http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing,
    # software distributed under the License is distributed on an
    # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    # KIND, either express or implied.  See the License for the
    # specific language governing permissions and limitations
    # under the License.
    #
    #---------

In [12]:
from systemml import classloader
from systemml import converters
from systemml import defmatrix 
from systemml import mlcontext
from systemml import project_info
from systemml import random 

In [14]:
from .keras2caffe import *

ModuleNotFoundError: No module named '__main__.keras2caffe'; '__main__' is not a package

In [18]:
pwd

'C:\\Users\\ming\\Documents\\Backup_G\\Advanced_Data_Science_with_IBM\\coursera\\coursera_ai\\week2\\systemml'

In [19]:
cd 'C:\\Users\\ming\\Documents\\Backup_G\\Advanced_Data_Science_with_IBM\\systemml'

[WinError 123] The filename, directory name, or volume label syntax is incorrect: "'C:\\\\Users\\\\ming\\\\Documents\\\\Backup_G\\\\Advanced_Data_Science_with_IBM\\\\systemml'"
C:\Users\ming\Documents\Backup_G\Advanced_Data_Science_with_IBM\coursera\coursera_ai\week2\systemml


In [20]:
mvn package -P distributio

SyntaxError: invalid syntax (<ipython-input-20-0ebe8ee82c9d>, line 1)

In [13]:
from systemml import mllearn 

SyntaxError: import * only allowed at module level (estimators.py, line 917)

In [3]:
from systemml.mllearn import c

AttributeError: module 'systemml' has no attribute 'mllearn'

In [None]:
#!pip install --upgrade systemml
!pip install --upgrade https://github.com/niketanpansare/future_of_data/raw/master/systemml-1.1.0-SNAPSHOT-python.tar.gz
!ln -s -f ~/.local/lib/python2.7/site-packages/systemml/systemml-java/*.jar ~/data/libs/

In [4]:
!pip show systemml

Name: systemml
Version: 1.2.0
Summary: Apache SystemML is a distributed and declarative machine learning platform.
Home-page: http://systemml.apache.org/
Author: Apache SystemML
Author-email: dev@systemml.apache.org
License: Apache 2.0
Location: c:\users\ming\appdata\roaming\python\python36\site-packages
Requires: scipy, Pillow, scikit-learn, numpy, pandas
Required-by: 


In [5]:
help(systemml)

Help on package systemml:

NAME
    systemml

DESCRIPTION
    #-------------------------------------------------------------
    #
    # Licensed to the Apache Software Foundation (ASF) under one
    # or more contributor license agreements.  See the NOTICE file
    # distributed with this work for additional information
    # regarding copyright ownership.  The ASF licenses this file
    # to you under the Apache License, Version 2.0 (the
    # "License"); you may not use this file except in compliance
    # with the License.  You may obtain a copy of the License at
    #
    #   http://www.apache.org/licenses/LICENSE-2.0
    #
    # Unless required by applicable law or agreed to in writing,
    # software distributed under the License is distributed on an
    # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
    # KIND, either express or implied.  See the License for the
    # specific language governing permissions and limitations
    # under the License.
    #
    #---------

In [None]:
import findspark
findspark.init()
# findspark.find()
import pyspark 
from pyspark.sql import SparkSession

In [None]:
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "MarketStream") # 2 threads, app name 

ssc = StreamingContext(sc, 1)

In [None]:
help(systemml)

### Import SystemML API 

In [None]:
sc.version

In [None]:
from systemml import MLContext, dml
# Create a MLContext object
ml = MLContext(sc)
# And print the information of SystemML version
print(ml.info())

In [None]:
# Create a DML script for a Hello World' example and execute it using MLContext
script = dml(""" 
print('Hello World'); 
""")
ml.execute(script)

In [None]:
# Let's modify the above script to get the Hello World string
script = dml(""" 
s = 'Hello World' 
""").output("s")

hello_world_str = ml.execute(script).get("s")

print(hello_world_str)

### Import numpy, sklearn, and define some helper functions

In [None]:
import sys, os
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
plt.switch_backend('agg')

# Example 1: Matrix Multiplication

### SystemML script to generate a random matrix, perform matrix multiplication, and compute the sum of the output

In [None]:
script = """
    X = rand(rows=$nr, cols=1000, sparsity=0.5)
    A = t(X) %*% X
    s = sum(A)
"""
prog = dml(script).input('$nr', 1e6).output('s')
s = ml.execute(prog).get('s')
print(s)

# Load diabetes dataset from scikit-learn 

In [None]:
%matplotlib inline

In [None]:
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data[:, np.newaxis, 2]
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]
diabetes_y_train = np.matrix(diabetes.target[:-20]).T
diabetes_y_test = np.matrix(diabetes.target[-20:]).T

# plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
# plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

In [None]:
diabetes_y_train

# Example 2: Implement three different algorithms to train linear regression model

## Algorithm 1: Linear Regression - Direct Solve (no regularization) 

#### Preliminaries

1. The builtin function `solve(A, b)` computes the least squares solution for system of linear equations 
$$ Ax = b $$
for the vector x such that $$ || \; Ax \; – \; b \;|| $$ is minimized. It is important to note that this function can operate only on small-to-medium sized input matrix that can fit in the driver memory. See the [DML language reference](http://apache.github.io/systemml/dml-language-reference.html) for more details.

2. Linear regression model assumes that relationship between input explanatory (feature) variables X and numerical response variable y is linear. The goal is to estimate regression coefficient w (and residual variable) such that 

$$ y = \text{Normal}(Xw, \sigma^2) $$

$$ 
\begin{align*}
\text{Cost function, } J(w) &= \dfrac{1}{2} (Xw - y)^2
\end{align*}$$

Differentiating with respect to w,

$$ 
\begin{align*}
dw &= \dfrac{\partial}{\partial w} \dfrac{1}{2} (Xw - y)^2 \\
&= \dfrac{1}{2} 2 X^T (Xw - y) \\
&= (X^TX)w - X^Ty 
\end{align*}$$


#### Setting the gradient
To find minima, we set the derivative with respect to w to zero, 
$$\begin{align*}
(X^T X)w - (X^T y) &= 0 \\ 
w &= (X^T X)^{-1}(X^T y) \\
\text{Let } \; A &= X^T X \\
\text{and } \; b &= X^T y \\
\text{Therefore,} \; w &= solve(A, b)
\end{align*}$$

In [None]:
script = """
    # add constant feature to X to model intercept
    ones = matrix(1, rows=nrow(X), cols=1)
    X = cbind(X, ones)
    A = t(X) %*% X
    b = t(X) %*% y
    w = solve(A, b)
    bias = as.scalar(w[nrow(w),1])
    w = w[1:nrow(w)-1,]
"""

In [None]:
prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w', 'bias')
w, bias = ml.execute(prog).get('w','bias')
w = w.toNumPy()

In [None]:
# plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
# plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

# plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='blue', linestyle ='dotted')

## Algorithm 2: Linear Regression - Batch Gradient Descent (no regularization)

#### Algorithm
`Step 1: Start with an initial point 
while(not converged) { 
  Step 2: Compute gradient dw. 
  Step 3: Compute stepsize alpha.     
  Step 4: Update: w_new = w_old - alpha*dw 
}`

![Gradient Descent](http://blog.datumbox.com/wp-content/uploads/2013/10/gradient-descent.png)

#### Gradient formula

$$ dw = r = (X^T X)w - (X^T y) $$

#### Step size formula

We perform a line search to choose the step size `alpha` to minimize the cost function J(w). From basic calculus, `alpha` minimizes the function J(w) when the directional derivative with respect to `alpha` is zero. 

$$ alpha = \dfrac{r^T r}{ r^T X^T X r } $$

In [None]:
script = """
    # add constant feature to X to model intercepts
    ones = matrix(1, rows=nrow(X), cols=1)
    X = cbind(X, ones)
    max_iter = 100
    w = matrix(0, rows=ncol(X), cols=1)
    for(i in 1:max_iter){
        XtX = t(X) %*% X
        dw = XtX %*%w - t(X) %*% y
        alpha = (t(dw) %*% dw) / (t(dw) %*% XtX %*% dw)
        w = w - dw*alpha
    }
    bias = as.scalar(w[nrow(w),1])
    w = w[1:nrow(w)-1,]    
"""

In [None]:
prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w').output('bias')
w, bias = ml.execute(prog).get('w', 'bias')
w = w.toNumPy()

In [None]:
plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='red', linestyle ='dashed')

# Algorithm 3: Linear Regression - Conjugate Gradient (no regularization)

Problem with gradient descent: Takes very similar directions many times

Solution: Enforce conjugacy

`Step 1: Start with an initial point 
while(not converged) {
   Step 2: Compute gradient dw.
   Step 3: Compute stepsize alpha.
   Step 4: Compute next direction p by enforcing conjugacy with previous direction.
   Step 4: Update: w_new = w_old + alpha*p
}`

![Gradient Descent vs Conjugate Gradient](http://i.stack.imgur.com/zh1HH.png)


In [None]:
script = """
    # add constant feature to X to model intercepts
    X = cbind(X, matrix(1, rows=nrow(X), cols=1))
    m = ncol(X); i = 1; 
    max_iter = 20;
    w = matrix (0, rows = m, cols = 1); # initialize weights to 0
    dw = - t(X) %*% y; p = - dw;        # dw = (X'X)w - (X'y)
    norm_r2 = sum (dw ^ 2); 
    for(i in 1:max_iter) {
        q = t(X) %*% (X %*% p)
        alpha = norm_r2 / sum (p * q);  # Minimizes f(w - alpha*r)
        w = w + alpha * p;              # update weights
        dw = dw + alpha * q;           
        old_norm_r2 = norm_r2; norm_r2 = sum (dw ^ 2);
        p = -dw + (norm_r2 / old_norm_r2) * p; # next direction - conjugacy to previous direction
        i = i + 1;
    }
    bias = as.scalar(w[nrow(w),1])
    w = w[1:nrow(w)-1,]    
"""

In [None]:
prog = dml(script).input(X=diabetes_X_train, y=diabetes_y_train).output('w').output('bias')
w, bias = ml.execute(prog).get('w','bias')
w = w.toNumPy()

In [None]:
plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

plt.plot(diabetes_X_test, (w*diabetes_X_test)+bias, color='red', linestyle ='dashed')

# Example 3: Invoke existing SystemML algorithm script LinearRegDS.dml using MLContext API

In [None]:
from systemml import dmlFromResource
prog = dmlFromResource('scripts/algorithms/LinearRegDS.dml').input(X=diabetes_X_train, y=diabetes_y_train).input('$icpt',1.0).output('beta_out')
w = ml.execute(prog).get('beta_out')
w = w.toNumPy()
bias=w[1]

In [None]:
from systemml import dmlFromResource
prog = dmlFromResource('scripts/algorithms/LinearRegDS.dml').input(X=diabetes_X_train, y=diabetes_y_train).input('$icpt',1.0).output('beta_out')
w = ml.execute(prog).get('beta_out')
w = w.toNumPy()
bias=w[1]

In [None]:
# plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
# plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

# plt.plot(diabetes_X_test, (w[0]*diabetes_X_test)+bias, color='red', linestyle ='dashed')

# Example 4: Invoke existing SystemML algorithm using scikit-learn/SparkML pipeline like API

*mllearn* API allows a Python programmer to invoke SystemML's algorithms using scikit-learn like API as well as Spark's MLPipeline API.

In [None]:
# from pyspark.sql import SQLContext

from systemml.mllearn import LinearRegression

In [None]:
from pyspark.sql import SQLContext
from systemml.mllearn import LinearRegression
sqlCtx = SQLContext(sc)

In [None]:
regr = LinearRegression(sqlCtx)
# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

In [None]:
predictions = regr.predict(diabetes_X_test)

In [None]:
# Use the trained model to perform prediction
%matplotlib inline
plt.scatter(diabetes_X_train, diabetes_y_train,  color='black')
plt.scatter(diabetes_X_test, diabetes_y_test,  color='red')

plt.plot(diabetes_X_test, predictions, color='black')

## (Optional) Install OpenBLAS

# Example 5: Invoking a Keras model with SystemML

See [SystemML's deep learning documentation](http://apache.github.io/systemml/deep-learning) for more detail.

In [None]:
from systemml.mllearn import Keras2DML

In [None]:
from mlxtend.data import mnist_data
import numpy as np
from sklearn.utils import shuffle
# Download the MNIST dataset
X, y = mnist_data()
X, y = shuffle(X, y)
# Split the data into training and test
n_samples = len(X)
X_train = X[:int(.9 * n_samples)]
y_train = y[:int(.9 * n_samples)]
X_test = X[int(.9 * n_samples):]
y_test = y[int(.9 * n_samples):]
from keras.models import Sequential
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Dropout,Flatten
from keras import backend as K
from keras.models import Model
input_shape = (1,28,28) if K.image_data_format() == 'channels_first' else (28,28, 1)
keras_model = Sequential()
keras_model.add(Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=input_shape, padding='same'))
keras_model.add(MaxPooling2D(pool_size=(2, 2)))
keras_model.add(Conv2D(64, (5, 5), activation='relu', padding='same'))
keras_model.add(MaxPooling2D(pool_size=(2, 2)))
keras_model.add(Flatten())
keras_model.add(Dense(512, activation='relu'))
keras_model.add(Dropout(0.5))
keras_model.add(Dense(10, activation='softmax'))

# Scale the input features
scale = 0.00390625
X_train = X_train*scale
X_test = X_test*scale

from systemml.mllearn import Keras2DML
sysml_model = Keras2DML(spark, keras_model, input_shape=(1,28,28), weights='weights_dir')
sysml_model.setConfigProperty('sysml.native.blas', 'openblas')
sysml_model.setConfigProperty('sysml.native.blas.directory', os.path.join(os.getcwd(),'OpenBLAS-0.2.20/'))
# sysml_model.setGPU(True).setForceGPU(True)
sysml_model.summary()
sysml_model.fit(X_train, y_train)