In [1]:
import codecs
import os
import pickle

import dill
import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.pipeline import Pipeline, Transformer
from pyspark.sql import Window, DataFrame
from pyspark.sql.functions import udf, lit, col, when, avg, countDistinct, year, month
from pyspark.sql.types import IntegerType

%load_ext autoreload
%autoreload 2

import importlib

In [2]:
# Obtain the Spark driver context for this notebook.
# getOrCreate reuses an already-running context, so re-executing this cell in a
# live kernel no longer fails with "Cannot run multiple SparkContexts at once";
# on a fresh kernel it starts a new context with the same app name as before.
sc = SparkContext.getOrCreate(conf=SparkConf().setAppName('jlg'))
# SQL entry point used by the cells below to build and read DataFrames.
sqlcontext = SQLContext(sc)

In [3]:
# Build a small synthetic regression dataset, write it to CSV, and load it into Spark.

# Use a seeded generator so the CSV and the DataFrame are identical on every
# Restart & Run All (the unseeded global generator produced new data each run).
rng = np.random.RandomState(42)

X1 = rng.rand(1000).reshape(-1,1)
X2 = rng.rand(1000).reshape(-1,1)
X3 = rng.rand(1000).reshape(-1,1)
X4 = rng.rand(1000).reshape(-1,1)
# The label is an exact linear combination of the four features (no noise term).
Y = X1*2 + X2*4 + X3*2 + X4*1

m = np.hstack([X1,X2,X3,X4,Y])
dataset = pd.DataFrame(m)
dataset.columns = ['X1','X2','X3','X4','Label']

# to_csv does not create missing parent directories, so make sure 'data/' exists.
os.makedirs('data', exist_ok=True)
dataset.to_csv('data/foo.csv', index=False)

# The Spark copy renames the feature columns to F1..F4; the CSV keeps X1..X4.
df = sqlcontext.createDataFrame(dataset,schema=["F1", "F2", "F3", "F4", "Label"])

df.show(5)

+-------------------+-------------------+-------------------+-------------------+------------------+
|                 F1|                 F2|                 F3|                 F4|             Label|
+-------------------+-------------------+-------------------+-------------------+------------------+
|0.17417849792799078|0.43009799536538307| 0.8422974270679776| 0.6044842735723016| 4.357828105025771|
| 0.1585147214988274|0.19787563334191083|0.39240791717944856|  0.504527825099862|2.3978756358240574|
|  0.162428980461798| 0.6369621354799332| 0.9080660307106092| 0.9221890530493648| 5.611027617313912|
|0.21734977232977237| 0.5961077334846863| 0.3804233377117411| 0.6080774689085126| 4.188054622930284|
|0.21017937428222921| 0.6413138500857783|0.15822538788326335|0.10113932772551038|3.4032042523996084|
+-------------------+-------------------+-------------------+-------------------+------------------+
only showing top 5 rows



In [4]:
# create custom transformer

def Linear_Scaler(params):
    """
    Read 'data/foo.csv' and add a scaled copy of column 'X1' as column 'F1'.

    Despite the name, this is a plain function, not a pyspark.ml Transformer:
    it loads the CSV through the supplied context and returns the resulting
    DataFrame with one extra column.

    Parameters
    ----------
    params : dict
        'context' : object exposing ``read.csv(path, header=..., inferSchema=...)``
            (the notebook passes a SQLContext). Required.
        'alpha' : multiplicative coefficient applied to 'X1'. Optional;
            defaults to 1.0 (F1 equals X1), matching the registered feature
            metadata, which describes alpha as optional.

    Returns
    -------
    The DataFrame read from 'data/foo.csv' with an added 'F1' column equal to
    ``X1 * alpha``.

    Raises
    ------
    KeyError
        If 'context' is missing from ``params``.
    """
    context = params['context']
    # A missing alpha previously raised KeyError; fall back to the identity scale.
    alpha = params.get('alpha', 1.0)

    inputCol = 'X1'
    outputCol = 'F1'

    source_df = context.read.csv('data/foo.csv', header=True, inferSchema=True)

    # Column arithmetic is done by Spark itself; no Python UDF is needed.
    return source_df.withColumn(outputCol, source_df[inputCol] * alpha)

    

In [5]:
# Call the function directly (outside the store) and preview the first rows.
foo = Linear_Scaler(dict(context=sqlcontext, alpha=2.0))
foo.show(n=3)


+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|                 X1|                 X2|                 X3|                X4|             Label|                 F1|
+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|0.17417849792799078|0.43009799536538307| 0.8422974270679776|0.6044842735723016| 4.357828105025771|0.34835699585598157|
| 0.1585147214988274|0.19787563334191083|0.39240791717944856| 0.504527825099862|2.3978756358240574| 0.3170294429976548|
|  0.162428980461798| 0.6369621354799332| 0.9080660307106092|0.9221890530493648| 5.611027617313912|  0.324857960923596|
+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
only showing top 3 rows



In [10]:
from src.core.store import Store
from src.core.feature_meta import Feature_Meta

# Create the store from its JSON configuration file.
store = Store('store_config.json')

# Metadata record for the feature named 'foo_scaler'.
f = Feature_Meta('foo_scaler')
f.author = 'Kai Niu'
# Human-readable description of each key expected in the feature's params dict.
f.params = {
    'context': 'The pyspark context, must provided.',
    'alpha': 'the scaler coef, optional.',
}
f.comment = 'scale the data by the coef alpha'

# Print the store's configuration summary.
store.info()

== Kai's Feature Store Information ==
- Meta Data Manager: default
- Supported Meta Data managers:  ['flat file meta manager (default)']
- Supported Persistors:  ['flat file persistor (default)']
- Supported Serializers:  ['dill Serializer (default)']


In [11]:
# Register the Linear_Scaler function in the store under the metadata record f.
# NOTE(review): presumably the function is serialized with dill (the serializer
# listed by store.info()) and assigned a new uid — confirm against Store.register.
store.register(f, Linear_Scaler)

In [12]:
# NOTE(review): this uid is hard-coded from an earlier session; on a fresh
# Restart & Run All it may not exist in the store — confirm how Store.remove
# handles an unknown uid, or drop this call once the stale entry is gone.
store.remove('145f33e9-bbaf-443e-ab15-d30e86770e65')
# List the features currently held by the store.
store.catalog()

== Feature Catalog ==
foo_scaler 	 e1c95611-cf7d-4097-9c41-4d564f8b0483 	 04, Jun 2019 	 Kai Niu


In [16]:
# Show the stored metadata for one feature, looked up by its uid.
# NOTE(review): the uid is copied by hand from the catalog output of a previous
# run; presumably a new uid is generated on each registration — confirm.
store.feature_info('e1c95611-cf7d-4097-9c41-4d564f8b0483')

== Feature Detail ==
foo_scaler 	 e1c95611-cf7d-4097-9c41-4d564f8b0483 	 04, Jun 2019 	 Kai Niu
params: 
     context: The pyspark context, must provided.
     alpha: the scaler coef, optional.
comments: scale the data by the coef alpha


In [17]:
# Arguments handed to the stored feature: the live SQL context and the scale factor.
params = dict(context=sqlcontext, alpha=2.0)
# uid of the 'foo_scaler' entry as printed in the feature catalog.
uid = 'e1c95611-cf7d-4097-9c41-4d564f8b0483'

# Check the feature out of the store with these params; the result is shown in the next cell.
p = store.checkout(uid, params)

In [18]:
# Preview the first 3 rows of the checkout result; the output matches the
# direct Linear_Scaler call shown earlier in the notebook.
p.show(3)

+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|                 X1|                 X2|                 X3|                X4|             Label|                 F1|
+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
|0.17417849792799078|0.43009799536538307| 0.8422974270679776|0.6044842735723016| 4.357828105025771|0.34835699585598157|
| 0.1585147214988274|0.19787563334191083|0.39240791717944856| 0.504527825099862|2.3978756358240574| 0.3170294429976548|
|  0.162428980461798| 0.6369621354799332| 0.9080660307106092|0.9221890530493648| 5.611027617313912|  0.324857960923596|
+-------------------+-------------------+-------------------+------------------+------------------+-------------------+
only showing top 3 rows

