In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from optimus import *

In [4]:
# Create optimus
op = Optimus()


             ____        __  _                     
            / __ \____  / /_(_)___ ___  __  _______
           / / / / __ \/ __/ / __ `__ \/ / / / ___/
          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
              /_/                                  
              
Just checking that all necessary environments vars are present...
-----
PYSPARK_PYTHON=python
SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
JAVA_HOME=C:\java8
-----
Starting or getting SparkSession and SparkContext...
Setting checkpoint folder ( local ). If you are in a cluster initialize optimus with master='your_ip' as param
Deleting previous folder if exists...
Creating the checkpoint directory...
Optimus successfully imported. Have fun :).


## Create dataframe
### Spark

This is ugly:

```
val someData = Seq(
  Row(8, "bat"),
  Row(64, "mouse"),
  Row(-27, "horse")
)

val someSchema = List(
  StructField("number", IntegerType, true),
  StructField("word", StringType, true)
)

val someDF = spark.createDataFrame(
  spark.sparkContext.parallelize(someData),
  StructType(someSchema)
)```

In [5]:
# Thanks Mr Powers
df = op.create.df(
    [
                ("words", "str", True),
                ("num", "int", True),
                ("animals", "str", True),
                ("thing", StringType(), True),
                ("two strings", StringType(), True),
                ("filter", StringType(), True),
                ("num 2", "string", True),
                ("date", "string", True),
                ("num 3", "string", True)
                
            ],[
                ("  I like     fish  ", 1, "dog", "&^%$#housé", "cat-car", "a","1", "20150510", "3"),
                ("    zombies", 2, "cat", "tv", "dog-tv", "b","2", "20160510", "3"),
                ("simpsons   cat lady", 2, "frog", "table","eagle-tv-plus","1","3", "20170510", "4"),
                (None, 3, "eagle", "glass", "lion-pc", "c","4", "20180510", "5"),
    
            ]
            )

df.show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  1|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|
|               null|  3|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+



In [6]:
from pyspark.sql.functions import udf

# Use udf to define a row-at-a-time udf

# Input/output are both a single double value
@udf('int')
def plus_one(v):
    return v+1

df.withColumn('new_num', plus_one(df.num)).show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|new_num|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+
|  I like     fish  |  1|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|      2|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|      3|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|      3|
|               null|  3|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|      4|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+



In [7]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Use pandas_udf to define a Pandas UDF
@pandas_udf('double', PandasUDFType.SCALAR)
# Input/output are both a pandas.Series of doubles

def pandas_plus_one(v):
    return v + 1

df.withColumn('new_num', pandas_plus_one(df.num)).show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|new_num|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+
|  I like     fish  |  1|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|    2.0|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|    3.0|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|    3.0|
|               null|  3|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|    4.0|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+-------+



In [9]:
# Choose a column for analyzing
detector = op.OutlierDetector(df,"num")
# With the outliers() method you can use MAD to detect if there is an outlier in your column
detector.outliers()
# And with the run() method you can see which values are not outliers
detector.run()
# Finally with the delete_outliers() method you can delete existing outliers in your column. 
# This will modify the dataframe we have used when instantiating the OutlierDetector
# (deleting the whole row that contains the outlier value), but the original dataframe that we 
# read from disk will be intact.
detector.delete_outliers().show()

AttributeError: 'Optimus' object has no attribute 'OutlierDetector'

In [13]:
df.show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  1|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|
|               null|  3|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+



In [6]:
from pyspark.sql import functions as F

#def my_func(attr):
#    def inner(df):
#        print(attr)
#        return F.when(F.col(df)>0 ,1)
#    return inner

def my_func(col_name, attr):
    return F.when(F.col(col_name)>0 ,2)

def udf_my_func(value, attr):
    return str(value +1)
    
    
def apply(col,func, type = "columnExp"):
    if type is "udf":
        def apply_func(attr, func):            
            return F.udf(lambda l: func(l, attr))
           
    else:
        def apply_func(attr, func):
            def inner(df):
                return func(df, attr)
            return inner                
    
    #return df.withColumn(col, func("function attrs")(col))
    return df.withColumn(col, apply_func("function attrs", func)(col))
    
apply("num", my_func).show()
apply("num", udf_my_func, "udf").show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  2|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|
|               null|  2|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  2|    dog|&^%$#hous

In [9]:
from pyspark.sql import functions as F
def my_func(col_name, attr):
    return F.when(F.col(col_name)>0 ,2)

def udf_my_func(value, attr):
    return str(value +1)
    
df.cols().apply("num", my_func).show()
df.cols().apply("num", udf_my_func, "udf").show()

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  2|    dog|&^%$#housé|      cat-car|     a|    1|20150510|    3|
|            zombies|  2|    cat|        tv|       dog-tv|     b|    2|20160510|    3|
|simpsons   cat lady|  2|   frog|     table|eagle-tv-plus|     1|    3|20170510|    4|
|               null|  2|  eagle|     glass|      lion-pc|     c|    4|20180510|    5|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+

+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|              words|num|animals|     thing|  two strings|filter|num 2|    date|num 3|
+-------------------+---+-------+----------+-------------+------+-----+--------+-----+
|  I like     fish  |  2|    dog|&^%$#hous

In [12]:
from pyspark.sql.functions import udf, col
from pyspark.sql import SQLContext
sqlContext = SQLContext(op.get_sc())

#sample data
a= sqlContext.createDataFrame([("A", 20), ("B", 30), ("D", 80)],["Letter", "distances"])
label_list = ["Great", "Good", "OK", "Please Move", "Dead"]

def cate(label, feature_list):

    if feature_list == 0:
        return label[4]
    else:  #you may need to add 'else' condition as well otherwise 'null' will be added in this case
        return label

def udf_score(label_list):
    return udf(lambda l: cate(l, label_list))
a.withColumn("category", udf_score(label_list)(col("distances"))).show()

+------+---------+--------+
|Letter|distances|category|
+------+---------+--------+
|     A|       20|      20|
|     B|       30|      30|
|     D|       80|      80|
+------+---------+--------+



In [51]:
columns_attr =  [('col1','1','3'),('col2','2','4'),('col3','3','5')]
cols = [('col1'),('col2'),('col3')]
attr = [(1,4),(5,4),(5,6)]


In [132]:
cols = ['col1','col2','col3']
attr =  [(1,4),(5,4),(5,6)]

for i, (a, b) in enumerate(zip(cols, attr)):
    print (a, b)

col1 (1, 4)
col2 (5, 4)
col3 (5, 6)


In [142]:
columns_attr =  [('col1','1','3'),('col2','2','4'),('col3','3','5')]
#columns_attr =  ['col1','col2']
def parse(attr):
    cols = [(i[0:1][0]) for i in attr]
    attrs = [(i[1:]) for i in attr]

    return cols, attrs

cols, attrs = parse(columns_attr)
print(cols)
print(attrs)

cols = ['num', 'num 2']
#attrs =[('double',), ('double',)]
attrs = None

enum = 
enum = cols



for i, (a, b) in enumerate(zip(cols, attrs)):
    print (a, b)

['col1', 'col2', 'col3']
[('1', '3'), ('2', '4'), ('3', '5')]


TypeError: zip argument #2 must support iteration