In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("..")

In [3]:
from optimus import Optimus

In [4]:
# Create optimus
op = Optimus()

## Create dataframe
### Spark

This is ugly:

```
val someData = Seq(
  Row(8, "bat"),
  Row(64, "mouse"),
  Row(-27, "horse")
)

val someSchema = List(
  StructField("number", IntegerType, true),
  StructField("word", StringType, true)
)

val someDF = spark.createDataFrame(
  spark.sparkContext.parallelize(someData),
  StructType(someSchema)
)```

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType

df = op.create.df(
            [
                ("words", "str", True),
                ("num", "int", True),
                ("animals", "str", True),
                ("thing", StringType(), True),
                ("two strings", StringType(), True),
                ("filter", StringType(), True),
                ("num 2", "string", True),
                ("col_array",  ArrayType(StringType()), True),
                ("col_int",  ArrayType(IntegerType()), True)

            ]
,
[
                ("  I like     fish  ", 1, "dog", "housé", "cat-car", "a","1",["baby", "sorry"],[1,2,3]),
                ("    zombies", 2, "cat", "tv", "dog-tv", "b","2",["baby 1", "sorry 1"],[3,4]),
                ("simpsons   cat lady", 2, "frog", "table","eagle-tv-plus","1","3", ["baby 2", "sorry 2"], [5,6,7]),
                (None, 3, "eagle", "glass", "lion-pc", "c","4", ["baby 3", "sorry 3"] ,[7,8])
            ])

df.show()

+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+
|              words|num|animals|thing|  two strings|filter|num 2|        col_array|  col_int|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|    [baby, sorry]|[1, 2, 3]|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|[baby 1, sorry 1]|   [3, 4]|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|[baby 2, sorry 2]|[5, 6, 7]|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|[baby 3, sorry 3]|   [7, 8]|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+



## Create Columns
### Spark
* You can not create multiple columns at the same time
* You need to use the lit function. lit???

### Pandas
* Assing function seems to do the job https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas


In [6]:
df = df.cols().append("new_col_1", 1)
df.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [7]:
from pyspark.sql.functions import *

df.cols().append([
    ("new_col_2", 2.22),
    ("new_col_3", lit(3)),
    ("new_col_4", "test"),
    ("new_col_5", df['num']*2)
    ]).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|new_col_2|new_col_3|new_col_4|new_col_5|
+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|     2.22|        3|     test|        2|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|     2.22|        3|     test|        4|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|     2.22|        3|     test|        4|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|     2.22|        3|     test|        6|
+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+



## Select columns
### Spark
* You can not select columns by string and index at the same time

### Pandas
* You can not select columns by string and index at the same time

In [8]:
columns = ["words", 1, "animals", 3, 0]
df.cols().filter(columns).show()

+-------------------+---+-------+-----+-------------------+
|              words|num|animals|thing|              words|
+-------------------+---+-------+-----+-------------------+
|  I like     fish  |  1|    dog|housé|  I like     fish  |
|            zombies|  2|    cat|   tv|            zombies|
|simpsons   cat lady|  2|   frog|table|simpsons   cat lady|
|               null|  3|  eagle|glass|               null|
+-------------------+---+-------+-----+-------------------+



In [9]:
df.cols().filter("n.*", regex = True).show()

+---+-----+---------+
|num|num 2|new_col_1|
+---+-----+---------+
|  1|    1|        1|
|  2|    2|        1|
|  2|    3|        1|
|  3|    4|        1|
+---+-----+---------+



In [10]:
df.cols().filter("*", data_type = "str").show()
df.show()

+------+-------------+-------------------+-------+-----+-----+
|filter|  two strings|              words|animals|thing|num 2|
+------+-------------+-------------------+-------+-----+-----+
|     a|      cat-car|  I like     fish  |    dog|housé|    1|
|     b|       dog-tv|            zombies|    cat|   tv|    2|
|     1|eagle-tv-plus|simpsons   cat lady|   frog|table|    3|
|     c|      lion-pc|               null|  eagle|glass|    4|
+------+-------------+-------------------+-------+-----+-----+

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|

## Rename Column
### Spark
You can not rename multiple columns using Spark Vanilla API


### Pandas
* Almost the same behavior https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html

In [11]:
df.cols().rename([('num','number')]).show()

+-------------------+------+-------+-----+-------------+------+-----+---------+
|              words|number|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+------+-------+-----+-------------+------+-----+---------+
|  I like     fish  |     1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|     2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|     2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|     3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+------+-------+-----+-------------+------+-----+---------+



In [12]:
df.cols().rename(func = str.lower).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [12]:
df.cols().rename(func = str.upper).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              WORDS|NUM|ANIMALS|THING|  TWO STRINGS|FILTER|NUM 2|NEW_COL_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



## Cast a columns

### Spark
* Can not cast multiple columns

### Pandas
This is a opinionated way to handle column casting. 
One of the first thing that every data cleaning process need to acomplish is define a data dictionary.
Because of that we prefer to create a tuple like this:

df.cols().cast(
[("words","str"),
("num","int"),
("animals","float"),
("thing","str")]
)


https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.astype.html

In [13]:
df.cols().cast([("num", "string"),("num 2", "integer")])

DataFrame[words: string, num: string, animals: string, thing: string, two strings: string, filter: string, num 2: int, new_col_1: int]

## Keep columns
### Spark
* You can not remove multiple columns

### Pandas
* Handle in pandas with drop


In [14]:
from pyspark.sql.functions import *
df.withColumn("num", col("num").cast(StringType()))


DataFrame[words: string, num: string, animals: string, thing: string, two strings: string, filter: string, num 2: string, new_col_1: int]

In [15]:
df.cols().keep("num").show()

+---+
|num|
+---+
|  1|
|  2|
|  2|
|  3|
+---+



## Move columns
### Spark
Do not exist in spark

### Pandas
Do not exist in pandas

In [16]:
df.cols().move("words", "thing", "after").show()

+---+-------+-----+-------------------+-------------+------+-----+---------+
|num|animals|thing|              words|  two strings|filter|num 2|new_col_1|
+---+-------+-----+-------------------+-------------+------+-----+---------+
|  1|    dog|housé|  I like     fish  |      cat-car|     a|    1|        1|
|  2|    cat|   tv|            zombies|       dog-tv|     b|    2|        1|
|  2|   frog|table|simpsons   cat lady|eagle-tv-plus|     1|    3|        1|
|  3|  eagle|glass|               null|      lion-pc|     c|    4|        1|
+---+-------+-----+-------------------+-------------+------+-----+---------+



## Sorting Columns
### Spark
You can not sort columns using Spark Vanilla API 

### Pandas
Similar to pandas
http://pandas.pydata.org/pandas-docs/version/0.19/generated/pandas.DataFrame.sort_values.html#pandas.DataFrame.sort_values

In [17]:
df.cols().sort().show()

+-------+------+---------+---+-----+-----+-------------+-------------------+
|animals|filter|new_col_1|num|num 2|thing|  two strings|              words|
+-------+------+---------+---+-----+-----+-------------+-------------------+
|    dog|     a|        1|  1|    1|housé|      cat-car|  I like     fish  |
|    cat|     b|        1|  2|    2|   tv|       dog-tv|            zombies|
|   frog|     1|        1|  2|    3|table|eagle-tv-plus|simpsons   cat lady|
|  eagle|     c|        1|  3|    4|glass|      lion-pc|               null|
+-------+------+---------+---+-----+-----+-------------+-------------------+



In [20]:
df.cols().sort(order = "desc").show()

+-------------------+-------------+-----+-----+---+---------+------+-------+
|              words|  two strings|thing|num 2|num|new_col_1|filter|animals|
+-------------------+-------------+-----+-----+---+---------+------+-------+
|  I like     fish  |      cat-car|housé|    1|  1|        1|     a|    dog|
|            zombies|       dog-tv|   tv|    2|  2|        1|     b|    cat|
|simpsons   cat lady|eagle-tv-plus|table|    3|  2|        1|     1|   frog|
|               null|      lion-pc|glass|    4|  3|        1|     c|  eagle|
+-------------------+-------------+-----+-----+---+---------+------+-------+



## Drop columns
### Spark 
* You can not delete multiple colums

### Pandas
* Almost the same as pandas
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [21]:
df2 = df.cols().drop("num")
df2 = df.cols().drop(["num","words"])
df2.show()

+-------+-----+-------------+------+-----+---------+
|animals|thing|  two strings|filter|num 2|new_col_1|
+-------+-----+-------------+------+-----+---------+
|    dog|housé|      cat-car|     a|    1|        1|
|    cat|   tv|       dog-tv|     b|    2|        1|
|   frog|table|eagle-tv-plus|     1|    3|        1|
|  eagle|glass|      lion-pc|     c|    4|        1|
+-------+-----+-------------+------+-----+---------+



## Chaining

cols y rows functions are used to organize and encapsulate optimus' functionality apart of Apache Spark Dataframe API. This have a disadvantage at chaining time because we need to user invoke cols or rows in every step.

At the same time it can be helpfull when you look at the code because every line is self explained.

In [23]:
df\
    .cols().rename([('num','number')])\
    .cols().drop(["number","words"])\
    .withColumn("new_col_2", lit("spongebob"))\
    .cols().append("new_col_1", 1)\
    .cols().sort(order= "desc")\
    .show()

+-------------+-----+-----+---------+---------+------+-------+
|  two strings|thing|num 2|new_col_2|new_col_1|filter|animals|
+-------------+-----+-----+---------+---------+------+-------+
|      cat-car|housé|    1|spongebob|        1|     a|    dog|
|       dog-tv|   tv|    2|spongebob|        1|     b|    cat|
|eagle-tv-plus|table|    3|spongebob|        1|     1|   frog|
|      lion-pc|glass|    4|spongebob|        1|     c|  eagle|
+-------------+-----+-----+---------+---------+------+-------+



## Split Columns
### Spark

### Pandas

In [6]:
df.cols().split("two strings","-", n=3).show()

+-------------------+---+-------+-----+-------------+------+-----+-----+-----+-----+
|              words|num|animals|thing|  two strings|filter|num 2|COL_0|COL_1|COL_2|
+-------------------+---+-------+-----+-------------+------+-----+-----+-----+-----+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|  cat|  car| null|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|  dog|   tv| null|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|eagle|   tv| plus|
|               null|  3|  eagle|glass|      lion-pc|     c|    4| lion|   pc| null|
+-------------------+---+-------+-----+-------------+------+-----+-----+-----+-----+



In [25]:
df.cols().split("two strings","-", get = 1).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+-----+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|COL_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+-----+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|  car|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|   tv|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|   tv|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|   pc|
+-------------------+---+-------+-----+-------------+------+-----+---------+-----+



## Impute
### Spark

In [26]:
df_cast =df.cols().cast([("num","double"),("num 2", "double")])
df_cast.dtypes

df_cast.cols().impute(["num","num 2"], ["out_a","out_B"], strategy="mean")
df_cast.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |1.0|    dog|housé|      cat-car|     a|  1.0|        1|
|            zombies|2.0|    cat|   tv|       dog-tv|     b|  2.0|        1|
|simpsons   cat lady|2.0|   frog|table|eagle-tv-plus|     1|  3.0|        1|
|               null|3.0|  eagle|glass|      lion-pc|     c|  4.0|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



## Get columns by type
### Spark

### Pandas

In [27]:
df.cols().filter_by_dtypes("int").show()

+---+---------+
|num|new_col_1|
+---+---------+
|  1|        1|
|  2|        1|
|  2|        1|
|  3|        1|
+---+---------+



## Apply custom function
### Spark
You need to declare a UDF Spark function

### Pandas
Almost the same behavior

In [28]:
df.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [29]:
def sum_(val, attr):
    return attr

df.cols().apply_by_dtypes("filter", sum_, "string", "10", data_type="integer").show()

Py4JJavaError: An error occurred while calling o322.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 24.0 failed 1 times, most recent failure: Lost task 0.0 in stage 24.0 (TID 24, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 219, in main
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 139, in read_udfs
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 119, in read_single_udf
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 59, in read_command
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'optimus'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:171)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:121)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:90)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:131)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:93)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3273)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.GeneratedMethodAccessor29.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 219, in main
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 139, in read_udfs
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 119, in read_single_udf
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 59, in read_command
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'optimus'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:171)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:121)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:90)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:131)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:93)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [46]:
def func(val, attr):
    return val + attr

df.cols().apply(["num", "num"], func, "int", 32 ,"udf").show()

Using 'UDFs' to process column 'num'
Using 'UDFs' to process column 'num'
+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  | 65|    dog|housé|      cat-car|     a|    1|        1|
|            zombies| 66|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady| 66|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null| 67|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [31]:
from optimus.functions import filter_row_by_data_type as fbdt

df.where(fbdt("filter", "integer")).show()

Py4JJavaError: An error occurred while calling o414.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 26.0 failed 1 times, most recent failure: Lost task 0.0 in stage 26.0 (TID 26, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 219, in main
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 139, in read_udfs
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 119, in read_single_udf
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 59, in read_command
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'optimus'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:171)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:121)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:90)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:131)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:93)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	at java.lang.Thread.run(Unknown Source)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1602)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1590)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1589)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1589)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1823)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1772)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1761)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:363)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
	at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collectFromPlan(Dataset.scala:3273)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2484)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2698)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:254)
	at sun.reflect.GeneratedMethodAccessor29.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
	at java.lang.reflect.Method.invoke(Unknown Source)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 219, in main
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 139, in read_udfs
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 119, in read_single_udf
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 59, in read_command
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "C:\opt\spark\spark-2.3.1-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ModuleNotFoundError: No module named 'optimus'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:171)
	at org.apache.spark.sql.execution.python.ArrowPythonRunner$$anon$1.read(ArrowPythonRunner.scala:121)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec$$anon$2.<init>(ArrowEvalPythonExec.scala:90)
	at org.apache.spark.sql.execution.python.ArrowEvalPythonExec.evaluate(ArrowEvalPythonExec.scala:88)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:131)
	at org.apache.spark.sql.execution.python.EvalPythonExec$$anonfun$doExecute$1.apply(EvalPythonExec.scala:93)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$23.apply(RDD.scala:800)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
	... 1 more


In [29]:
from optimus.functions import abstract_udf as audf 

def func(val, attr):
    return val>1

df.where(audf("num", func, "bool")).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [30]:
def func(val, attr):
    return val + attr

df.cols().apply(["num", "num"], func, "int", 10).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  | 21|    dog|housé|      cat-car|     a|    1|        1|
|            zombies| 22|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady| 22|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null| 23|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [31]:
# https://stackoverflow.com/questions/31400143/column-filtering-in-pyspark
from optimus.functions import abstract_udf as audf 

def func(val, attr):
    return val+attr[0]

df.withColumn("num", audf ("num", func, "int", [10,20])).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  | 11|    dog|housé|      cat-car|     a|    1|        1|
|            zombies| 12|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady| 12|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null| 13|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [32]:
from pyspark.sql import functions as F
def func(col_name, attr):
    # return F.col(col_name) + 1
    return F.when(F.col(col_name)>0 ,2)

df.cols().apply_exp(["num","num 2"], func).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  2|    dog|housé|      cat-car|     a|    2|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    2|        1|
|               null|  2|  eagle|glass|      lion-pc|     c|    2|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [33]:
from pyspark.sql import functions as F
def func(col_name, attr):
    return F.upper(F.col(col_name))

df.cols().apply_exp(["two strings","animals"], func).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    DOG|housé|      CAT-CAR|     a|    1|        1|
|            zombies|  2|    CAT|   tv|       DOG-TV|     b|    2|        1|
|simpsons   cat lady|  2|   FROG|table|EAGLE-TV-PLUS|     1|    3|        1|
|               null|  3|  EAGLE|glass|      LION-PC|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [34]:
def func(val, attr):
    return 10

col = "num"

df.cols().apply(col, func, "int", when= df["num"]>1).show()

df.cols().apply(col, func, "int", when= fbdt(col, "integer")).show()


+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies| 10|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady| 10|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null| 10|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  | 10|    dog|housé|      cat-car|     a|    1|        1|
|            zombies| 10|    cat|   tv|       dog-tv|     b|    2|        1

## Count Nulls
### Spark

### Pandas

In [35]:
import numpy as np

df_null = op.get_ss().createDataFrame(
    [(1, 1, None), (1, 2, float(5)), (1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')), (1, 6, float('nan'))],
    ('session', "timestamp1", "id2"))

In [36]:
df_null.cols().count_na("id2")

{'id2': 5}

In [37]:
df_null.cols().count_na("*")

{'id2': 5, 'session': 0, 'timestamp1': 0}

## Count uniques
### Spark

### Pandas


In [38]:
df.cols().count_uniques("*")

{'animals': 4,
 'filter': 4,
 'new_col_1': 1,
 'num': 3,
 'num 2': 4,
 'thing': 4,
 'two strings': 4,
 'words': 4}

## Unique
### Spark
An abstraction of distinct to be use in multiple columns at the same time

### Pandas
Similar behavior than pandas

In [39]:
df.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [40]:
df_distinct = op.create.df(
            [
                ("words", "str", True),
                ("num", "int", True)
            ],
[
                ("  I like     fish  ", 1),
                ("    zombies", 2),
                ("simpsons   cat lady", 2),
                (None, 3),
                  (None, 0)
            ])

In [41]:
df_distinct\
    .select("num")\
    .cols().unique().show()

+---+
|num|
+---+
|  1|
|  3|
|  2|
|  0|
+---+



## Count Zeros

In [42]:
df_zeros = df_distinct
df_zeros.show()
df_zeros.cols().count_zeros("*")

+-------------------+---+
|              words|num|
+-------------------+---+
|  I like     fish  |  1|
|            zombies|  2|
|simpsons   cat lady|  2|
|               null|  3|
|               null|  0|
+-------------------+---+



{'num': 1, 'words': 0}

## Column Data Types

In [43]:
df.cols().dtypes('*')

{'animals': 'string',
 'filter': 'string',
 'new_col_1': 'int',
 'num': 'int',
 'num 2': 'string',
 'thing': 'string',
 'two strings': 'string',
 'words': 'string'}

## Replace

In [44]:
df.cols().replace(["two strings","animals"], ["dog-tv", "cat", "eagle", "fish"], "animals").show()

df.cols().replace('animals',["dog","cat"],"animals").show()

df.cols().replace('animals',[("dog","animals"),("cat","animals")]).show()

df.cols().replace('animals',"dog",'animal').show()

df.cols().replace('num',["3",2], 10).show()

df.cols().replace('num',[("3",6),(2,6)]).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|animals|   tv|      animals|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|animals|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|animals|housé|      cat-car|     a|    1|        1|
|            zombies|  2|animals|   tv|       dog-tv|     b|    2|        1

In [45]:
df.cols().replace('*','.*[Cc]at.*', 'animal', regex=True).show()
df.cols().replace('*','cat', 'animal', regex=True).show()


+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|       animal|     a|    1|        1|
|            zombies|  2| animal|   tv|       dog-tv|     b|    2|        1|
|             animal|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+

+--------------------+---+-------+-----+-------------+------+-----+---------+
|               words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+--------------------+---+-------+-----+-------------+------+-----+---------+
|   I like     fish  |  1|    dog|housé|   animal-car|     a|    1|        1|
|             zombies|  2| animal|   tv|       dog-tv|     b|    2|    

In [34]:
"array" in "array<string>"

True

## Nest

In [40]:
df.dtypes

[('words', 'string'),
 ('num', 'int'),
 ('animals', 'string'),
 ('thing', 'string'),
 ('two strings', 'string'),
 ('filter', 'string'),
 ('num 2', 'string'),
 ('col_array', 'array<string>')]

In [41]:
df.cols().unnest("col_array").show()

+-------------------+---+-------+-----+-------------+------+-----+-----------------+------+-------+-----+
|              words|num|animals|thing|  two strings|filter|num 2|        col_array| COL_0|  COL_1|COL_2|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+------+-------+-----+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|    [baby, sorry]|  baby|  sorry| null|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|[baby 1, sorry 1]|baby 1|sorry 1| null|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|[baby 2, sorry 2]|baby 2|sorry 2| null|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|[baby 3, sorry 3]|baby 3|sorry 3| null|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+------+-------+-----+



In [132]:
df.dtypes


[('words', 'string'),
 ('num', 'int'),
 ('animals', 'string'),
 ('thing', 'string'),
 ('two strings', 'string'),
 ('filter', 'string'),
 ('num 2', 'string'),
 ('col_array', 'array<string>')]

In [131]:
df.cols().dtypes("*")

[{'words': StringType},
 {'num': IntegerType},
 {'animals': StringType},
 {'thing': StringType},
 {'two strings': StringType},
 {'filter': StringType},
 {'num 2': StringType},
 {'col_array': ArrayType(StringType,true)}]

## Nest

In [33]:
df.cols().nest(["num", "num"], "nested",shape="vector").show()



+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+
|              words|num|animals|thing|  two strings|filter|num 2|        col_array|  col_int|   nested|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|    [baby, sorry]|[1, 2, 3]|[1.0,1.0]|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|[baby 1, sorry 1]|   [3, 4]|[2.0,2.0]|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|[baby 2, sorry 2]|[5, 6, 7]|[2.0,2.0]|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|[baby 3, sorry 3]|   [7, 8]|[3.0,3.0]|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+



In [7]:
df.cols().nest(["animals", "two strings"], ["nested"],shape="string").show()

['animals', 'two strings']
Using 'Column Expression' to process column 'nested'
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+------------------+
|              words|num|animals|thing|  two strings|filter|num 2|        col_array|  col_int|            nested|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+------------------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|    [baby, sorry]|[1, 2, 3]|       dog cat-car|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|[baby 1, sorry 1]|   [3, 4]|        cat dog-tv|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|[baby 2, sorry 2]|[5, 6, 7]|frog eagle-tv-plus|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|[baby 3, sorry 3]|   [7, 8]|     eagle lion-pc|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+------------------+



In [11]:
df.cols().nest(["animals", "two strings"], ["nested"],shape="array").show()

Using 'Column Expression' to process column 'nested'
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+--------------------+
|              words|num|animals|thing|  two strings|filter|num 2|        col_array|  col_int|              nested|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+--------------------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|    [baby, sorry]|[1, 2, 3]|      [dog, cat-car]|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|[baby 1, sorry 1]|   [3, 4]|       [cat, dog-tv]|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|[baby 2, sorry 2]|[5, 6, 7]|[frog, eagle-tv-p...|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|[baby 3, sorry 3]|   [7, 8]|    [eagle, lion-pc]|
+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+--------------------+



## Unnest
Works with arrays, Vectors and strings(ala split)

In [17]:
df\
    .cols().unnest(["col_array"])\
    .cols().unnest(["col_int"])\
    .cols().unnest(["two strings"], n= 3, mark = "-")\
    .cols().filter("two.*", regex = True)\
    .show()

+-------------+-------------+-------------+-------------+
|  two strings|two strings_0|two strings_1|two strings_2|
+-------------+-------------+-------------+-------------+
|      cat-car|          cat|          car|         null|
|       dog-tv|          dog|           tv|         null|
|eagle-tv-plus|        eagle|           tv|         plus|
|      lion-pc|         lion|           pc|         null|
+-------------+-------------+-------------+-------------+



In [14]:
from pyspark.ml.linalg import Vectors

df1 = op.sc.parallelize([
    ("assert", Vectors.dense([1, 2, 3])),
    ("require", Vectors.sparse(3, {1: 2}))
]).toDF(["word", "vector"])


df1\
    .cols().unnest(["vector"])\
    .show()
    

+-------+-------------+---+---+---+
|   word|       vector| _3| _4| _5|
+-------+-------------+---+---+---+
| assert|[1.0,2.0,3.0]|1.0|2.0|3.0|
|require|(3,[1],[2.0])|0.0|2.0|0.0|
+-------+-------------+---+---+---+

