In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from optimus import Optimus

In [3]:
# Create optimus
op = Optimus()


             ____        __  _                     
            / __ \____  / /_(_)___ ___  __  _______
           / / / / __ \/ __/ / __ `__ \/ / / / ___/
          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
              /_/                                  
              
Just checking that all necessary environments vars are present...
-----
PYSPARK_PYTHON=python
SPARK_HOME=C:\opt\spark\spark-2.3.1-bin-hadoop2.7
JAVA_HOME=C:\java8
-----
Starting or getting SparkSession and SparkContext...
Setting checkpoint folder ( local ). If you are in a cluster initialize optimus with master='your_ip' as param
Deleting previous folder if exists...
Creating the checkpoint directory...
Optimus successfully imported. Have fun :).


## Create dataframe
### Spark

This is ugly:

```
val someData = Seq(
  Row(8, "bat"),
  Row(64, "mouse"),
  Row(-27, "horse")
)

val someSchema = List(
  StructField("number", IntegerType, true),
  StructField("word", StringType, true)
)

val someDF = spark.createDataFrame(
  spark.sparkContext.parallelize(someData),
  StructType(someSchema)
)```

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType

df = op.create.df(
            [
                ("words", "str", True),
                ("num", "int", True),
                ("animals", "str", True),
                ("thing", StringType(), True),
                ("two strings", StringType(), True),
                ("filter", StringType(), True),
                ("num 2", "string", True)

            ]
,
[
                ("  I like     fish  ", 1, "dog", "housé", "cat-car", "a","1"),
                ("    zombies", 2, "cat", "tv", "dog-tv", "b","2"),
                ("simpsons   cat lady", 2, "frog", "table","eagle-tv-plus","1","3"),
                (None, 3, "eagle", "glass", "lion-pc", "c","4")
            ])

df.show()

+-------------------+---+-------+-----+-------------+------+-----+
|              words|num|animals|thing|  two strings|filter|num 2|
+-------------------+---+-------+-----+-------------+------+-----+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|
+-------------------+---+-------+-----+-------------+------+-----+



## Create Columns
### Spark
* You can not create multiple columns at the same time
* You need to use the lit function. lit???

### Pandas
* Assing function seems to do the job https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas


In [5]:
df = df.cols().append("new_col_1", 1)
df.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [6]:
from pyspark.sql.functions import *

df.cols().append([
    ("new_col_2", 2.22),
    ("new_col_3", lit(3)),
    ("new_col_4", "test"),
    ("new_col_5", df['num']*2)
    ]).show()



+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|new_col_2|new_col_3|new_col_4|new_col_5|
+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|     2.22|        3|     test|        2|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|     2.22|        3|     test|        4|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|     2.22|        3|     test|        4|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|     2.22|        3|     test|        6|
+-------------------+---+-------+-----+-------------+------+-----+---------+---------+---------+---------+---------+



## Select columns
### Spark
* You can not select columns by string and index at the same time

### Pandas
* You can not select columns by string and index at the same time

In [7]:
columns = ["words", 1, "animals", 3, 0]
df.cols().filter(columns).show()

['words', 'num', 'animals', 'thing', 'words']
+-------------------+---+-------+-----+-------------------+
|              words|num|animals|thing|              words|
+-------------------+---+-------+-----+-------------------+
|  I like     fish  |  1|    dog|housé|  I like     fish  |
|            zombies|  2|    cat|   tv|            zombies|
|simpsons   cat lady|  2|   frog|table|simpsons   cat lady|
|               null|  3|  eagle|glass|               null|
+-------------------+---+-------+-----+-------------------+



In [8]:
df.cols().filter("n.*", regex = True).show()

['num', 'num 2', 'new_col_1']
+---+-----+---------+
|num|num 2|new_col_1|
+---+-----+---------+
|  1|    1|        1|
|  2|    2|        1|
|  2|    3|        1|
|  3|    4|        1|
+---+-----+---------+



## Rename Column
### Spark
You can not rename multiple columns using Spark Vanilla API


### Pandas
* Almost the same behavior https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html

In [9]:
df.cols().rename([('num','number')]).show()

+-------------------+------+-------+-----+-------------+------+-----+---------+
|              words|number|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+------+-------+-----+-------------+------+-----+---------+
|  I like     fish  |     1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|     2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|     2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|     3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+------+-------+-----+-------------+------+-----+---------+



In [10]:
df.cols().rename(func = str.lower).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [11]:
df.cols().rename(func = str.upper).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              WORDS|NUM|ANIMALS|THING|  TWO STRINGS|FILTER|NUM 2|NEW_COL_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



## Cast a columns

### Spark
* Can not cast multiple columns

### Pandas
This is a opinionated way to handle column casting. 
One of the first thing that every data cleaning process need to acomplish is define a data dictionary.
Because of that we prefer to create a tuple like this:

df.cols().cast(
[("words","str"),
("num","int"),
("animals","float"),
("thing","str")]
)

instead of pandas

pd.Series([1], dtype='int32')
pd.Series([2], dtype='string')

https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.astype.html

In [12]:
df.cols().cast([("num", "string"),("num 2", "integer")])

['num', 'num 2']


DataFrame[words: string, num: string, animals: string, thing: string, two strings: string, filter: string, num 2: int, new_col_1: int]

## Keep columns
### Spark
* You can not remove multiple columns

### Pandas
* Handle in pandas with drop


In [13]:
from pyspark.sql.functions import *
df.withColumn("num", col("num").cast(StringType()))


DataFrame[words: string, num: string, animals: string, thing: string, two strings: string, filter: string, num 2: string, new_col_1: int]

In [14]:
df.cols().keep("num").show()

['num']
+---+
|num|
+---+
|  1|
|  2|
|  2|
|  3|
+---+



## Move columns
### Spark
Do not exist in spark

### Pandas
Do not exist in pandas

In [15]:
df.cols().move("words", "thing", "after").show()

['words']
['thing']
+---+-------+-----+-------------------+-------------+------+-----+---------+
|num|animals|thing|              words|  two strings|filter|num 2|new_col_1|
+---+-------+-----+-------------------+-------------+------+-----+---------+
|  1|    dog|housé|  I like     fish  |      cat-car|     a|    1|        1|
|  2|    cat|   tv|            zombies|       dog-tv|     b|    2|        1|
|  2|   frog|table|simpsons   cat lady|eagle-tv-plus|     1|    3|        1|
|  3|  eagle|glass|               null|      lion-pc|     c|    4|        1|
+---+-------+-----+-------------------+-------------+------+-----+---------+



## Sorting Columns
### Spark
You can not sort columns using Spark Vanilla API 

### Pandas
Similar to pandas
http://pandas.pydata.org/pandas-docs/version/0.19/generated/pandas.DataFrame.sort_values.html#pandas.DataFrame.sort_values

In [16]:
df.cols().sort().show()

+-------+------+---------+---+-----+-----+-------------+-------------------+
|animals|filter|new_col_1|num|num 2|thing|  two strings|              words|
+-------+------+---------+---+-----+-----+-------------+-------------------+
|    dog|     a|        1|  1|    1|housé|      cat-car|  I like     fish  |
|    cat|     b|        1|  2|    2|   tv|       dog-tv|            zombies|
|   frog|     1|        1|  2|    3|table|eagle-tv-plus|simpsons   cat lady|
|  eagle|     c|        1|  3|    4|glass|      lion-pc|               null|
+-------+------+---------+---+-----+-----+-------------+-------------------+



In [17]:
df.cols().sort(reverse = True).show()

+-------------------+-------------+-----+-----+---+---------+------+-------+
|              words|  two strings|thing|num 2|num|new_col_1|filter|animals|
+-------------------+-------------+-----+-----+---+---------+------+-------+
|  I like     fish  |      cat-car|housé|    1|  1|        1|     a|    dog|
|            zombies|       dog-tv|   tv|    2|  2|        1|     b|    cat|
|simpsons   cat lady|eagle-tv-plus|table|    3|  2|        1|     1|   frog|
|               null|      lion-pc|glass|    4|  3|        1|     c|  eagle|
+-------------------+-------------+-----+-----+---+---------+------+-------+



## Drop columns
### Spark 
* You can not delete multiple colums

### Pandas
* Almost the same as pandas
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [18]:
df2 = df.cols().drop("num")
df2 = df.cols().drop(["num","words"])
df2.show()

['num']
['num', 'words']
+-------+-----+-------------+------+-----+---------+
|animals|thing|  two strings|filter|num 2|new_col_1|
+-------+-----+-------------+------+-----+---------+
|    dog|housé|      cat-car|     a|    1|        1|
|    cat|   tv|       dog-tv|     b|    2|        1|
|   frog|table|eagle-tv-plus|     1|    3|        1|
|  eagle|glass|      lion-pc|     c|    4|        1|
+-------+-----+-------------+------+-----+---------+



## Chaining

cols y rows functions are used to organize and encapsulate optimus' functionality apart of Apache Spark Dataframe API. This have a disadvantage at chaining time because we need to user invoke cols or rows in every step.

At the same time it can be helpfull when you look at the code because every line is self explained.

In [20]:
df\
    .cols().rename([('num','number')])\
    .cols().drop(["number","words"])\
    .withColumn("new_col_2", lit("spongebob"))\
    .cols().append("new_col_1", 1)\
    .cols().sort(reverse= True)\
    .show()

+-------------+-----+-----+---------+---------+------+-------+
|  two strings|thing|num 2|new_col_2|new_col_1|filter|animals|
+-------------+-----+-----+---------+---------+------+-------+
|      cat-car|housé|    1|spongebob|        1|     a|    dog|
|       dog-tv|   tv|    2|spongebob|        1|     b|    cat|
|eagle-tv-plus|table|    3|spongebob|        1|     1|   frog|
|      lion-pc|glass|    4|spongebob|        1|     c|  eagle|
+-------------+-----+-----+---------+---------+------+-------+



## Split Columns
### Spark

### Pandas

In [21]:
df.cols().split("two strings","-", n=3).show()

AssertionError: Error: get param must be an integer

In [24]:
df.cols().split("two strings","-", get = 1).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+-----+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|COL_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+-----+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|  car|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|   tv|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|   tv|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|   pc|
+-------------------+---+-------+-----+-------------+------+-----+---------+-----+



## Impute
### Spark

In [25]:
df1 =df.cols().cast([("num","double"),("num 2", "double")])

df1.cols().impute(["num","num 2"], ["out_a","out_B"], strategy="mean")
df1.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |1.0|    dog|housé|      cat-car|     a|  1.0|        1|
|            zombies|2.0|    cat|   tv|       dog-tv|     b|  2.0|        1|
|simpsons   cat lady|2.0|   frog|table|eagle-tv-plus|     1|  3.0|        1|
|               null|3.0|  eagle|glass|      lion-pc|     c|  4.0|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



## Get columns by type
### Spark

### Pandas

In [38]:
df.cols().filter_by_type("int").show()

+---+---------+
|num|new_col_1|
+---+---------+
|  1|        1|
|  2|        1|
|  2|        1|
|  3|        1|
+---+---------+



## Apply custom function
### Spark
You need to declare a UDF Spark function

### Pandas
Almost the same behavior

In [27]:
def func(val, attr):
    return str(int(val) + 1 )

df.cols().apply( ["num","num 2"], func, "udf").show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  2|    dog|housé|      cat-car|     a|    2|        1|
|            zombies|  3|    cat|   tv|       dog-tv|     b|    3|        1|
|simpsons   cat lady|  3|   frog|table|eagle-tv-plus|     1|    4|        1|
|               null|  4|  eagle|glass|      lion-pc|     c|    5|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [28]:
from pyspark.sql import functions as F
def func(col_name, attr):
    return F.when(F.col(col_name)>0 ,2)

df.cols().apply( ["num","num 2"], func).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  2|    dog|housé|      cat-car|     a|    2|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    2|        1|
|               null|  2|  eagle|glass|      lion-pc|     c|    2|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [29]:
from pyspark.sql import functions as F
def func(col_name, attr):
    return F.upper(F.col(col_name))

df.cols().apply(["two strings","animals"], func).show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    DOG|housé|      CAT-CAR|     a|    1|        1|
|            zombies|  2|    CAT|   tv|       DOG-TV|     b|    2|        1|
|simpsons   cat lady|  2|   FROG|table|EAGLE-TV-PLUS|     1|    3|        1|
|               null|  3|  EAGLE|glass|      LION-PC|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



## Count Nulls
### Spark

### Pandas

In [30]:
import numpy as np

df_null = op.get_ss().createDataFrame(
    [(1, 1, None), (1, 2, float(5)), (1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')), (1, 6, float('nan'))],
    ('session', "timestamp1", "id2"))

In [31]:
df_null.cols().count_na("id2")

{'id2': 5}

In [32]:
df_null.cols().count_na("*")

{'session': 0, 'timestamp1': 0, 'id2': 5}

## Count uniques
### Spark

### Pandas


In [33]:
df.cols().count_uniques("*")

{'words': 4,
 'num': 3,
 'animals': 4,
 'thing': 4,
 'two strings': 4,
 'filter': 4,
 'num 2': 4,
 'new_col_1': 1}

## Unique
### Spark
An abstraction of distinct to be use in multiple columns at the same time

### Pandas
Similar behavior than pandas

In [34]:
df.show()

+-------------------+---+-------+-----+-------------+------+-----+---------+
|              words|num|animals|thing|  two strings|filter|num 2|new_col_1|
+-------------------+---+-------+-----+-------------+------+-----+---------+
|  I like     fish  |  1|    dog|housé|      cat-car|     a|    1|        1|
|            zombies|  2|    cat|   tv|       dog-tv|     b|    2|        1|
|simpsons   cat lady|  2|   frog|table|eagle-tv-plus|     1|    3|        1|
|               null|  3|  eagle|glass|      lion-pc|     c|    4|        1|
+-------------------+---+-------+-----+-------------+------+-----+---------+



In [35]:
df_distinct = op.create.df(
            [
                ("words", "str", True),
                ("num", "int", True)
            ],
[
                ("  I like     fish  ", 1),
                ("    zombies", 2),
                ("simpsons   cat lady", 2),
                (None, 3),
                  (None, 0)
            ])

In [36]:
df_distinct\
    .select("num")\
    .cols().unique().show()

+---+
|num|
+---+
|  1|
|  3|
|  2|
|  0|
+---+



## Count Zeros

In [37]:
df_zeros = df_distinct
df_zeros.show()
df_zeros.cols().count_zeros("*")

+-------------------+---+
|              words|num|
+-------------------+---+
|  I like     fish  |  1|
|            zombies|  2|
|simpsons   cat lady|  2|
|               null|  3|
|               null|  0|
+-------------------+---+



{'words': 0, 'num': 1}

## Pandas comparision
Pandas vs Spark
https://www.analyticsvidhya.com/blog/2016/10/spark-dataframe-and-operations/