In [1]:
# Load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Put the parent directory on the import path (presumably so the `optimus`
# import in the next cell resolves to the local checkout -- confirm).
# The membership check keeps this cell idempotent: re-running it no longer
# appends another duplicate ".." entry to sys.path each time.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
from optimus import Optimus

from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType

In [4]:
# Instantiate Optimus; `op` is the handle used below to build the example
# DataFrame (via op.create.df).
op = Optimus()

In [6]:
# Column specs, one tuple per column: (name, dtype, flag). The dtype is given
# either as a short string or as a Spark type object; the third element is
# presumably the nullable flag -- confirm against the Optimus docs.
sample_schema = [
    ("words", "str", True),
    ("num", "int", True),
    ("animals", "str", True),
    ("thing", StringType(), True),
    ("second", "int", True),
    ("filter", StringType(), True),
]

# One tuple per row, values listed in the same order as the column specs.
# The odd whitespace and the None are part of the sample data.
sample_rows = [
    ("  I like     fish  ", 1, "dog dog", "housé", 5, "a"),
    ("    zombies", 2, "cat", "tv", 6, "b"),
    ("simpsons   cat lady", 2, "frog", "table", 7, "1"),
    (None, 3, "eagle", "glass", 8, "c"),
]

df = op.create.df(sample_schema, sample_rows)

df.show()

+-------------------+---+-------+-----+------+------+
|              words|num|animals|thing|second|filter|
+-------------------+---+-------+-----+------+------+
|  I like     fish  |  1|dog dog|housé|     5|     a|
|            zombies|  2|    cat|   tv|     6|     b|
|simpsons   cat lady|  2|   frog|table|     7|     1|
|               null|  3|  eagle|glass|     8|     c|
+-------------------+---+-------+-----+------+------+



In [6]:
# Inspect the (column name, dtype) pairs of the example DataFrame.
df.dtypes

[('words', 'string'),
 ('num', 'int'),
 ('animals', 'string'),
 ('thing', 'string'),
 ('second', 'int'),
 ('filter', 'string')]

## Append row
### Spark
Not available in Spark. You need to create a new dataframe containing the row and then union it with the original to append a row.

In [8]:
# Append a single row, given as a list of values in column order, and render
# the result. `df` is not reassigned here, so the following cells still work
# on the original four rows.
df.rows.append(["this is a word",2, "this is an animal", "this is a thing", 64, "this is a filter"]).table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b
simpsons⸱⸱⸱cat⸱lady,2,frog,table,7,1
,3,eagle,glass,8,c
this⸱is⸱a⸱word,2,this⸱is⸱an⸱animal,this⸱is⸱a⸱thing,64,this⸱is⸱a⸱filter


## Sort

### Sort rows by a column in descending order (this is the default order)

In [13]:
# Sort rows by `animals` without naming an order; the rendered result comes
# out in descending order.
df.rows.sort("animals").table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
simpsons⸱⸱⸱cat⸱lady,2,frog,table,7,1
,3,eagle,glass,8,c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b


In [14]:
# Sort rows by `animals` with the order spelled out explicitly as "desc".
df.rows.sort("animals", "desc").table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
simpsons⸱⸱⸱cat⸱lady,2,frog,table,7,1
,3,eagle,glass,8,c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b


### Sort by multiple columns

In [16]:
# Sort by several columns at once: pass a list of (column, order) tuples.
df.rows.sort([("animals","desc"),("thing","asc")]).table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
simpsons⸱⸱⸱cat⸱lady,2,frog,table,7,1
,3,eagle,glass,8,c
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b


In [21]:
# Scratch check: print the column name stored in the first slot of each
# (column, order) pair of a multi-column sort spec.
a = [("animals", "desc"), ("thing", "asc")]
for column_name, _order in a:
    print(column_name)

animals
thing


## Select

In [18]:
# Keep only the rows matching a boolean column expression (here: num == 1).
df.rows.select(df["num"]==1).table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a


## Select by type
### Spark 
Not available in Spark Vanilla.

In [20]:
# Keep only the rows whose `filter` value looks like an integer. The column
# itself is typed string, so the check appears to be made on each cell's
# content rather than on the column dtype -- TODO confirm in the Optimus docs.
df.rows.select_by_dtypes("filter", "integer").table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
simpsons⸱⸱⸱cat⸱lady,2,frog,table,7,1


## Drop 
### Spark
Dropping rows is not available in Spark Vanilla.

In [23]:
# Drop every row matching the condition: num == 2 OR second == 5.
df.rows.drop((df["num"]==2) | (df["second"]==5)).table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
,3,eagle,glass,8,c


In [24]:
# Drop the rows whose `filter` value looks like an integer, using the short
# dtype name "int".
df.rows.drop_by_dtypes("filter", "int").table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b
,3,eagle,glass,8,c


### Drop by type

In [13]:
# Drop the rows whose `filter` value looks like an integer, with the dtype
# spelled "integer"; the rendered result is the same as with "int".
df.rows.drop_by_dtypes("filter", "integer").table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
⸱⸱⸱⸱zombies,2,cat,tv,6,b
,3,eagle,glass,8,c


### Drop using an abstract UDF

In [26]:
from optimus.functions import abstract_udf as audf

def func_data_type(value, attr):
    """Tell whether ``value`` is strictly greater than 1.

    ``attr`` is part of the signature but is never read by the body.
    """
    is_above_one = value > 1
    return is_above_one


# Wrap func_data_type as a UDF over the `num` column ("boolean" is presumably
# the declared return type -- confirm) and drop the rows for which it is true,
# i.e. the rows with num > 1.
df.rows.drop(audf("num", func_data_type, "boolean")).table()

words  (string),num  (int),animals  (string),thing  (string),second  (int),filter  (string)
⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱,1,dog⸱dog,housé,5,a
