diff --git a/examples/new-api-column.ipynb b/examples/new-api-column.ipynb index cb139372e..ecfc6cc24 100644 --- a/examples/new-api-column.ipynb +++ b/examples/new-api-column.ipynb @@ -1,12 +1,5 @@ { "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", "metadata": {}, @@ -54,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -71,88 +64,142 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -323,7 +370,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 9 columns
\n" + "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -382,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -391,87 +439,143 @@ "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -658,7 +762,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -682,106 +787,166 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1000,7 +1165,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "
new_col_2
\n", "
11 (double)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "
new_col_3
\n", "
12 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 12 columns
\n" + "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -1028,112 +1194,176 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1368,7 +1598,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "
new_col_4
\n", "
11 (string)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "
new_col_5
\n", "
12 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_6
\n", "
13 (array<int>)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 13 columns
\n" + "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -1402,94 +1633,150 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1676,7 +1963,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -1689,51 +1977,85 @@ "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 4 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1824,7 +2146,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 4 columns
\n" + "
Viewing 4 of 4 rows / 4 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -1849,52 +2172,80 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 3 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -1969,7 +2320,8 @@ " \n", "
\n", "
num
\n", "
1 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
2 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
3 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 3 columns
\n" + "
Viewing 4 of 4 rows / 3 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -1992,70 +2344,112 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 6 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2066,19 +2460,19 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2094,19 +2488,19 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2122,19 +2516,19 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2150,19 +2544,19 @@ " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2178,7 +2572,8 @@ " \n", "
\n", - "
filter
\n", + "
num 2
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - "
two strings
\n", + "
filter
\n", "
2 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - "
thing
\n", + "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - "
animals
\n", + "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
words
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - "
num 2
\n", + "
two strings
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", - " a\n", + " 1\n", " \n", - " cat-car\n", + " a\n", " \n", - " housé\n", + " dog\n", " \n", - " dog\n", + " housé\n", " \n", @@ -2086,7 +2480,7 @@ " \n", - " 1\n", + " cat-car\n", "
\n", - " b\n", + " 2\n", " \n", - " dog-tv\n", + " b\n", " \n", - " tv\n", + " cat\n", " \n", - " cat\n", + " tv\n", " \n", @@ -2114,7 +2508,7 @@ " \n", - " 2\n", + " dog-tv\n", "
\n", - " 1\n", + " 3\n", " \n", - " eagle-tv-plus\n", + " 1\n", " \n", - " table\n", + " frog\n", " \n", - " frog\n", + " table\n", " \n", @@ -2142,7 +2536,7 @@ " \n", - " 3\n", + " eagle-tv-plus\n", "
\n", - " c\n", + " 4\n", " \n", - " lion-pc\n", + " c\n", " \n", - " glass\n", + " eagle\n", " \n", - " eagle\n", + " glass\n", " \n", @@ -2170,7 +2564,7 @@ " \n", - " 4\n", + " lion-pc\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 6 columns
\n" + "
Viewing 4 of 4 rows / 6 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -2207,94 +2602,150 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2481,7 +2932,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
number
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -2504,94 +2956,150 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -2778,7 +3286,8 @@ " \n", "
\n", "
WORDS
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NUM
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
ANIMALS
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
THING
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
TWO STRINGS
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
FILTER
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NUM 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
COL_ARRAY
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
COL_INT
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NEW_COL_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -2801,94 +3310,150 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3075,7 +3640,8 @@ " \n", "
\n", "
words
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
animals
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
thing
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
two strings
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
filter
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
num 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_array
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
col_int
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
new_col_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -3098,94 +3664,150 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -3372,7 +3994,8 @@ " \n", "
\n", "
WORDS
\n", "
1 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NUM
\n", "
2 (int)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
ANIMALS
\n", "
3 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
THING
\n", "
4 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
TWO STRINGS
\n", "
5 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
FILTER
\n", "
6 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NUM 2
\n", "
7 (string)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
COL_ARRAY
\n", "
8 (array<string>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
COL_INT
\n", "
9 (array<int>)
\n", - "\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", "
\n", "
NEW_COL_1
\n", "
10 (int)
\n", - "\n", + "
\n", + " \n", + "
\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" ], "text/plain": [ "" @@ -3414,7 +4037,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -3432,7 +4055,7 @@ " ('new_col_1', 'int')]" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -3450,7 +4073,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -3468,7 +4091,7 @@ " ('new_col_1', 'int')]" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -3486,7 +4109,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -3504,7 +4127,7 @@ " ('new_col_1', 'string')]" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -3522,23 +4145,16 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 20, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, { "data": { "text/plain": [ "DataFrame[words: string, num: int, animals: string, thing: string, two strings: string, filter: string, num 2: string, col_array: array, col_int: vector, new_col_1: int]" ] }, - "execution_count": 53, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -3563,9 +4179,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "DataFrame[words: string, num: string, animals: string, thing: string, two strings: string, filter: string, num 2: string, col_array: array, col_int: array, new_col_1: int]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pyspark.sql.functions import *\n", "df.withColumn(\"num\", col(\"num\").cast(StringType()))\n" @@ -3573,1189 +4200,23297 @@ }, { "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.table()\n", - "df.cols.keep(\"num\").show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Move columns\n", - "### Spark\n", - "Do not exist in spark\n", - "\n", - "### Pandas\n", - "Do not exist in pandas" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.move(\"words\", \"after\", \"thing\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Sorting Columns\n", - "### Spark\n", - "You can not sort columns using Spark Vanilla API \n", - "\n", - "### Pandas\n", - "df.reindex_axis(sorted(df.columns), axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort in Alphabetical order" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.sort().table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Sort in Reverse Alphabetical order" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.sort(order = \"desc\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Drop columns\n", - "### Spark \n", - "* You can not delete multiple colums\n", - "\n", - "### Pandas\n", - "* Almost the same as pandas\n", - "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Drop one columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df2 = df.cols.drop(\"num\")\n", - "df2.table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Drop multiple columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - 
"metadata": {}, - "outputs": [], - "source": [ - "df2 = df.cols.drop([\"num\",\"words\"])\n", - "df2.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Chaining\n", - "\n", - ".cols y .rows attributes are used to organize and encapsulate `optimus` functionality apart from Apache Spark Dataframe API.\n", - "\n", - "At the same time it can be helpfull when you look at the code because every line is self explained.\n", - "\n", - "The past transformations were done step by step, but this can be achieved by chaining all operations into one line of code, like the cell below. This way is much more efficient and scalable because it uses all optimization issues from the lazy evaluation approach." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.table()\n", - "df\\\n", - " .cols.rename([('num','number')])\\\n", - " .cols.drop([\"number\",\"words\"])\\\n", - " .withColumn(\"new_col_2\", lit(\"spongebob\"))\\\n", - " .cols.append(\"new_col_1\", 1)\\\n", - " .cols.sort(order= \"desc\")\\\n", - " .rows.drop(df[\"num 2\"] == 3)\\\n", - " .table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Unnest Columns\n", - "\n", - "With unnest you can convert one column into multiple ones. it can hadle string, array and vectors\n", - "\n", - "### Spark\n", - "Can split strings with split()\n", - "\n", - "### Pandas\n", - "via str.split()" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], - "source": [ - "df.table()\n", - "df.cols.unnest(\"two strings\",\"-\")\\\n", - " .table()" - ] - }, - { + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---+\n", + "|num|\n", + "+---+\n", + "| 1|\n", + "| 2|\n", + "| 2|\n", + "| 3|\n", + "+---+\n", + "\n" + ] + } + ], + "source": [ + "df.table()\n", + "df.cols.keep(\"num\").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Move columns\n", + "### Spark\n", + "Do not exist in spark\n", + "\n", + "### Pandas\n", + "Do not exist in pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
num
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
words
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " None\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.move(\"words\", \"after\", \"thing\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sorting Columns\n", + "### Spark\n", + "You can not sort columns using Spark Vanilla API \n", + "\n", + "### Pandas\n", + "df.reindex_axis(sorted(df.columns), axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sort in Alphabetical order" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
animals
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
2 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
3 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
5 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
words
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " dog\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + "
\n", + " cat\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " b\n", + " \n", + " 1\n", + " \n", + " 2\n", + " \n", + " 2\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " ⸱⸱⸱⸱zombies\n", + "
\n", + " frog\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " 2\n", + " \n", + " 3\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " simpsons⸱⸱⸱cat⸱lady\n", + "
\n", + " eagle\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " c\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " 4\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " None\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.sort().table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sort in Reverse Alphabetical order" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
5 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
6 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
8 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
9 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
10 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " cat-car\n", + " \n", + " housé\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " a\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " dog\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " dog-tv\n", + " \n", + " tv\n", + " \n", + " 2\n", + " \n", + " 2\n", + " \n", + " 1\n", + " \n", + " b\n", + " \n", + " [3,⸱4]\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " cat\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " eagle-tv-plus\n", + " \n", + " table\n", + " \n", + " 3\n", + " \n", + " 2\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " frog\n", + "
\n", + " None\n", + " \n", + " lion-pc\n", + " \n", + " glass\n", + " \n", + " 4\n", + " \n", + " 3\n", + " \n", + " 1\n", + " \n", + " c\n", + " \n", + " [7,⸱8]\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " eagle\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.sort(order = \"desc\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Drop columns\n", + "### Spark \n", + "* You can not delete multiple colums\n", + "\n", + "### Pandas\n", + "* Almost the same as pandas\n", + "https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Drop one columns" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
7 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
8 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
9 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df2 = df.cols.drop(\"num\")\n", + "df2.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Drop multiple columns" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 8 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
animals
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
6 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
7 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
8 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 8 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df2 = df.cols.drop([\"num\",\"words\"])\n", + "df2.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Chaining\n", + "\n", + ".cols y .rows attributes are used to organize and encapsulate `optimus` functionality apart from Apache Spark Dataframe API.\n", + "\n", + "At the same time it can be helpfull when you look at the code because every line is self explained.\n", + "\n", + "The past transformations were done step by step, but this can be achieved by chaining all operations into one line of code, like the cell below. This way is much more efficient and scalable because it uses all optimization issues from the lazy evaluation approach." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 3 of 3 rows / 9 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
two strings
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_2
\n", + "
4 (string)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
5 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
7 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " cat-car\n", + " \n", + " housé\n", + " \n", + " 1\n", + " \n", + " spongebob\n", + " \n", + " 1\n", + " \n", + " a\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " dog\n", + "
\n", + " dog-tv\n", + " \n", + " tv\n", + " \n", + " 2\n", + " \n", + " spongebob\n", + " \n", + " 1\n", + " \n", + " b\n", + " \n", + " [3,⸱4]\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " cat\n", + "
\n", + " lion-pc\n", + " \n", + " glass\n", + " \n", + " 4\n", + " \n", + " spongebob\n", + " \n", + " 1\n", + " \n", + " c\n", + " \n", + " [7,⸱8]\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " eagle\n", + "
\n", + "\n", + "
Viewing 3 of 3 rows / 9 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()\n", + "df\\\n", + " .cols.rename([('num','number')])\\\n", + " .cols.drop([\"number\",\"words\"])\\\n", + " .withColumn(\"new_col_2\", lit(\"spongebob\"))\\\n", + " .cols.append(\"new_col_1\", 1)\\\n", + " .cols.sort(order= \"desc\")\\\n", + " .rows.drop(df[\"num 2\"] == 3)\\\n", + " .table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unnest Columns\n", + "\n", + "With unnest you can convert one column into multiple ones. it can hadle string, array and vectors\n", + "\n", + "### Spark\n", + "Can split strings with split()\n", + "\n", + "### Pandas\n", + "via str.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
two strings_0
\n", + "
11 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings_1
\n", + "
12 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " cat\n", + " \n", + " car\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " tv\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " eagle\n", + " \n", + " tv\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " lion\n", + " \n", + " pc\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()\n", + "df.cols.unnest(\"two strings\",\"-\")\\\n", + " .table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Only get the first element" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
two strings_1
\n", + "
11 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " car\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " tv\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " tv\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " pc\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.unnest(\"two strings\",\"-\", index = 1).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unnest array of string" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
col_array_0
\n", + "
11 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array_1
\n", + "
12 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " baby\n", + " \n", + " sorry\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " baby⸱1\n", + " \n", + " sorry⸱1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " baby⸱2\n", + " \n", + " sorry⸱2\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " baby⸱3\n", + " \n", + " sorry⸱3\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 12 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df\\\n", + " .cols.unnest([\"col_array\"])\\\n", + " .table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unnest an array of ints" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
col_int_0
\n", + "
11 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int_1
\n", + "
12 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int_2
\n", + "
13 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " 1\n", + " \n", + " 2\n", + " \n", + " 3\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " 4\n", + " \n", + " None\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " 5\n", + " \n", + " 6\n", + " \n", + " 7\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " 7\n", + " \n", + " 8\n", + " \n", + " None\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df\\\n", + " .cols.unnest([\"col_int\"])\\\n", + " .table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split into 3 parts" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
two strings_0
\n", + "
11 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings_1
\n", + "
12 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings_2
\n", + "
13 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " cat\n", + " \n", + " car\n", + " \n", + " None\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " tv\n", + " \n", + " None\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " eagle\n", + " \n", + " tv\n", + " \n", + " plus\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " lion\n", + " \n", + " pc\n", + " \n", + " None\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 13 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df\\\n", + " .cols.unnest([\"two strings\"], n= 3, mark = \"-\")\\\n", + " .table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unnest a Vector" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "from pyspark.ml.linalg import Vectors\n", + "\n", + "df1 = op.sc.parallelize([\n", + " (\"assert\", Vectors.dense([1, 2, 3])),\n", + " (\"require\", Vectors.sparse(3, {1: 2}))\n", + "]).toDF([\"word\", \"vector\"]) " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 2 of 2 rows / 5 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
word
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
vector
\n", + "
2 (vector)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
_3
\n", + "
3 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
_4
\n", + "
4 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
_5
\n", + "
5 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " assert\n", + " \n", + " [1.0,2.0,3.0]\n", + " \n", + " 1.0\n", + " \n", + " 2.0\n", + " \n", + " 3.0\n", + "
\n", + " require\n", + " \n", + " (3,[1],[2.0])\n", + " \n", + " 0.0\n", + " \n", + " 2.0\n", + " \n", + " 0.0\n", + "
\n", + "\n", + "
Viewing 2 of 2 rows / 5 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df1\\\n", + " .cols.unnest([\"vector\"])\\\n", + " .table()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.cols.append(\"new_col_1\", 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Impute" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fill missing data" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 5 of 5 rows / 4 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
a
\n", + "
1 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
b
\n", + "
2 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
out_a
\n", + "
3 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
out_b
\n", + "
4 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1.0\n", + " \n", + " nan\n", + " \n", + " 1.0\n", + " \n", + " 4.0\n", + "
\n", + " 2.0\n", + " \n", + " nan\n", + " \n", + " 2.0\n", + " \n", + " 4.0\n", + "
\n", + " nan\n", + " \n", + " 3.0\n", + " \n", + " 2.0\n", + " \n", + " 3.0\n", + "
\n", + " 4.0\n", + " \n", + " 4.0\n", + " \n", + " 4.0\n", + " \n", + " 4.0\n", + "
\n", + " 5.0\n", + " \n", + " 5.0\n", + " \n", + " 5.0\n", + " \n", + " 5.0\n", + "
\n", + "\n", + "
Viewing 5 of 5 rows / 4 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_fill = op.spark.createDataFrame([(1.0, float(\"nan\")), (2.0, float(\"nan\")), \n", + " (float(\"nan\"), 3.0), (4.0, 4.0), (5.0, 5.0)], [\"a\", \"b\"])\n", + "\n", + "imputer = df_fill.cols.impute([\"a\", \"b\"], [\"out_a\", \"out_b\"], \"median\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get columns by type\n", + "### Spark\n", + "Not implemented in Spark Vanilla\n", + "\n", + "### Pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 2 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
new_col_1
\n", + "
1 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " 1\n", + "
\n", + " 1\n", + " \n", + " 2\n", + "
\n", + " 1\n", + " \n", + " 2\n", + "
\n", + " 1\n", + " \n", + " 3\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 2 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.select_by_dtypes(\"int\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Apply custom function\n", + "\n", + "Spark has a few ways to transform data: RDDs, column expressions, UDFs and Pandas UDFs. apply() and apply_expr() provide a consistent way to call these without knowing the implementation details.\n", + "\n", + "### Spark\n", + "You need to declare a UDF Spark function\n", + "\n", + "### Pandas\n", + "Almost the same behavior as Optimus" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a function that only applies to string values in column filter\n", + "\n", + "Sometimes a column contains, for example, numbers even though it is supposed to contain only words or letters. \n", + "\n", + "In order to solve this problem, the apply_by_dtypes() function can be used. \n", + "\n", + "In the next example we replace a number in a string column with \"new string\"" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " new⸱string\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def func(val, attr):\n", + " return attr\n", + "\n", + "df.cols.apply_by_dtypes(\"filter\", func, \"string\", \"new string\", data_type=\"integer\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a UDF function that adds a value (32 in this case) to two columns" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 33\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 33\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 34\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 33\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 34\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 33\n", + "
\n", + " None\n", + " \n", + " 35\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 33\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def func(val, attr):\n", + " return val + attr\n", + "\n", + "df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 32 ,\"udf\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create a Pandas UDF function that adds a value (10 in this case) to two columns" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 11\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 11\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 12\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 11\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 12\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 11\n", + "
\n", + " None\n", + " \n", + " 13\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 11\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def func(val, attr):\n", + " return val + attr\n", + "\n", + "df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 10).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select row where column \"filter\" is \"integer\"" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 1 of 1 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 1 of 1 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from optimus.functions import filter_row_by_data_type as fbdt\n", + "\n", + "df.rows.select(fbdt(\"filter\", \"integer\")).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an abstract UDF to filter rows where the value of column \"num\" > 1" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 3 of 3 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 3 of 3 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from optimus.functions import abstract_udf as audf \n", + "\n", + "def func(val, attr):\n", + " return val>1\n", + "\n", + "df.rows.select(audf(\"num\", func, \"boolean\")).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an abstract UDF (Pandas UDF) to pass two arguments to a function and apply a sum operation" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
num_sum
\n", + "
11 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " 31\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " 32\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " 32\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " 33\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from optimus.functions import abstract_udf as audf \n", + "\n", + "def func(val, attr):\n", + " return val+attr[0]+ attr[1]\n", + "\n", + "df.withColumn(\"num_sum\", audf (\"num\", func, \"int\", [10,20])).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply a column expression when the value of \"num\" or \"num 2\" is greater than 2" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 1\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 1\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 1\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 10\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 10\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 10\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import functions as F\n", + "def func(col_name, attr):\n", + " return F.when(F.col(col_name)>2 ,10).otherwise(1)\n", + "\n", + "df.cols.apply_expr([\"num\",\"num 2\"], func).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert to uppercase" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " DOG\n", + " \n", + " housé\n", + " \n", + " CAT-CAR\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " CAT\n", + " \n", + " tv\n", + " \n", + " DOG-TV\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " FROG\n", + " \n", + " table\n", + " \n", + " EAGLE-TV-PLUS\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " EAGLE\n", + " \n", + " glass\n", + " \n", + " LION-PC\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from pyspark.sql import functions as F\n", + "def func(col_name, attr):\n", + " return F.upper(F.col(col_name))\n", + "\n", + "df.cols.apply_expr([\"two strings\",\"animals\"], func).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using apply with a condition" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 10\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 10\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 10\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 10\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 10\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 10\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 10\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def func(val, attr):\n", + " return 10\n", + "\n", + "col = \"num\"\n", + "\n", + "df.cols.apply(col, func, \"int\", when= df[\"num\"]>1).table()\n", + "\n", + "df.cols.apply(col, func, \"int\", when= fbdt(col, \"int\")).table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count Nulls" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "df_null = op.spark.createDataFrame(\n", + " [(1, 1, None), (1, 2, float(5)), (1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')), (1, 6, float('nan'))],\n", + " ('session', \"timestamp1\", \"id2\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 7 of 7 rows / 3 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
session
\n", + "
1 (bigint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
timestamp1
\n", + "
2 (bigint)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
id2
\n", + "
3 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " 1\n", + " \n", + " None\n", + "
\n", + " 1\n", + " \n", + " 2\n", + " \n", + " 5.0\n", + "
\n", + " 1\n", + " \n", + " 3\n", + " \n", + " nan\n", + "
\n", + " 1\n", + " \n", + " 4\n", + " \n", + " None\n", + "
\n", + " 1\n", + " \n", + " 5\n", + " \n", + " 10.0\n", + "
\n", + " 1\n", + " \n", + " 6\n", + " \n", + " nan\n", + "
\n", + " 1\n", + " \n", + " 6\n", + " \n", + " nan\n", + "
\n", + "\n", + "
Viewing 7 of 7 rows / 3 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_null.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_null.cols.count_na(\"id2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'session': 0, 'timestamp1': 0, 'id2': 5}" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_null.cols.count_na(\"*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count uniques\n", + "### Spark\n", + "\n", + "### Pandas\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'words': {'approx_count_distinct': 3},\n", + " 'num': {'approx_count_distinct': 3},\n", + " 'animals': {'approx_count_distinct': 4},\n", + " 'thing': {'approx_count_distinct': 4},\n", + " 'two strings': {'approx_count_distinct': 4},\n", + " 'filter': {'approx_count_distinct': 4},\n", + " 'num 2': {'approx_count_distinct': 4},\n", + " 'col_array': {'approx_count_distinct': 3},\n", + " 'col_int': {'approx_count_distinct': 4},\n", + " 'new_col_1': {'approx_count_distinct': 1}}" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.cols.count_uniques(\"*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Unique\n", + "### Spark\n", + "An abstraction of distinct to be used on multiple columns at the same time\n", + "\n", + "### Pandas\n", + "Similar behavior to pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + 
"\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.table()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "df_distinct = op.create.df(\n", + " [\n", + " (\"words\", \"str\", True),\n", + " (\"num\", \"int\", True)\n", + " ],\n", + "[\n", + " (\" I like fish \", 1),\n", + " (\" zombies\", 2),\n", + " (\"simpsons cat lady\", 2),\n", + " (None, 3),\n", + " (None, 0)\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 1 columns
\n", + "
200 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
num
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + "
\n", + " 3\n", + "
\n", + " 2\n", + "
\n", + " 0\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 1 columns
\n", + "
200 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df_distinct.cols.unique(\"num\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count Zeros" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 5 of 5 rows / 2 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + "
\n", + " None\n", + " \n", + " 3\n", + "
\n", + " None\n", + " \n", + " 0\n", + "
\n", + "\n", + "
Viewing 5 of 5 rows / 2 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "{'words': 0, 'num': 1}" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_zeros = df_distinct\n", + "df_zeros.table()\n", + "df_zeros.cols.count_zeros(\"*\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Column Data Types" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'words': 'string',\n", + " 'num': 'int',\n", + " 'animals': 'string',\n", + " 'thing': 'string',\n", + " 'two strings': 'string',\n", + " 'filter': 'string',\n", + " 'num 2': 'string',\n", + " 'col_array': 'array',\n", + " 'col_int': 'array',\n", + " 'new_col_1': 'int'}" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.cols.dtype('*')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Replace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replace \"dog\",\"cat\" in column \"animals\" by the word \"animals\"" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " animals\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " animals\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace(\"animals\",[\"dog\",\"cat\"],\"animals\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replace \"dog-tv\", \"cat\", \"eagle\", \"fish\" in columns \"two strings\",\"animals\" by \"animals\"" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " animals\n", + " \n", + " tv\n", + " \n", + " animals\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " animals\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace([\"two strings\",\"animals\"], [\"dog-tv\", \"cat\", \"eagle\", \"fish\"], \"animals\").table()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replace \"dog\" by \"dog_1\" and \"cat\" by \"cat_1\" in columns \"animals\"" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog_1\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat_1\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace(\"animals\",[(\"dog\",\"dog_1\"),(\"cat\",\"cat_1\")]).table()" + ] + }, + { "cell_type": "markdown", "metadata": {}, "source": [ - "### Only get the first element" + "### Replace in column \"animals\", \"dog\" by \"animal\"" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " animal\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace(\"animals\",\"dog\",\"animal\").table()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.cols.replace('num',[\"3\",2], 10).table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.unnest(\"two strings\",\"-\", index = 1).table()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 66, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "### Unnest array of string" + "df.cols.replace('num',[(\"3\",6),(2,6)]).table()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 67, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat_1\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat_1\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + "
\n", + " cat_1\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 10 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "df\\\n", - " .cols.unnest([\"col_array\"])\\\n", - " .table()" + "df.cols.replace('*','.*[Cc]at.*', 'cat_1', regex=True).table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Unnest and array of ints" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df\\\n", - " .cols.unnest([\"col_int\"])\\\n", - " .table()" + "## Nest" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Spits in 3 parts" + "### Merge two columns in a column vector" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 68, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+----------+\n", + "| words|num|animals|thing| two strings|filter|num 2| col_array| col_int|new_col_1|col_nested|\n", + "+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+----------+\n", + "| I like fish | 1| dog|housé| cat-car| a| 1| [baby, sorry]|[1, 2, 3]| 1| [1.0,1.0]|\n", + "| zombies| 2| cat| tv| dog-tv| b| 2|[baby 1, sorry 1]| [3, 4]| 1| [2.0,1.0]|\n", + "|simpsons cat lady| 2| frog|table|eagle-tv-plus| 1| 3|[baby 2, sorry 2]|[5, 6, 7]| 1| [2.0,1.0]|\n", + "| null| 3| eagle|glass| lion-pc| c| 4|[baby 3, sorry 3]| [7, 8]| 1| [3.0,1.0]|\n", + "+-------------------+---+-------+-----+-------------+------+-----+-----------------+---------+---------+----------+\n", + "\n" + ] + } + ], "source": [ - "df\\\n", - " .cols.unnest([\"two strings\"], n= 3, mark = \"-\")\\\n", - " .table()" + "df.cols.nest([\"num\", \"new_col_1\"], output_col = \"col_nested\", shape =\"vector\").show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Unnest a Vector" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.ml.linalg import Vectors\n", - "\n", - "df1 = op.sc.parallelize([\n", - " (\"assert\", Vectors.dense([1, 2, 3])),\n", - " (\"require\", Vectors.sparse(3, {1: 2}))\n", - "]).toDF([\"word\", \"vector\"]) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df1\\\n", - " .cols.unnest([\"vector\"])\\\n", - " .table()" + "### Merge two columns in a string columns" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.cols.append(\"new_col_1\", 1)" - ] - }, - { - "cell_type": "markdown", + "execution_count": 69, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
col_nested
\n", + "
11 (string)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " dogcat-car\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " catdog-tv\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " frogeagle-tv-plus\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " eaglelion-pc\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "## Impute" + "df.cols.nest([\"animals\", \"two strings\"], output_col= \"col_nested\", shape = \"string\").table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Fill missing data" + "### Merge three columns in an array" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 70, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_col_1
\n", + "
10 (int)
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
col_nested
\n", + "
11 (array<string>)
\n", + "
\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + " \n", + " 1\n", + " \n", + " ['dog',⸱'cat-car',⸱'1']\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + " \n", + " 1\n", + " \n", + " ['cat',⸱'dog-tv',⸱'2']\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + " \n", + " 1\n", + " \n", + " ['frog',⸱'eagle-tv-plus',⸱'3']\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + " \n", + " 1\n", + " \n", + " ['eagle',⸱'lion-pc',⸱'4']\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 11 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "df_fill = op.spark.createDataFrame([(1.0, float(\"nan\")), (2.0, float(\"nan\")), \n", - " (float(\"nan\"), 3.0), (4.0, 4.0), (5.0, 5.0)], [\"a\", \"b\"])\n", - "\n", - "imputer = df_fill.cols.impute([\"a\", \"b\"], [\"out_a\", \"out_b\"], \"median\").table()" + "df.cols.nest([\"animals\", \"two strings\",\"num 2\"], \"col_nested\", shape=\"array\").table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Get columns by type\n", - "### Spark\n", - "Not implemented in Spark Vanilla\n", - "\n", - "### Pandas" + "## Histograms" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ - "df.cols.select_by_dtypes(\"int\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Apply custom function\n", - "\n", - "Spark have few ways to transform data rdd, Columns Expression, UDF and Pandas UDF. apply() and apply_expr() try to make a consistent way to call this expression without knowing the implementation details.\n", - "\n", - "### Spark\n", - "You need to declare a UDF Spark function\n", - "\n", - "### Pandas\n", - "Almost the same behavior that Optimus" + "from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType\n", + "df =op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/data/foo.csv\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df.table()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a function that only apply to string value in column filter\n", - "\n", - "Sometimes there are columns with for example with numbers even when are supposed to be only of words or letters. \n", - "\n", - "In order to solve this problem, apply_by_dtypes() function can be used. \n", - "\n", - "In the next example we replace a number in a string column with \"new string\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def func(val, attr):\n", - " return attr\n", - "\n", - "df.cols.apply_by_dtypes(\"filter\", func, \"string\", \"new string\", data_type=\"integer\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a UDF function that sum a values(32 in this case) to two columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def func(val, attr):\n", - " return val + attr\n", - "\n", - "df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 32 ,\"udf\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create a Pandas UDF function that sum a values(32 in this case) to two columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def func(val, attr):\n", - " return val + attr\n", - "\n", - "df.cols.apply([\"num\", \"new_col_1\"], func, \"int\", 10).table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select row where column \"filter\" is \"integer\"" - ] - }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from optimus.functions import filter_row_by_data_type as fbdt\n", - "\n", - 
"df.rows.select(fbdt(\"filter\", \"integer\")).table()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 75, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'count': 1, 'lower': 1.0, 'upper': 1.9},\n", + " {'count': 1, 'lower': 1.9, 'upper': 2.8},\n", + " {'count': 4, 'lower': 2.8, 'upper': 3.6999999999999997},\n", + " {'count': 1, 'lower': 3.6999999999999997, 'upper': 4.6},\n", + " {'count': 2, 'lower': 4.6, 'upper': 5.5},\n", + " {'count': 0, 'lower': 5.5, 'upper': 6.4},\n", + " {'count': 0, 'lower': 6.4, 'upper': 7.300000000000001},\n", + " {'count': 6, 'lower': 7.300000000000001, 'upper': 8.200000000000001},\n", + " {'count': 2, 'lower': 8.200000000000001, 'upper': 9.100000000000001},\n", + " {'count': 2, 'lower': 9.100000000000001, 'upper': 10.000000000000002}]" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Create an abstract dataframe to filter a rows where the value of column \"num\"> 1" + "df.cols.hist(\"price\", 10)" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from optimus.functions import abstract_udf as audf \n", - "\n", - "def func(val, attr):\n", - " return val>1\n", - "\n", - "df.rows.select(audf(\"num\", func, \"boolean\")).table()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 76, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'value': 992, 'count': 1},\n", + " {'value': 912, 'count': 1},\n", + " {'value': 886, 'count': 1},\n", + " {'value': 875, 'count': 1},\n", + " {'value': 812, 'count': 1},\n", + " {'value': 735, 'count': 1},\n", + " {'value': 672, 'count': 1},\n", + " {'value': 634, 'count': 1},\n", + " {'value': 624, 'count': 1},\n", + " {'value': 553, 'count': 1}]" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Create an abstract dataframe (Pandas UDF) to pass two arguments to 
a function a apply a sum operation" + "df.cols.frequency(\"billingId\")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from optimus.functions import abstract_udf as audf \n", - "\n", - "def func(val, attr):\n", - " return val+attr[0]+ attr[1]\n", - "\n", - "df.withColumn(\"num_sum\", audf (\"num\", func, \"int\", [10,20])).table()" + "## Statistics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Apply a column expression to when the value of \"num\" or \"num 2\" is grater than 2" + "### Quantile Statistics" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql import functions as F\n", - "def func(col_name, attr):\n", - " return F.when(F.col(col_name)>2 ,10).otherwise(1)\n", - "\n", - "df.cols.apply_expr([\"num\",\"num 2\"], func).table()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 77, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "111\n", + "{'billingId': {0.05: 111.0, 0.25: 111.0, 0.5: 111.0, 0.75: 111.0, 0.95: 111.0}, 'price': {0.05: 1.0, 0.25: 1.0, 0.5: 1.0, 0.75: 1.0, 0.95: 1.0}}\n", + "992\n", + "{'billingId': 111.0, 'price': 1.0}\n", + "{'billingId': {'min': 111, 'max': 992}, 'price': {'min': 1, 'max': 10}}\n", + "{'billingId': {'stddev': 280.1973510859008}, 'price': {'stddev': 2.9528457876452054}}\n" + ] + } + ], "source": [ - "### Convert to uppercase" + "print(df.cols.min(\"billingId\"))\n", + "print(df.cols.percentile(['billingId', 'price'], [0.05, 0.25, 0.5, 0.75, 0.95]))\n", + "print(df.cols.max(\"billingId\"))\n", + "print(df.cols.median([\"billingId\",\"price\"]))\n", + "print(df.cols.range([\"billingId\",\"price\"]))\n", + "print(df.cols.std([\"billingId\",\"price\"]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 78, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": 
"stdout", + "output_type": "stream", + "text": [ + "{'id': {'min': 1}, 'firstName': {'min': '((( Heinrich )))))'}, 'lastName': {'min': ' GALiLEI'}, 'billingId': {'min': 111}, 'product': {'min': '110790'}, 'price': {'min': 1}, 'birth': {'min': '1899/01/01'}, 'dummyCol': {'min': '#'}}\n" + ] + } + ], "source": [ - "from pyspark.sql import functions as F\n", - "def func(col_name, attr):\n", - " return F.upper(F.col(col_name))\n", - "\n", - "df.cols.apply_expr([\"two strings\",\"animals\"], func).table()" + "print(df.cols.min(\"*\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Using apply with a condition" + "### Descriptive Statistics" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 79, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-1.0411548120618528\n", + "556.0\n", + "-0.2137018086949909\n", + "10564\n", + "78510.55555555555\n", + "0.0\n" + ] + } + ], "source": [ - "def func(val, attr):\n", - " return 10\n", - "\n", - "col = \"num\"\n", - "\n", - "df.cols.apply(col, func, \"int\", when= df[\"num\"]>1).table()\n", - "\n", - "df.cols.apply(col, func, \"int\", when= fbdt(col, \"int\")).table()" + "print(df.cols.kurt(\"billingId\"))\n", + "print(df.cols.mean(\"billingId\"))\n", + "print(df.cols.skewness(\"billingId\"))\n", + "print(df.cols.sum(\"billingId\"))\n", + "print(df.cols.variance(\"billingId\"))\n", + "print(df.cols.mad(\"billingId\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Count Nulls" + "### Calculate Median Absolute deviation" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 80, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "import numpy as np\n", - "\n", - "df_null = op.spark.createDataFrame(\n", - " [(1, 1, None), (1, 2, float(5)), 
(1, 3, np.nan), (1, 4, None), (1, 5, float(10)), (1, 6, float('nan')), (1, 6, float('nan'))],\n", - " ('session', \"timestamp1\", \"id2\"))" + "df.cols.mad(\"price\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'mad': 0.0, 'median': 1.0}" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df_null.table()" + "df.cols.mad(\"price\", more= True)" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df_null.cols.count_na(\"id2\")" + "### Calculate percentiles" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{0.05: 1.0, 0.25: 1.0, 0.5: 1.0, 0.75: 1.0, 0.95: 1.0}\n" + ] + } + ], "source": [ - "df_null.cols.count_na(\"*\")" + "print(df.cols.percentile(['price'], [0.05, 0.25, 0.5, 0.75, 0.95]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Count uniques\n", - "### Spark\n", - "\n", - "### Pandas\n" + "### Calculate Mode" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'price': 8}, {'billingId': None}]\n" + ] + } + ], "source": [ - "df.cols.count_uniques(\"*\")" + "print(df.cols.mode([\"price\",\"billingId\"]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Unique\n", - "### Spark\n", - "An abstraction of distinct to be use in multiple columns at the same time\n", - "\n", - "### Pandas\n", - "Similar behavior than pandas" + "## String Operations" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { +
"text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df.table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_distinct = op.create.df(\n", - " [\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", True)\n", - " ],\n", - "[\n", - " (\" I like fish \", 1),\n", - " (\" zombies\", 2),\n", - " (\"simpsons cat lady\", 2),\n", - " (None, 3),\n", - " (None, 0)\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_distinct.cols.unique(\"num\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Count Zeros" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_zeros = df_distinct\n", - "df_zeros.table()\n", - "df_zeros.cols.count_zeros(\"*\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Column Data Types" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.dtypes('*')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Replace" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Replace \"dog\",\"cat\" in column \"animals\" by the word \"animals\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace(\"animals\",[\"dog\",\"cat\"],\"animals\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Replace \"dog-tv\", \"cat\", \"eagle\", \"fish\" in columns \"two strings\",\"animals\" by \"animals\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace([\"two strings\",\"animals\"], [\"dog-tv\", \"cat\", \"eagle\", \"fish\"], 
\"animals\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Replace \"dog\" by \"dog_1\" and \"cat\" by \"cat_1\" in columns \"animals\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace(\"animals\",[(\"dog\",\"dog_1\"),(\"cat\",\"cat_1\")]).table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Replace in column \"animals\", \"dog\" by \"pet\" " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace(\"animals\",\"dog\",\"animal\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace('num',[\"3\",2], 10).table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace('num',[(\"3\",6),(2,6)]).table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.replace('*','.*[Cc]at.*', 'cat_1', regex=True).table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Nest" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Merge two columns in a column vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " SIUL\n", + " \n", + " alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " CAKE\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " ÉRDNA\n", + " \n", + " ampère\n", + " \n", + " 423\n", + " \n", + " PIZA\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " SLEIN\n", + " \n", + " böhr//((%%\n", + " \n", + " 551\n", + " \n", + " PIZZA\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " LUAP\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " PIZZA\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " TREBLA\n", + " \n", + " einstein\n", + " \n", + " 634\n", + " \n", + " PIZZA\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " OELILAG\n", + " \n", + " galilei\n", + " \n", + " 672\n", + " \n", + " AREPA\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " LRAC\n", + " \n", + " ga%%%uss\n", + " \n", + " 323\n", + " \n", + " TACO\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " DIVAD\n", + " \n", + " h$$$ilbert\n", + " \n", + " 624\n", + " \n", + " TAAACCOO\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " SENNAHOJ\n", + " \n", + " kepler\n", + " \n", + " 735\n", + " \n", + " TACO\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " SEMAJ\n", + " \n", + " m$$ax%%well\n", + " \n", + " 875\n", + " \n", + " TACO\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " CAASI\n", + " \n", + " newton\n", + " \n", + " 992\n", + " \n", + " PASTA\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " %%YMME\n", + " \n", + " nöether$\n", + " \n", + " 234\n", + " \n", + " PASTA\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " !!!XAM\n", + " \n", + " planck!!!\n", + " \n", + " 111\n", + " \n", + " HAMBURGUER\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " DERF\n", + " \n", + " hoy&&&le\n", + " \n", + " 553\n", + " \n", + " PIZZZA\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " )))))⸱HCIRNIEH⸱⸱⸱(((\n", + " \n", + " hertz\n", + " \n", + " 116\n", + " \n", + " PIZZA\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " MAILLIW\n", + " \n", + " gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " EIRAM\n", + " \n", + " curie\n", + " \n", + " 912\n", + " \n", + " RICE\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " RUHTRA\n", + " \n", + " com%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " SEMAJ\n", + " \n", + " chadwick\n", + " \n", + " 467\n", + " \n", + " NULL\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "df.cols.nest([\"num\", \"new_col_1\"], output_col = \"col_nested\", shape =\"vector\").show()" + "df\\\n", + " .cols.trim(\"lastName\")\\\n", + " .cols.lower(\"lastName\")\\\n", + " .cols.upper([\"product\", \"firstName\"])\\\n", + " .cols.reverse(\"firstName\")\\\n", + " .table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Merge two columns in a string columns" + "### Calculate the interquartile range" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.nest([\"animals\", \"two strings\"], output_col= \"col_nested\", shape = \"string\").table()" - ] - }, - { - "cell_type": "markdown", + "execution_count": 86, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Merge three columns in an array" + "df.cols.iqr(\"price\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'iqr': 0.0, 'q1': 1.0, 'q3': 1.0}" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df.cols.nest([\"animals\", \"two strings\",\"num 2\"], \"col_nested\", shape=\"array\").table()" + "df.cols.iqr(\"price\", more= True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Histograms" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, ArrayType\n", - "df =op.load.url(\"https://raw.githubusercontent.com/ironmussa/Optimus/master/examples/foo.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
"outputs": [], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.hist(\"price\", 10)" + "### Calculate Zscore" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.frequency(\"billingId\")" - ] - }, - { - "cell_type": "markdown", + "execution_count": 88, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
z_col_price
\n", + "
9 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + " \n", + " 1.3368014129178498\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + " \n", + " 0.3564803767780932\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + " \n", + " 1.0337930926564705\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + " \n", + " 1.0337930926564705\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + " \n", + " 1.0337930926564705\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + " \n", + " 1.0337930926564705\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + " \n", + " 0.9981450549786612\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + " \n", + " 0.9981450549786612\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + " \n", + " 0.6951367347172818\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + " \n", + " 0.6594886970394727\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + " \n", + " 1.372449450595659\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + " \n", + " 1.7111058085348476\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + " \n", + " 0.3564803767780932\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + " \n", + " 1.3368014129178498\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "## Statistics" + "df.cols.z_score(\"price\").table()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Quantile Statistics" + "## Cleaning and Date Operations" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 89, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new_date
\n", + "
9 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + " \n", + " 07-07-1980\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + " \n", + " 08-07-1950\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + " \n", + " 09-07-1990\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + " \n", + " 10-07-1954\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + " \n", + " 11-07-1990\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + " \n", + " 12-08-1930\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + " \n", + " 13-07-1970\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + " \n", + " 14-07-1950\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + " \n", + " 22-04-1920\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + " \n", + " 12-03-1923\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + " \n", + " 15-02-1999\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + " \n", + " 08-12-1993\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + " \n", + " 04-01-1994\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + " \n", + " 27-06-1997\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + " \n", + " 30-11-1956\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + " \n", + " 26-03-1958\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + " \n", + " 22-03-2000\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + " \n", + " 01-01-1899\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + " \n", + " 03-05-1921\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "print(df.cols.min(\"billingId\"))\n", - "print(df.cols.percentile(['billingId', 'price'], [0.05, 0.25, 0.5, 0.75, 0.95]))\n", - "print(df.cols.max(\"billingId\"))\n", - "print(df.cols.median([\"billingId\",\"price\"]))\n", - "print(df.cols.range([\"billingId\",\"price\"]))\n", - "print(df.cols.std([\"billingId\",\"price\"]))" + "df.cols.date_transform(\"birth\", \"new_date\", \"yyyy/MM/dd\", \"dd-MM-YYYY\").table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(df.cols.min(\"*\"))" - ] - }, - { - "cell_type": "markdown", + "execution_count": 90, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
new date
\n", + "
9 (float)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + " \n", + " None\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + " \n", + " None\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + " \n", + " None\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + " \n", + " None\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + " \n", + " None\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + " \n", + " None\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + " \n", + " None\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + " \n", + " None\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + " \n", + " None\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + " \n", + " None\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + " \n", + " None\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + " \n", + " None\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + " \n", + " None\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + " \n", + " None\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + " \n", + " None\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + " \n", + " None\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + " \n", + " None\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + " \n", + " None\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + " \n", + " None\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "### Descriptive Statistics" + "df.cols.years_between(\"birth\", \"new date\", \"yyyyMMdd\",).table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(df.cols.kurt(\"billingId\"))\n", - "print(df.cols.mean(\"billingId\"))\n", - "print(df.cols.skewness(\"billingId\"))\n", - "print(df.cols.sum(\"billingId\"))\n", - "print(df.cols.variance(\"billingId\"))\n", - "print(df.cols.mad(\"billingId\"))" - ] - }, - { - "cell_type": "markdown", + "execution_count": 91, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampere\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Bohr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Noether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "### Calculate Median Absolute deviation" + "df.cols.remove_accents(\"lastName\").table()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 92, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Gauss\n", + " \n", + " 323\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " Hilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " Maxwell\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoyle\n", + " \n", + " 553\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert\n", + " \n", + " 886\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 912\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COMpton\n", + " \n", + " 812\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "df.cols.mad(\"price\")" + "df.cols.remove_special_chars(\"lastName\").table()" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.mad(\"price\", more= True)" - ] - }, - { - "cell_type": "markdown", + "execution_count": 93, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 200\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 200\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 200\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 200\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 200\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + "
\n", + " 7\n", + " \n", + " CaRL\n", + " \n", + " Ga%%%uss\n", + " \n", + " 200\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1970/07/13\n", + " \n", + " gonna\n", + "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 200\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 200\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 200\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 200\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 200\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + "
\n", + " 14\n", + " \n", + " Fred\n", + " \n", + " Hoy&&&le\n", + " \n", + " 200\n", + " \n", + " pizzza\n", + " \n", + " 8\n", + " \n", + " 1997/06/27\n", + " \n", + " around\n", + "
\n", + " 15\n", + " \n", + " (((⸱⸱⸱Heinrich⸱)))))\n", + " \n", + " Hertz\n", + " \n", + " 116\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1956/11/30\n", + " \n", + " and\n", + "
\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 200\n", + " \n", + " BEER\n", + " \n", + " 2\n", + " \n", + " 1958/03/26\n", + " \n", + " desert\n", + "
\n", + " 17\n", + " \n", + " Marie\n", + " \n", + " CURIE\n", + " \n", + " 200\n", + " \n", + " Rice\n", + " \n", + " 1\n", + " \n", + " 2000/03/22\n", + " \n", + " you\n", + "
\n", + " 18\n", + " \n", + " Arthur\n", + " \n", + " COM%%%pton\n", + " \n", + " 200\n", + " \n", + " 110790\n", + " \n", + " 5\n", + " \n", + " 1899/01/01\n", + " \n", + " #\n", + "
\n", + " 19\n", + " \n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 200\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + "
\n", + "\n", + "
Viewing 19 of 19 rows / 8 columns
\n", + "
1 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "### Calculate precentiles" + "df.cols.clip(\"billingId\", 100 , 200).table()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ - "print(df.cols.percentile(['price'], [0.05, 0.25, 0.5, 0.75, 0.95]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Calculate Mode" + "df_abs = op.create.df(\n", + " [\n", + " (\"words\", \"str\", True),\n", + " (\"num\", \"int\", True),\n", + " (\"animals\", \"str\", True),\n", + " (\"thing\", StringType(), True),\n", + " (\"two strings\", StringType(), True),\n", + " (\"filter\", StringType(), True),\n", + " (\"num 2\", \"string\", True),\n", + " (\"col_array\", ArrayType(StringType()), True),\n", + " (\"col_int\", ArrayType(IntegerType()), True)\n", + "\n", + " ]\n", + ",\n", + "[\n", + " (\" I like fish \", -1, \"dog\", \"housé\", \"cat-car\", \"a\",\"-1\",[\"baby\", \"sorry\"],[1,2,3]),\n", + " (\" zombies\", -2, \"cat\", \"tv\", \"dog-tv\", \"b\",\"-2\",[\"baby 1\", \"sorry 1\"],[3,4]),\n", + " (\"simpsons cat lady\", -2, \"frog\", \"table\",\"eagle-tv-plus\",\"1\",\"3\", [\"baby 2\", \"sorry 2\"], [5,6,7]),\n", + " (None, 3, \"eagle\", \"glass\", \"lion-pc\", \"c\",\"4\", [\"baby 3\", \"sorry 3\"] ,[7,8])\n", + " ])\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 95, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "['words', 'num', 'animals', 'thing', 'two strings', 'filter', 'num 2', 'col_array', 'col_int', 'new_col_1']\n", - "10\n" - ] - }, - { - "ename": "ValueError", - "evalue": "'missing_columns' must be 'words', 'num', 'animals', 'thing', 'two strings', 'filter', 'num 2', 'col_array', 'col_int', 'new_col_1', received '['billingId', 'price']'", - "output_type": "error", - "traceback": [ - 
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"price\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"billingId\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/optimus/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36mmode\u001b[0;34m(columns)\u001b[0m\n\u001b[1;32m 
636\u001b[0m \"\"\"\n\u001b[1;32m 637\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 638\u001b[0;31m \u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mparse_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 639\u001b[0m \u001b[0mmode_result\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 640\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36mparse_columns\u001b[0;34m(df, cols_args, get_args, is_regex, filter_by_column_dtypes, accepts_missing_cols)\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;31m# Check for missing columns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0maccepts_missing_cols\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0mcheck_for_missing_columns\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0;31m# Filter by column data type\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/helpers/functions.py\u001b[0m in \u001b[0;36mcheck_for_missing_columns\u001b[0;34m(df, col_names)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 217\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m 
\u001b[0mRaiseIt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmissing_columns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/helpers/raiseit.py\u001b[0m in \u001b[0;36mvalue_error\u001b[0;34m(var, _list)\u001b[0m\n\u001b[1;32m 59\u001b[0m type=divisor.join(map(\n\u001b[1;32m 60\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m\"'\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 61\u001b[0;31m _list)), var_type=var))\n\u001b[0m\u001b[1;32m 62\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 63\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mstaticmethod\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: 'missing_columns' must be 'words', 'num', 'animals', 'thing', 'two strings', 'filter', 'num 2', 'col_array', 'col_int', 'new_col_1', received '['billingId', 'price']'" - ] + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
words
\n", + "
1 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num
\n", + "
2 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
animals
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
thing
\n", + "
4 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
two strings
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
filter
\n", + "
6 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
num 2
\n", + "
7 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_array
\n", + "
8 (array<string>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
col_int
\n", + "
9 (array<int>)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", + " \n", + " 1\n", + " \n", + " dog\n", + " \n", + " housé\n", + " \n", + " cat-car\n", + " \n", + " a\n", + " \n", + " 1.0\n", + " \n", + " ['baby',⸱'sorry']\n", + " \n", + " [1,⸱2,⸱3]\n", + "
\n", + " ⸱⸱⸱⸱zombies\n", + " \n", + " 2\n", + " \n", + " cat\n", + " \n", + " tv\n", + " \n", + " dog-tv\n", + " \n", + " b\n", + " \n", + " 2.0\n", + " \n", + " ['baby⸱1',⸱'sorry⸱1']\n", + " \n", + " [3,⸱4]\n", + "
\n", + " simpsons⸱⸱⸱cat⸱lady\n", + " \n", + " 2\n", + " \n", + " frog\n", + " \n", + " table\n", + " \n", + " eagle-tv-plus\n", + " \n", + " 1\n", + " \n", + " 3.0\n", + " \n", + " ['baby⸱2',⸱'sorry⸱2']\n", + " \n", + " [5,⸱6,⸱7]\n", + "
\n", + " None\n", + " \n", + " 3\n", + " \n", + " eagle\n", + " \n", + " glass\n", + " \n", + " lion-pc\n", + " \n", + " c\n", + " \n", + " 4.0\n", + " \n", + " ['baby⸱3',⸱'sorry⸱3']\n", + " \n", + " [7,⸱8]\n", + "
\n", + "\n", + "
Viewing 4 of 4 rows / 9 columns
\n", + "
4 partition(s)
\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "print(df.cols.mode([\"price\",\"billingId\"]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## String Operations" + "df_abs.cols.abs([\"num\",\"num 2\"]).table()" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", "\n", "\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n", + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n", "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - "\n", - " \n", - " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -4763,43 +27498,39 @@ " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", @@ -4807,7 +27538,23 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", @@ -4895,7 +27698,8 @@ " \n", "
\n", - "
words
\n", - "
1 (string)
\n", - "\n", - "
\n", + "
id
\n", + "
1 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
firstName
\n", + "
2 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
lastName
\n", + "
3 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId
\n", + "
4 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
product
\n", + "
5 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
price
\n", + "
6 (int)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
birth
\n", + "
7 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
dummyCol
\n", + "
8 (string)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + "
billingId_ad
\n", + "
9 (double)
\n", + "
\n", + " \n", + " nullable\n", + " \n", + "
\n", + "
\n", + " 1\n", + " \n", + " Luis\n", + " \n", + " Alvarez$$%!\n", + " \n", + " 123\n", + " \n", + " Cake\n", + " \n", + " 10\n", + " \n", + " 1980/07/07\n", + " \n", + " never\n", + " \n", + " 0.0\n", + "
\n", + " 2\n", + " \n", + " André\n", + " \n", + " Ampère\n", + " \n", + " 423\n", + " \n", + " piza\n", + " \n", + " 8\n", + " \n", + " 1950/07/08\n", + " \n", + " gonna\n", + " \n", + " 1.0\n", + "
\n", + " 3\n", + " \n", + " NiELS\n", + " \n", + " Böhr//((%%\n", + " \n", + " 551\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/09\n", + " \n", + " give\n", + " \n", + " 2.0\n", + "
\n", + " 4\n", + " \n", + " PAUL\n", + " \n", + " dirac$\n", + " \n", + " 521\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1954/07/10\n", + " \n", + " you\n", + " \n", + " 2.0\n", + "
\n", + " 5\n", + " \n", + " Albert\n", + " \n", + " Einstein\n", + " \n", + " 634\n", + " \n", + " pizza\n", + " \n", + " 8\n", + " \n", + " 1990/07/11\n", + " \n", + " up\n", + " \n", + " 3.0\n", + "
\n", + " 6\n", + " \n", + " Galileo\n", + " \n", + " ⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱⸱GALiLEI\n", + " \n", + " 672\n", + " \n", + " arepa\n", + " \n", + " 5\n", + " \n", + " 1930/08/12\n", + " \n", + " never\n", + " \n", + " 3.0\n", + "
\n", + " 7\n", + " \n", - "
num
\n", - "
2 (int)
\n", - "\n", - "
\n", + " CaRL\n", + " \n", - "
animals
\n", - "
3 (string)
\n", - "\n", - "
\n", + " Ga%%%uss\n", + " \n", - "
thing
\n", - "
4 (string)
\n", - "\n", - "
\n", + " 323\n", + " \n", - "
two strings
\n", - "
5 (string)
\n", - "\n", - "
\n", + " taco\n", + " \n", - "
filter
\n", - "
6 (string)
\n", - "\n", - "
\n", + " 3\n", + " \n", - "
num 2
\n", - "
7 (string)
\n", - "\n", - "
\n", + " 1970/07/13\n", + " \n", - "
col_array
\n", - "
8 (array<string>)
\n", - "\n", - "
\n", + " gonna\n", + " \n", - "
col_int
\n", - "
9 (array<int>)
\n", - "\n", - "
\n", + " 1.0\n", + " \n", - "
new_col_1
\n", - "
10 (int)
\n", - "\n", - "
\n", + " 8\n", + " \n", + " David\n", + " \n", + " H$$$ilbert\n", + " \n", + " 624\n", + " \n", + " taaaccoo\n", + " \n", + " 3\n", + " \n", + " 1950/07/14\n", + " \n", + " let\n", + " \n", + " 2.0\n", + "
\n", + " 9\n", + " \n", + " Johannes\n", + " \n", + " KEPLER\n", + " \n", + " 735\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1920/04/22\n", + " \n", + " you\n", + " \n", + " 3.0\n", + "
\n", + " 10\n", + " \n", + " JaMES\n", + " \n", + " M$$ax%%well\n", + " \n", + " 875\n", + " \n", + " taco\n", + " \n", + " 3\n", + " \n", + " 1923/03/12\n", + " \n", + " down\n", + " \n", + " 4.0\n", + "
\n", + " 11\n", + " \n", + " Isaac\n", + " \n", + " Newton\n", + " \n", + " 992\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1999/02/15\n", + " \n", + " never⸱\n", + " \n", + " 4.0\n", + "
\n", + " 12\n", + " \n", + " Emmy%%\n", + " \n", + " Nöether$\n", + " \n", + " 234\n", + " \n", + " pasta\n", + " \n", + " 9\n", + " \n", + " 1993/12/08\n", + " \n", + " gonna\n", + " \n", + " 1.0\n", + "
\n", + " 13\n", + " \n", + " Max!!!\n", + " \n", + " Planck!!!\n", + " \n", + " 111\n", + " \n", + " hamburguer\n", + " \n", + " 4\n", + " \n", + " 1994/01/04\n", + " \n", + " run⸱\n", + " \n", + " 0.0\n", + "
\n", - " ⸱⸱I⸱like⸱⸱⸱⸱⸱fish⸱⸱\n", - " \n", - " 1\n", + " 14\n", " \n", - " dog\n", + " Fred\n", " \n", - " housé\n", + " Hoy&&&le\n", " \n", - " cat-car\n", + " 553\n", " \n", - " a\n", + " pizzza\n", " \n", - " 1\n", + " 8\n", " \n", - " ['baby',⸱'sorry']\n", + " 1997/06/27\n", " \n", - " [1,⸱2,⸱3]\n", + " around\n", " \n", - " 1\n", + " 2.0\n", "
\n", - " ⸱⸱⸱⸱zombies\n", - " \n", - " 2\n", + " 15\n", " \n", - " cat\n", + " (((⸱⸱⸱Heinrich⸱)))))\n", " \n", - " tv\n", + " Hertz\n", " \n", - " dog-tv\n", + " 116\n", " \n", - " b\n", + " pizza\n", " \n", - " 2\n", + " 8\n", " \n", - " ['baby⸱1',⸱'sorry⸱1']\n", + " 1956/11/30\n", " \n", - " [3,⸱4]\n", + " and\n", " \n", - " 1\n", + " 0.0\n", "
\n", - " simpsons⸱⸱⸱cat⸱lady\n", + " 16\n", + " \n", + " William\n", + " \n", + " Gilbert###\n", + " \n", + " 886\n", + " \n", + " BEER\n", " \n", @@ -4815,79 +27562,135 @@ " \n", - " frog\n", + " 1958/03/26\n", " \n", - " table\n", + " desert\n", " \n", - " eagle-tv-plus\n", + " 4.0\n", + "
\n", + " 17\n", " \n", - " 1\n", + " Marie\n", " \n", - " 3\n", + " CURIE\n", " \n", - " ['baby⸱2',⸱'sorry⸱2']\n", + " 912\n", " \n", - " [5,⸱6,⸱7]\n", + " Rice\n", " \n", " 1\n", " \n", + " 2000/03/22\n", + " \n", + " you\n", + " \n", + " 4.0\n", + "
\n", - " None\n", + " 18\n", " \n", - " 3\n", + " Arthur\n", " \n", - " eagle\n", + " COM%%%pton\n", " \n", - " glass\n", + " 812\n", " \n", - " lion-pc\n", + " 110790\n", " \n", - " c\n", + " 5\n", " \n", - " 4\n", + " 1899/01/01\n", " \n", - " ['baby⸱3',⸱'sorry⸱3']\n", + " #\n", " \n", - " [7,⸱8]\n", + " 3.0\n", + "
\n", + " 19\n", " \n", - " 1\n", + " JAMES\n", + " \n", + " Chadwick\n", + " \n", + " 467\n", + " \n", + " null\n", + " \n", + " 10\n", + " \n", + " 1921/05/03\n", + " \n", + " #\n", + " \n", + " 1.0\n", "
\n", "\n", - "
Viewing 4 of 4 rows / 10 columns
\n" + "
Viewing 19 of 19 rows / 9 columns
\n", + "
1 partition(s)
\n" ], "text/plain": [ "" @@ -4905,192 +27709,9 @@ "output_type": "display_data" } ], - "source": [ - "df.table()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "ename": "AnalysisException", - "evalue": "\"cannot resolve '`lastName`' given input columns: [col_int, animals, new_col_1, col_array, thing, num 2, filter, two strings, num, words];;\\n'Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, new_col_1#42, trim('lastName, None) AS lastName#632]\\n+- AnalysisBarrier\\n +- Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, 1 AS new_col_1#42]\\n +- LogicalRDD [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8], false\\n\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[0;34m(answer, gateway_client, target_id, 
name)\u001b[0m\n\u001b[1;32m 327\u001b[0m \u001b[0;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 328\u001b[0;31m format(target_id, \".\", name), value)\n\u001b[0m\u001b[1;32m 329\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mPy4JJavaError\u001b[0m: An error occurred while calling o69.withColumn.\n: org.apache.spark.sql.AnalysisException: cannot resolve '`lastName`' given input columns: [col_int, animals, new_col_1, col_array, thing, num 2, filter, two strings, num, words];;\n'Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, new_col_1#42, trim('lastName, None) AS lastName#632]\n+- AnalysisBarrier\n +- Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, 1 AS new_col_1#42]\n +- LogicalRDD [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8], false\n\n\tat org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:88)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1$$anonfun$apply$2.applyOrElse(CheckAnalysis.scala:85)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformUp$1.apply(TreeNode.scala:289)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:288)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)\n\tat 
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$3.apply(TreeNode.scala:286)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:286)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:95)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$transformExpressionsUp$1.apply(QueryPlan.scala:95)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:107)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$1.apply(QueryPlan.scala:107)\n\tat org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:106)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:118)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1$1.apply(QueryPlan.scala:122)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)\n\tat scala.collection.immutable.List.foreach(List.scala:381)\n\tat 
scala.collection.TraversableLike$class.map(TraversableLike.scala:234)\n\tat scala.collection.immutable.List.map(List.scala:285)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.org$apache$spark$sql$catalyst$plans$QueryPlan$$recursiveTransform$1(QueryPlan.scala:122)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan$$anonfun$2.apply(QueryPlan.scala:127)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:127)\n\tat org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:95)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:85)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:80)\n\tat org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:127)\n\tat org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:80)\n\tat org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:92)\n\tat org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105)\n\tat org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:57)\n\tat org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:55)\n\tat org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:47)\n\tat org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:74)\n\tat org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$withPlan(Dataset.scala:3296)\n\tat org.apache.spark.sql.Dataset.select(Dataset.scala:1307)\n\tat org.apache.spark.sql.Dataset.withColumns(Dataset.scala:2192)\n\tat org.apache.spark.sql.Dataset.withColumn(Dataset.scala:2159)\n\tat sun.reflect.GeneratedMethodAccessor65.invoke(Unknown Source)\n\tat 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:498)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:282)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:238)\n\tat java.lang.Thread.run(Thread.java:748)\n", - "\nDuring handling of the above exception, another exception occurred:\n", - "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"lastName\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"lastName\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"product\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"firstName\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreverse\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"firstName\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m.\u001b[0m\u001b[0mtable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/optimus/optimus/helpers/decorators.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 24\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 25\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 26\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 27\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 28\u001b[0m \u001b[0msetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36mtrim\u001b[0;34m(columns)\u001b[0m\n\u001b[1;32m 693\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrim\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mF\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 694\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 695\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mapply_expr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_trim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 696\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 697\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0madd_attr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcols\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/optimus/optimus/dataframe/columns.py\u001b[0m in \u001b[0;36mapply_expr\u001b[0;34m(columns, func, args, filter_col_by_dtypes, verbose)\u001b[0m\n\u001b[1;32m 124\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mcol_name\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 126\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwithColumn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maudf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0m_func\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattrs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc_type\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"column_exp\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverbose\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 127\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 128\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/pyspark/sql/dataframe.py\u001b[0m in \u001b[0;36mwithColumn\u001b[0;34m(self, colName, col)\u001b[0m\n\u001b[1;32m 1847\u001b[0m \"\"\"\n\u001b[1;32m 1848\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mColumn\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"col should be Column\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 
1849\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwithColumn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolName\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_jc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msql_ctx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1850\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1851\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mignore_unicode_prefix\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1255\u001b[0m \u001b[0manswer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1256\u001b[0m return_value = get_return_value(\n\u001b[0;32m-> 1257\u001b[0;31m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[1;32m 1258\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1259\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda/lib/python3.6/site-packages/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 67\u001b[0m e.java_exception.getStackTrace()))\n\u001b[1;32m 68\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.AnalysisException: 
'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 70\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'org.apache.spark.sql.catalyst.analysis'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 71\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mAnalysisException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m': '\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstackTrace\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAnalysisException\u001b[0m: \"cannot resolve '`lastName`' given input columns: [col_int, animals, new_col_1, col_array, thing, num 2, filter, two strings, num, words];;\\n'Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, new_col_1#42, trim('lastName, None) AS lastName#632]\\n+- AnalysisBarrier\\n +- Project [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8, 1 AS new_col_1#42]\\n +- LogicalRDD [words#0, num#1, animals#2, thing#3, two strings#4, filter#5, num 2#6, col_array#7, col_int#8], false\\n\"" - ] - } - ], - "source": [ - "df\\\n", - " .cols.trim(\"lastName\")\\\n", - " 
.cols.lower(\"lastName\")\\\n", - " .cols.upper([\"product\", \"firstName\"])\\\n", - " .cols.reverse(\"firstName\")\\\n", - " .table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Calculate the interquartile range" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.iqr(\"price\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.iqr(\"price\", more= True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Calculate Zscore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.z_score(\"price\").table()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Cleaning and Date Operations Operations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.date_transform(\"birth\", \"new_date\", \"yyyy/MM/dd\", \"dd-MM-YYYY\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.years_between(\"birth\", \"new date\", \"yyyyMMdd\",).table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.remove_accents(\"lastName\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.remove_special_chars(\"lastName\").table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.cols.clip(\"billingId\", 100 , 200).table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_abs = op.create.df(\n", - " [\n", - " (\"words\", \"str\", True),\n", - " (\"num\", \"int\", 
True),\n", - " (\"animals\", \"str\", True),\n", - " (\"thing\", StringType(), True),\n", - " (\"two strings\", StringType(), True),\n", - " (\"filter\", StringType(), True),\n", - " (\"num 2\", \"string\", True),\n", - " (\"col_array\", ArrayType(StringType()), True),\n", - " (\"col_int\", ArrayType(IntegerType()), True)\n", - "\n", - " ]\n", - ",\n", - "[\n", - " (\" I like fish \", -1, \"dog\", \"housé\", \"cat-car\", \"a\",\"-1\",[\"baby\", \"sorry\"],[1,2,3]),\n", - " (\" zombies\", -2, \"cat\", \"tv\", \"dog-tv\", \"b\",\"-2\",[\"baby 1\", \"sorry 1\"],[3,4]),\n", - " (\"simpsons cat lady\", -2, \"frog\", \"table\",\"eagle-tv-plus\",\"1\",\"3\", [\"baby 2\", \"sorry 2\"], [5,6,7]),\n", - " (None, 3, \"eagle\", \"glass\", \"lion-pc\", \"c\",\"4\", [\"baby 3\", \"sorry 3\"] ,[7,8])\n", - " ])\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_abs.cols.abs([\"num\",\"num 2\"]).table()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "df.cols.qcut(\"billingId\",\"billingId_ad\",5).table()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {