In [None]:
!pip install "fugue[all]"

# Just Like SQL

In [None]:
from fugue_notebook import setup

# Run this before the %%fsql cells; presumably it registers the %%fsql cell magic
# used throughout this notebook — confirm in the fugue_notebook docs.
setup()

In [28]:
import pandas as pd

# Small demo table queried by the cells below: an integer column and a
# string column in which the value "c" appears twice.
df = pd.DataFrame(
    {
        "col1": [1, 2, 3, 4],
        "col2": ["a", "b", "c", "c"],
    }
)
df

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c
3,4,c


<IPython.core.display.Javascript object>

In [5]:
%%fsql 

-- Keep only the rows whose col2 equals c and display them.
-- The string literal uses single quotes: standard SQL reads double quotes as identifiers.
SELECT *
FROM df 
WHERE col2 = 'c'
PRINT

Unnamed: 0,col1,col2
0,3,c
1,4,c


<IPython.core.display.Javascript object>

In [6]:
%%fsql 

-- Average col1 within each col2 group and display one row per group.
SELECT col2, AVG(col1) AS avg_col1
FROM df 
GROUP BY col2
PRINT

Unnamed: 0,col2,avg_col1
0,a,1.0
1,b,2.0
2,c,3.5


<IPython.core.display.Javascript object>

# Enhance SQL Interface

In [7]:
%%fsql  

-- Name the filtered result df2 so the SAVE statement can refer to it.
-- The string literal uses single quotes: standard SQL reads double quotes as identifiers.
df2 = SELECT *
FROM df 
WHERE col2 = 'c'

-- Write df2 to a CSV file with a header row, replacing any existing file.
SAVE df2 OVERWRITE '/tmp/df2.csv' (header=true)

<IPython.core.display.Javascript object>

In [8]:
%%fsql  

-- Read back the CSV file at the path the previous cell saved to (header row included), then display it.
df3 = LOAD '/tmp/df2.csv' (header=true)

SELECT *
FROM df3
PRINT

Unnamed: 0,col1,col2
0,3,c
1,4,c


<IPython.core.display.Javascript object>

# Added Keywords

## DROP

In [9]:
%%fsql 

-- Remove col2 from df and display the result.
-- NOTE(review): IF EXISTS presumably turns a missing column into a no-op; confirm in the FugueSQL docs.
df4 = DROP COLUMNS col2 IF EXISTS FROM df
PRINT df4

Unnamed: 0,col1
0,1
1,2
2,3
3,4


<IPython.core.display.Javascript object>

## FILL

In [10]:
import numpy as np

# Demo table with missing values: col1 is NaN in its first two rows and
# col2 is NaN in its last row.
null_df = pd.DataFrame(
    {
        "col1": [np.nan, np.nan, 1],
        "col2": [2, 3, np.nan],
    }
)

<IPython.core.display.Javascript object>

In [11]:
%%fsql
-- Replace nulls column by column: NaN in col1 becomes 1 and NaN in col2 becomes 2.
df1 = FILL NULLS PARAMS col1:1, col2:2 FROM null_df
PRINT df1

Unnamed: 0,col1,col2
0,1.0,2.0
1,1.0,3.0
2,1.0,2.0


<IPython.core.display.Javascript object>

## SAMPLE

In [12]:
%%fsql
-- Sample a fixed number of rows; SEED presumably pins the random draw so reruns return the same rows.
df2 = SAMPLE 2 ROWS SEED 42 FROM df
PRINT df2
-- Sample a percentage of the rows instead of a fixed count.
df3 = SAMPLE 50 PERCENT SEED 1 FROM df
PRINT df3

Unnamed: 0,col1,col2
0,2,b
1,4,c


Unnamed: 0,col1,col2
0,4,c
1,3,c


<IPython.core.display.Javascript object>

# Integrate with Python

In [13]:
# schema: *, col3:str
def str_concat(df: pd.DataFrame, delimeter: str) -> pd.DataFrame:
    """Return a copy of ``df`` with an extra string column ``col3``.

    ``col3`` is built row by row as ``str(col1) + delimeter + col2``.
    The input frame is not modified.

    Args:
        df: Frame containing ``col1`` and ``col2``; ``col2`` is concatenated
            as-is, so it is assumed to hold strings.
        delimeter: Text placed between the two values. The spelling is part
            of the keyword interface used by the query cells.

    Returns:
        A new frame with all original columns plus ``col3``.
    """
    col1_as_text = df["col1"].astype(str)
    joined = col1_as_text + delimeter + df["col2"]
    return df.assign(col3=joined)

<IPython.core.display.Javascript object>

In [30]:
%%fsql 
-- Baseline: display df unchanged, for comparison with the transformed output in the next cell.
SELECT * 
FROM df
PRINT

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c
3,4,c


<IPython.core.display.Javascript object>

In [14]:
%%fsql 
-- Apply the Python function str_concat to the SELECT result, passing delimeter as a keyword argument.
SELECT * 
FROM df 
TRANSFORM USING str_concat(delimeter="_")
PRINT

Unnamed: 0,col1,col2,col3
0,1,a,1_a
1,2,b,2_b
2,3,c,3_c
3,4,c,4_c


<IPython.core.display.Javascript object>

# Scale to Big Data

In [18]:
%%fsql spark 
-- Same query as the previous cell; the spark argument after the magic presumably selects Spark as the execution engine.
SELECT * 
FROM df 
TRANSFORM USING str_concat(delimeter="_")
PRINT

Unnamed: 0,col1,col2,col3
0,1,a,1_a
1,2,b,2_b
2,3,c,3_c
3,4,c,4_c


<IPython.core.display.Javascript object>

In [19]:
from typing import Any, Dict, List


# schema: *
def get_median(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Summarise one frame as a single row.

    Args:
        df: Frame containing ``col1`` (numeric) and ``col2``. It must have at
            least one row, because the first ``col2`` value is read by position.

    Returns:
        A one-element list holding a dict with ``col1`` set to the median of
        the ``col1`` column and ``col2`` set to the first ``col2`` value.

    NOTE(review): the median of an even-sized group can be fractional (3.5 for
    the values 3 and 4), yet the recorded output of the query cells shows 3
    for that group. Presumably the ``schema: *`` hint keeps the input column
    types, so the value is cast back to an integer. Confirm whether ``col1``
    should be declared as a floating-point column instead.
    """
    return [{"col1": df["col1"].median(), "col2": df["col2"].iloc[0]}]

<IPython.core.display.Javascript object>

In [20]:
%%fsql spark 
-- PREPARTITION BY col2 groups the rows by col2 before get_median runs, presumably once per group;
-- the displayed result has one row per distinct col2 value.
SELECT * 
FROM df 
TRANSFORM PREPARTITION BY col2 USING get_median
PRINT

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


<IPython.core.display.Javascript object>

# FugueSQL in Production


In [27]:
from fugue_sql import fsql
# NOTE(review): fugue_spark is imported but never referenced; presumably the import
# itself makes the "spark" engine name available to run() — confirm.
import fugue_spark

# Issue the same query as the preceding %%fsql cell from plain Python, without
# the notebook magic. The query text refers to df and get_median by name; these
# are presumably resolved from the calling scope — verify against the fsql docs.
fsql(
    """SELECT * 
        FROM df 
        TRANSFORM PREPARTITION BY col2 USING get_median
        PRINT"""
).run("spark")

Unnamed: 0,col1,col2
0,1,a
1,2,b
2,3,c


DataFrames()

<IPython.core.display.Javascript object>