In [1]:
import ibis
# Import only the deferred-expression placeholder rather than `from ibis import *`:
# the wildcard floods the notebook namespace and hides where `_` comes from.
from ibis import _

## Get a DuckDB connection

In [2]:
# No database path is passed; per the ibis DuckDB backend docs this defaults to an
# in-memory database, so nothing is persisted once the kernel stops.
con = ibis.duckdb.connect()

## Get some data into tables

In [3]:
# Register each parquet file with the DuckDB connection under an explicit table
# name, keeping the returned table expression for the rest of the notebook.
ratings = con.read_parquet(
    "data/imdb_title_ratings.parquet",
    table_name="ratings",
)
basics = con.read_parquet(
    "data/imdb_title_basics.parquet",
    table_name="basics",
)

In [4]:
con.list_tables()

['basics', 'ratings']

In [5]:
ratings

In [6]:
basics

In [7]:
ratings.execute(limit=10)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2036
1,tt0000002,5.7,272
2,tt0000003,6.5,1985
3,tt0000004,5.4,178
4,tt0000005,6.2,2745
5,tt0000006,5.0,183
6,tt0000007,5.4,852
7,tt0000008,5.4,2181
8,tt0000009,5.3,209
9,tt0000010,6.8,7506


In [8]:
basics.to_pandas(limit=10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,,45,Romance
9,tt0000010,short,Leaving the Factory,La sortie de l'usine Lumière à Lyon,0,1895,,1,"Documentary,Short"


In [9]:
ratings.to_pyarrow(limit=10)

pyarrow.Table
tconst: string
averageRating: string
numVotes: string
----
tconst: [["tt0000001","tt0000002","tt0000003","tt0000004","tt0000005","tt0000006","tt0000007","tt0000008","tt0000009","tt0000010"]]
averageRating: [["5.7","5.7","6.5","5.4","6.2","5.0","5.4","5.4","5.3","6.8"]]
numVotes: [["2036","272","1985","178","2745","183","852","2181","209","7506"]]

In [10]:
# Requires the optional `polars` package to be installed; uncomment to try it.
# basics.to_polars()

## Properly named columns and interactive mode

In [11]:
# Interactive mode: displaying an expression executes it and shows a preview of
# the result instead of the unexecuted expression.
ibis.options.interactive = True

In [12]:
# "snake_case" renames every column in one call, e.g. primaryTitle -> primary_title.
basics = basics.rename("snake_case")

In [13]:
# Same conversion for ratings: averageRating -> average_rating, numVotes -> num_votes.
ratings = ratings.rename("snake_case")

In [14]:
ratings

In [15]:
basics

## Casting

In [16]:
ratings.select(avg_rating=ratings.average_rating.cast("float"))

In [17]:
ratings.select(
    avg_rating=ratings.average_rating.cast("float"),
    num_votes=ratings.num_votes.cast("int"),
)

In [18]:
# Persist the casts: keep the join key and expose the two string columns as numbers.
# NOTE: this rebinds `ratings`, so the cell is not re-runnable on its own -- once it
# has run, `ratings` no longer has an `average_rating` column to cast.
ratings = ratings.select(
    _.tconst,
    avg_rating=_.average_rating.cast("float"),
    num_votes=_.num_votes.cast("int"),
)

## `to_sql()`

In [19]:
ibis.to_sql(ratings)

```sql
SELECT
  "t0"."tconst",
  CAST("t0"."averageRating" AS DOUBLE) AS "avg_rating",
  CAST("t0"."numVotes" AS BIGINT) AS "num_votes"
FROM "ratings" AS "t0"
```

In [20]:
# Same expression rendered for another dialect: the casts use that dialect's own
# type names (REAL / INTEGER here, versus DOUBLE / BIGINT in the default output).
ibis.to_sql(ratings, dialect="sqlite")

```sql
SELECT
  "t0"."tconst",
  CAST("t0"."averageRating" AS REAL) AS "avg_rating",
  CAST("t0"."numVotes" AS INTEGER) AS "num_votes"
FROM "ratings" AS "t0"
```

In [21]:
ibis.options.interactive = False

## `.sql`

In [22]:
# Raw SQL against the registered "ratings" table. Interactive mode was switched
# off in the previous cell, so this displays the expression rather than rows.
ratings_as_text_sql = """
        SELECT
        "t0"."tconst",
        CAST("t0"."averageRating" AS VARCHAR) AS "avg_rating",
        CAST("t0"."numVotes" AS VARCHAR) AS "num_votes"
        FROM "ratings" AS "t0"
    """
con.sql(ratings_as_text_sql)

In [23]:
# Same raw SQL, this time explicitly executed so the rows are materialised.
# NOTE(review): no `limit` is passed, so the whole result is fetched (about 1.4M
# rows according to the recorded output); other cells in this notebook use limit=10.
ratings_as_text_sql = """
        SELECT
        "t0"."tconst",
        CAST("t0"."averageRating" AS VARCHAR) AS "avg_rating",
        CAST("t0"."numVotes" AS VARCHAR) AS "num_votes"
        FROM "ratings" AS "t0"
    """
con.sql(ratings_as_text_sql).execute()

Unnamed: 0,tconst,avg_rating,num_votes
0,tt0000001,5.7,2036
1,tt0000002,5.7,272
2,tt0000003,6.5,1985
3,tt0000004,5.4,178
4,tt0000005,6.2,2745
...,...,...,...
1416615,tt9916730,7.0,12
1416616,tt9916766,7.1,23
1416617,tt9916778,7.2,36
1416618,tt9916840,6.7,8


In [24]:
ibis.options.interactive = True

In [25]:
# Same raw SQL once more, now with interactive mode back on, so displaying the
# expression runs the query and shows a preview.
ratings_as_text_sql = """
        SELECT
        "t0"."tconst",
        CAST("t0"."averageRating" AS VARCHAR) AS "avg_rating",
        CAST("t0"."numVotes" AS VARCHAR) AS "num_votes"
        FROM "ratings" AS "t0"
    """
con.sql(ratings_as_text_sql)

## Other basic operations

In [26]:
basics.columns

['tconst',
 'title_type',
 'primary_title',
 'original_title',
 'is_adult',
 'start_year',
 'end_year',
 'runtime_minutes',
 'genres']

In [27]:
basics.title_type.value_counts()

In [28]:
basics.is_adult.value_counts()

In [29]:
# Narrow `basics` to non-adult movies and keep only the join key and the title.
# NOTE: `basics` is rebound here, so re-running this cell fails -- the columns used
# in the filter (`title_type`, `is_adult`) are dropped by the select.
basics = (
    basics
    .filter([basics.title_type == "movie", basics.is_adult == 0])
    .select("tconst", "primary_title")
)


In [30]:
basics

In [31]:
basics.join(ratings, "tconst").execute(limit=10)

Unnamed: 0,tconst,primary_title,avg_rating,num_votes
0,tt0378071,Hum Tere Ashiq Hain,6.3,32
1,tt0378072,Hum Tum,7.0,15065
2,tt0378078,Once Upon a Time in the Battlefield,5.7,302
3,tt0378088,Impakto,6.0,37
4,tt0378109,Into the Blue,5.9,79351
5,tt0378117,Istokwa,7.0,7
6,tt0378118,Ivar,4.6,71
7,tt0378120,Jabidah Massacre,7.8,8
8,tt0378124,Jawani Ke Gunah,5.7,13
9,tt0378147,El nüremberg argentino,6.9,23


In [32]:
basics.join(ratings, "tconst").order_by(_.avg_rating.desc())

In [33]:
basics.join(ratings, "tconst").order_by(_.avg_rating.desc()).filter(_.num_votes > 1e6)

## `join` tables

In [34]:
# Movies joined to their ratings on `tconst`, best-rated first, restricted to
# titles with more than one million votes.
topfilms = (
    basics.join(ratings, "tconst")
    .order_by(_.avg_rating.desc())
    .filter(_.num_votes > 1e6)
)

In [35]:
ibis.options.interactive = False

In [36]:
topfilms

In [37]:
topfilms.execute(limit=10)

Unnamed: 0,tconst,primary_title,avg_rating,num_votes
0,tt0111161,The Shawshank Redemption,9.3,2873242
1,tt0068646,The Godfather,9.2,2001286
2,tt0071562,The Godfather Part II,9.0,1357703
3,tt0108052,Schindler's List,9.0,1443212
4,tt0468569,The Dark Knight,9.0,2855626
5,tt0167260,The Lord of the Rings: The Return of the King,9.0,1968792
6,tt0110912,Pulp Fiction,8.9,2207397
7,tt0120737,The Lord of the Rings: The Fellowship of the Ring,8.9,1996832
8,tt0109830,Forrest Gump,8.8,2243918
9,tt1375666,Inception,8.8,2536542


## UDFs

In [38]:
@ibis.udf.scalar.builtin
def jaccard(s1: str, s2: str) -> float:
    """Declare the backend's built-in ``jaccard`` function for use in expressions.

    The body is intentionally a stub: the function is registered as a builtin
    UDF, and the SQL generated for it calls ``JACCARD(s1, s2)`` directly.
    Takes two strings and yields a float score.
    """
    ...

In [39]:
expr = jaccard(topfilms.primary_title, "The Godmother")

In [40]:
expr.execute(limit=10)

0    0.421053
1    0.750000
2    0.400000
3    0.352941
4    0.642857
5    0.333333
6    0.166667
7    0.363636
8    0.176471
9    0.421053
Name: jaccard_0(primary_title, 'The Godmother'), dtype: float64

In [41]:
ibis.to_sql(expr)

```sql
SELECT
  JACCARD("t6"."primary_title", 'The Godmother') AS "jaccard_0(primary_title, 'The Godmother')"
FROM (
  SELECT
    "t4"."tconst",
    "t4"."primary_title",
    "t5"."avg_rating",
    "t5"."num_votes"
  FROM (
    SELECT
      "t0"."tconst",
      "t0"."primaryTitle" AS "primary_title"
    FROM "basics" AS "t0"
    WHERE
      "t0"."titleType" = 'movie' AND "t0"."isAdult" = CAST(0 AS TINYINT)
  ) AS "t4"
  INNER JOIN (
    SELECT
      "t1"."tconst",
      CAST("t1"."averageRating" AS DOUBLE) AS "avg_rating",
      CAST("t1"."numVotes" AS BIGINT) AS "num_votes"
    FROM "ratings" AS "t1"
  ) AS "t5"
    ON "t4"."tconst" = "t5"."tconst"
) AS "t6"
WHERE
  "t6"."num_votes" > CAST(1000000.0 AS DOUBLE)
ORDER BY
  "t6"."avg_rating" DESC
```

In [42]:
# Score each top film's title against the query string and sort the best match first.
expr2 = (
    topfilms
    .select(_.primary_title, jaccard=jaccard(_.primary_title, "The Gothfather"))
    .order_by(_.jaccard.desc())
)

In [43]:
expr2.execute(limit=10)

Unnamed: 0,primary_title,jaccard
0,The Godfather,0.909091
1,The Godfather Part II,0.769231
2,The Wolf of Wall Street,0.692308
3,The Matrix,0.538462
4,The Departed,0.538462
5,V for Vendetta,0.538462
6,Guardians of the Galaxy,0.5
7,Raiders of the Lost Ark,0.470588
8,Back to the Future,0.466667
9,The Truman Show,0.466667


## Connect from other backends

In [44]:
# con2 = ibis.postgres.connect(port=5438, user="postgres", password="postgres")  # TODO: adjust the port to match your local Postgres instance

In [45]:
#con2.execute(topfilms, limit=10)

In [46]:
# TODO: decide which additional backend to demonstrate here (SQLite or MySQL), e.g.:
# con3 = ibis.sqlite.connect("")

In [47]:
#con3.execute(topfilms, limit=10)