In [1]:
# load library

import os, glob
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt

import ibis
import ibis.selectors as s
from ibis.interactive import *

In [2]:
# Notebook-wide options

import warnings

from IPython.core.interactiveshell import InteractiveShell

# Render ibis expressions as results when they are displayed.
ibis.options.interactive = True

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Show the value of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

#### pandas 대비 장점 확인

In [3]:
# List the files in the working directory (long format, human-readable sizes).
%ls -lh

total 49654376
-rw-r--r--@ 1 hooseonlee  staff   155K Dec 18 15:35 (기술공통) DATA Engineering_커리큘럼.pdf
-rwx------@ 1 hooseonlee  staff   199K Dec 23 09:59 [31m00_python_training.ipynb[m[m*
-rw-r--r--@ 1 hooseonlee  staff    41K Jan  1 16:01 01_pandas_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff   142K Jan  4 15:25 02_pandas_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff    37K Jan  7 13:29 03_numpy_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff    16K Jan  7 14:58 04_numpy_training.ipynb
-rwx------@ 1 hooseonlee  staff   202K Jan 10 18:10 [31m05_ibis_training.ipynb[m[m*
-rwx------@ 1 hooseonlee  staff   498K Dec 21 19:26 [31m06_ibis_training.ipynb[m[m*
-rwx------@ 1 hooseonlee  staff    68K May 10  2023 [31m08_크롤링_인터넷.ipynb[m[m*
-rw-r--r--@ 1 hooseonlee  staff   2.5K Jan  1 15:27 100_my_pandas.py
-rw-r--r--@ 1 hooseonlee  staff   606B Dec 29 13:40 101_my_ibis.py
-rw-r--r--@ 1 hooseonlee  staff   1.4K Dec 29 20:35 102_my_ibis.py
-rwx------@ 1 hooseonlee

#### pandas는 다운되지만 ibis는 읽습니다.

In [4]:
# Read the CSV file through ibis and preview its first five rows.
taxi = ibis.read_csv("taxi.csv")
taxi.head(n=5)

In [5]:
# Total number of rows, followed by the number of rows per vendor.
taxi.count()
taxi.group_by(taxi.vendor_name).count()

┌───────────┐
│ [1;36m139372677[0m │
└───────────┘

#### database에 적재 및 운영

In [6]:
# Alternative (left disabled): connect to an in-memory database instead of a file.
# con = ibis.duckdb.connect()

In [7]:
# local database 생성 및 확인

con = ibis.duckdb.connect("mydb.duckdb")

%ls -lh

total 49654400
-rw-r--r--@ 1 hooseonlee  staff   155K Dec 18 15:35 (기술공통) DATA Engineering_커리큘럼.pdf
-rwx------@ 1 hooseonlee  staff   199K Dec 23 09:59 [31m00_python_training.ipynb[m[m*
-rw-r--r--@ 1 hooseonlee  staff    41K Jan  1 16:01 01_pandas_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff   142K Jan  4 15:25 02_pandas_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff    37K Jan  7 13:29 03_numpy_training.ipynb
-rw-r--r--@ 1 hooseonlee  staff    16K Jan  7 14:58 04_numpy_training.ipynb
-rwx------@ 1 hooseonlee  staff   202K Jan 10 18:10 [31m05_ibis_training.ipynb[m[m*
-rwx------@ 1 hooseonlee  staff   498K Dec 21 19:26 [31m06_ibis_training.ipynb[m[m*
-rwx------@ 1 hooseonlee  staff    68K May 10  2023 [31m08_크롤링_인터넷.ipynb[m[m*
-rw-r--r--@ 1 hooseonlee  staff   2.5K Jan  1 15:27 100_my_pandas.py
-rw-r--r--@ 1 hooseonlee  staff   606B Dec 29 13:40 101_my_ibis.py
-rw-r--r--@ 1 hooseonlee  staff   1.4K Dec 29 20:35 102_my_ibis.py
-rwx------@ 1 hooseonlee

#### taxi 데이터 업로드

In [8]:
# List the tables currently present in the connected database.
con.list_tables()

[]

In [9]:
# Register the CSV file with the connection, then materialise it as the
# table "taxi" inside the database.
table = con.read_csv("taxi.csv")

# overwrite=True keeps this cell re-runnable: "mydb.duckdb" is a file on disk,
# so after the first run the "taxi" table already exists and a plain
# create_table() call would raise instead of reloading the data.
con.create_table("taxi", table, overwrite=True)

# Confirm that the table is now listed.
con.list_tables()

['ibis_read_csv_hpduhgazpvhdlp5phbzwxazjxu', 'taxi']

In [10]:
# Load a specific table from the database by name.

df = con.table('taxi')

df

#### 빠른 실행 확인, 파일로 실행했을 때 30초 정도 소요되었음

In [11]:
# Run both queries against `df`, the table stored in the database, so they can
# be compared with the same queries run earlier on the CSV-backed `taxi`
# expression. The original cell grouped `taxi` here, which queried the CSV
# file again rather than the database table.
df.count()

df.group_by("vendor_name").count()

┌───────────┐
│ [1;36m139372677[0m │
└───────────┘

#### Database vs. PC, 쿼리 vs. 결과

In [12]:
# execute(): the first line is only a query expression (database side);
# the second runs it and brings the result to the user's machine.

type(df.head())
type(df.head().execute())

ibis.expr.types.relations.Table

pandas.core.frame.DataFrame

#### 테이블 관련 명령어 몇 개만 실행

In [13]:
# Column names of the table.
df.columns

('vendor_name',
 'pickup_datetime',
 'dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'pickup_longitude',
 'pickup_latitude',
 'rate_code',
 'store_and_fwd',
 'dropoff_longitude',
 'dropoff_latitude',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'total_amount',
 'improvement_surcharge',
 'month')

In [14]:
# Further table operations can be chained freely.

# Number of rows per vendor.
df.group_by('vendor_name').count()

# Number of rows per payment type for vendor "CMT", largest group first.
# The count is given an explicit name ("n_trips") so that the sort key does not
# depend on the auto-generated column name ("CountStar()"), which is an
# implementation detail and may differ between ibis versions.
(
    df.filter(_.vendor_name == "CMT")
    .group_by("payment_type")
    .agg(n_trips=_.count())
    .order_by(ibis.desc("n_trips"))
)

In [15]:
# Group by vendor and payment type, summarise the fare amount, sort by vendor.

(
    df.group_by(["vendor_name", "payment_type"])
    .agg(
        mean_fare_amount=_.fare_amount.mean(),
        sum_fare_amount=_.fare_amount.sum(),
        count=_.vendor_name.count(),
    )
    .order_by("vendor_name")
)
  

In [16]:
# Column selection

# By explicit column names.
df.select("vendor_name", "payment_type")

# Columns whose name contains "amount".
df.select(s.contains("amount"))

# One named column plus every numeric column.
df.select("vendor_name", s.numeric())

# Every column whose name does NOT match the pattern "amount".
df.select(~s.matches("amount"))

In [17]:
# Create a derived column, then compute summary statistics.

# Exact miles-to-kilometres factor, replacing the rounded magic number 1.6.
# NOTE(review): assumes trip_distance is recorded in miles — confirm against
# the data dictionary of the source file.
KM_PER_MILE = 1.609344

(
    df.mutate(km=_.trip_distance * KM_PER_MILE)
    .select("trip_distance", "km")
)

# Attribute access and bracket access refer to the same column.
df.trip_distance.mean()
df["trip_distance"].mean()

# Several aggregates at once; .name() sets the name of the resulting column.
df.agg([df.trip_distance.mean(), df.trip_distance.min(), df.trip_distance.max().name("max_distance")])

# Distinct combinations of vendor and payment type.
df[["vendor_name", "payment_type"]].distinct()

┌──────────┐
│ [1;36m2.448804[0m │
└──────────┘

┌──────────┐
│ [1;36m2.448804[0m │
└──────────┘

#### sql 문법과 비교

In [18]:
# Total and mean fare amount for vendor "VST", built as an ibis expression.
df_filtered = (
    df.filter(_.vendor_name == "VST")
    .agg([_.fare_amount.sum(), _.fare_amount.mean()])
)

# Show the SQL that the expression compiles to.
ibis.to_sql(df_filtered)

```sql
SELECT
  SUM("t1"."fare_amount") AS "Sum(fare_amount)",
  AVG("t1"."fare_amount") AS "Mean(fare_amount)"
FROM (
  SELECT
    *
  FROM "taxi" AS "t0"
  WHERE
    "t0"."vendor_name" = 'VST'
) AS "t1"
```

#### 파일로 내보내기

In [19]:
# The aggregated result is still an ibis table expression.
type(df_filtered)

# Write the result of the expression to a CSV file.
con.to_csv(df_filtered, "tmp.csv")

# Check that the file was written: full directory listing, then CSV files only.
os.listdir()
glob.glob("*.csv")

ibis.expr.types.relations.Table

['.Rhistory',
 '(기술공통) DATA Engineering_커리큘럼.pdf',
 '연습장.ipynb',
 '.DS_Store',
 '04_numpy_training.ipynb',
 '101_my_ibis.py',
 '02_pandas_training.ipynb',
 'old',
 '99_ibis_training_3_big data.ipynb',
 '08_크롤링_인터넷.ipynb',
 'mydb.duckdb',
 'tmp.csv',
 '03_numpy_training.ipynb',
 '102_my_ibis.py',
 '06_ibis_training.ipynb',
 '05_ibis_training.ipynb',
 '.Rapp.history',
 'taxi.csv',
 '강의_Python.key',
 '01_pandas_training.ipynb',
 '00_python_training.ipynb',
 '100_my_pandas.py']

['tmp.csv', 'taxi.csv']

#### 원격 데이터베이스 연결

In [20]:
# Connect to a remote PostgreSQL server (example, left disabled).
# NOTE(review): the values below are placeholders. Do not commit real
# credentials; read them from environment variables or a prompt instead.
# conn = ibis.postgres.connect(
#     host = 'your-server.com',        # or an IP address (e.g. '192.168.1.100')
#     port = 5432,                     # provided by the server administrator (default is 5432)
#     user = 'postgres',               # provided by the server administrator
#     password = '1234',               # provided by the server administrator
#     database = 'postgres'            # name of the database to connect to
# )

# Check the available databases and tables
# dbs = conn.list_databases()
# print(dbs)

# tables = conn.list_tables()
# print(tables)