In [4]:
from pyspark.sql import SparkSession

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import optuna

from giskard import Dataset, Model, scan, testing
import pickle

from minio import Minio
from minio.error import S3Error

import clickhouse_connect

In [3]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process
# environment so the lookups below can see them.
load_dotenv()

# Version components used to build the CatBoost-Spark Maven coordinate.
# Each value is None when the corresponding variable is not set.
SPARK_COMPAT_VERSION = os.environ.get('SPARK_COMPAT_VERSION')
SCALA_COMPAT_VERSION = os.environ.get('SCALA_COMPAT_VERSION')
CATBOOST_SPARK_VERSION = os.environ.get('CATBOOST_SPARK_VERSION')

# ClickHouse connection settings. Values read from the environment are
# strings (or None when unset), including the port.
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST')
CLICKHOUSE_PORT = os.environ.get('CLICKHOUSE_PORT')
CLICKHOUSE_USER = os.environ.get('CLICKHOUSE_USER')
CLICKHOUSE_PASSWORD = os.environ.get('CLICKHOUSE_PASSWORD')

In [5]:
# Build the Spark session (and grab its context).
# The CatBoost-Spark artifact coordinate is assembled from the version
# constants loaded from the environment.
catboost_spark_package = (
    f"ai.catboost:catboost-spark_{SPARK_COMPAT_VERSION}_{SCALA_COMPAT_VERSION}:{CATBOOST_SPARK_VERSION}"
)

spark_builder = SparkSession.builder.master("local[1]")
spark_builder = spark_builder.config("spark.jars.packages", catboost_spark_package)
# Put the ClickHouse native JDBC jar (expected in the working directory)
# on the driver class path.
spark_builder = spark_builder.config(
    "spark.driver.extraClassPath", "./clickhouse-native-jdbc-shaded-2.5.4.jar"
)
spark = spark_builder.getOrCreate()
sc = spark.sparkContext

# Sanity check: add up the whole numbers 0..100 on the cluster.
rdd = sc.parallelize(range(101))
rdd.sum()
# 5050

5050

25/05/05 21:52:55 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
import catboost_spark
from catboost_spark import CatBoostClassifier

In [3]:
# Open a ClickHouse client using the connection settings from the environment.
# CLICKHOUSE_PORT comes from os.getenv and is therefore a string, while
# get_client expects an integer port; convert it here. When the variable is
# unset or empty, pass None so the library picks its default port.
client = clickhouse_connect.get_client(
    host=CLICKHOUSE_HOST,
    port=int(CLICKHOUSE_PORT) if CLICKHOUSE_PORT else None,
    user=CLICKHOUSE_USER,
    password=CLICKHOUSE_PASSWORD,
)

In [6]:
from pyspark.sql import SparkSession

# Maven coordinates of the ClickHouse Spark connector and the client
# libraries requested alongside it.
packages = [
    "com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0",
    "com.clickhouse:clickhouse-client:0.7.0",
    "com.clickhouse:clickhouse-http-client:0.7.0",
    "org.apache.httpcomponents.client5:httpclient5:5.2.1",
]

# NOTE(review): getOrCreate() returns an already running session when one
# exists; in that case "spark.jars.packages" set here is presumably not
# applied. Confirm this cell runs against a fresh kernel, or merge these
# packages into the session created earlier in the notebook.
spark = (SparkSession.builder
         .config("spark.jars.packages", ",".join(packages))
         .getOrCreate())

# Credentials must come from the environment (.env), never from literals in
# the notebook. Fail early with a clear message if they are absent.
# NOTE(review): credentials that were previously committed in this cell
# should be rotated.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("CLICKHOUSE_USER", CLICKHOUSE_USER),
        ("CLICKHOUSE_PASSWORD", CLICKHOUSE_PASSWORD),
    )
    if setting_value is None
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(missing_settings)
    )

# Register a Spark catalog named "clickhouse" backed by the connector.
# Host and HTTP port reuse the environment settings and fall back to the
# previous local defaults when those variables are not set.
clickhouse_catalog_conf = {
    "spark.sql.catalog.clickhouse": "com.clickhouse.spark.ClickHouseCatalog",
    "spark.sql.catalog.clickhouse.host": CLICKHOUSE_HOST or "127.0.0.1",
    "spark.sql.catalog.clickhouse.protocol": "http",
    "spark.sql.catalog.clickhouse.http_port": str(CLICKHOUSE_PORT or "8123"),
    "spark.sql.catalog.clickhouse.user": CLICKHOUSE_USER,
    "spark.sql.catalog.clickhouse.password": CLICKHOUSE_PASSWORD,
    "spark.sql.catalog.clickhouse.database": "credit",
    "spark.clickhouse.write.format": "json",
}
for conf_key, conf_value in clickhouse_catalog_conf.items():
    spark.conf.set(conf_key, conf_value)

# Read the credit table through the catalog and preview it.
df = spark.sql("select * from clickhouse.credit.credit")
df.show()

+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+-------+-------------------+---------+
|age|   sex|job|housing|saving_accounts|checking_account|credit_amount|duration|            purpose|default|        contract_dt|client_id|
+---+------+---+-------+---------------+----------------+-------------+--------+-------------------+-------+-------------------+---------+
| 20|  male|  2|    own|           NULL|          little|         2996|      24|furniture/equipment|      1|2007-05-01 11:17:29|      495|
| 49|female|  2|    own|         little|        moderate|         1092|      12|           radio/TV|      0|2007-05-01 12:41:46|      288|
| 30|female|  3|    own|         little|        moderate|         4795|      36|           radio/TV|      0|2007-05-01 16:42:45|      141|
| 39|female|  1|    own|           NULL|        moderate|          932|       6|          education|      0|2007-05-02 00:28:44|      215|
| 31|  male|  2|    own|   