# Pyspark - Catboost
- PySpark 환경에서 CatBoost 를 사용하는 예제입니다.
- Colab 에서 작동합니다.

In [1]:
# First, mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install Java (OpenJDK 8, headless package); stdout is discarded via /dev/null.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null 

In [3]:
# spark 설치
!wget -q https://dlcdn.apache.org/spark/spark-3.2.2/spark-3.2.2-bin-hadoop2.7.tgz

In [4]:
# Extract the downloaded Spark archive into the current directory.
!tar xf spark-3.2.2-bin-hadoop2.7.tgz

In [5]:
# catboost 와 findsaprk 설치 
!pip install -q findspark==1.4.2 catboost==1.0.3

[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
[?25h

In [8]:
# Set JAVA_HOME and SPARK_HOME to the installed JDK and the extracted Spark
# distribution; findspark.init() is called only after both variables are set.
import os
import findspark

os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.2-bin-hadoop2.7",
    }
)
findspark.init()

# spark-submit 사용

In [1]:
%%writefile example.py
# Minimal PySpark job: obtains (or creates) a SparkSession, distributes a small
# list of integers as an RDD, and prints how many elements it contains.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("appName").getOrCreate()
sc = spark.sparkContext
rdd = sc.parallelize([1,2,3,4,5,6,7])
print(rdd.count())

Writing example.py


In [None]:
bin/spark-submit /content/example.py

In [None]:
# Run the same script inside the notebook kernel. `-i` executes it in the
# interactive namespace, so `spark`, `sc` and `rdd` stay defined afterwards.
%run -i example.py

In [11]:
!bash

bash: cannot set terminal process group (538): Inappropriate ioctl for device
bash: no job control in this shell
[01;34m/content[00m# ls
[0m[01;34mdrive[0m       [01;34msample_data[0m                [01;31mspark-3.2.2-bin-hadoop2.7.tgz[0m
example.py  [01;34mspark-3.2.2-bin-hadoop2.7[0m
[01;34m/content[00m# cd spark-3.2.2-bin-hadoop2.7
[01;34m/content/spark-3.2.2-bin-hadoop2.7[00m# ls
[0m[01;34mbin[0m   [01;34mdata[0m      [01;34mjars[0m        LICENSE   NOTICE  [01;34mR[0m          RELEASE  [01;34myarn[0m
[01;34mconf[0m  [01;34mexamples[0m  [01;34mkubernetes[0m  [01;34mlicenses[0m  [01;34mpython[0m  README.md  [01;34msbin[0m
[01;34m/content/spark-3.2.2-bin-hadoop2.7[00m# bin/spark-submit
Usage: spark-submit [options] <app jar | python file | R file> [app arguments]
Usage: spark-submit --kill [submission ID] --master [spark://...]
Usage: spark-submit --status [submission ID] --master [spark://...]
Usage: spark-submit run-example [options] example-

In [None]:
# Import the required PySpark modules.
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import StructField, StructType

# Web UI Setting

In [None]:
# Download and unpack ngrok. `-nc` (wget) and `-n` (unzip) skip files that
# already exist, so the cell can be re-run safely.
!wget -qnc https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip
!unzip -n -q ngrok-stable-linux-amd64.zip

In [None]:
# Start an ngrok HTTP tunnel to local port 4050 as a background process.
# NOTE(review): presumably 4050 is meant to be the Spark UI port (spark.ui.port);
# no cell shown here configures it — confirm the SparkSession uses that port.
# NOTE(review): the authtoken is registered in a later cell; confirm ngrok still
# picks it up when it is started before that cell has run.
get_ipython().system_raw('./ngrok http 4050 &')

In [None]:
# Make the Drive folder that contains the `config` module importable, then
# import the `config` object from it.
# The membership check keeps sys.path free of duplicate entries when the cell
# is re-run.
import sys

ANALYSIS_DIR = '/content/drive/MyDrive/Analysis'
if ANALYSIS_DIR not in sys.path:
    sys.path.append(ANALYSIS_DIR)

from config import config

In [None]:
# Register the ngrok authtoken (obtained from the ngrok site). It is read from
# config.NGROK_TOKEN so the secret is not written into the notebook.
!./ngrok authtoken {config.NGROK_TOKEN}

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
# Query the local tunnels endpoint (presumably ngrok's local API — confirm) and
# print the public URL: grep keeps only the text that follows `public_url":"`,
# must start with "https", and runs up to the next double quote.
!curl -s http://localhost:4040/api/tunnels | grep -Po 'public_url":"(?=https)\K[^"]*'

https://493d-35-196-68-137.ngrok.io
