In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql import functions as F
spark = SparkSession.builder.appName('MCB').getOrCreate()
df = spark.read.csv("TEST.csv", header=True, inferSchema=True)
df.show()

+----------+--------------------+----------+------+---------+----------+--------+------+----------+--------+---------+
|atcStep4Cd|        atcStep4CdNm|    clCdNm|diagYm|insupTpCd|msupUseAmt|recuClCd|sgguCd|  sgguCdNm|sidoCdNm|totUseQty|
+----------+--------------------+----------+------+---------+----------+--------+------+----------+--------+---------+
|     A02BX|Other drugs for p...|  종합병원|202203|        4|   5739020|      11|370600|    영천시|    경북|    58785|
|     A02BX|Other drugs for p...|  종합병원|202203|        7|     11640|      11|370600|    영천시|    경북|      120|
|     A02BX|Other drugs for p...|  종합병원|202203|        5|   1319212|      11|370600|    영천시|    경북|    12108|
|     A02BX|Other drugs for p...|  종합병원|202203|        4|  69725606|      11|310603|수원팔달구|    경기|   503734|
|     A02BX|Other drugs for p...|  종합병원|202203|        5|   7193639|      11|310603|수원팔달구|    경기|    46869|
|     A02BX|Other drugs for p...|  종합병원|202203|        7|    335793|      11|310603|수원팔달구|    경기|

### 다중 분류

In [100]:
from sklearn.datasets import load_iris
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# iris 데이터셋 로드
iris = load_iris()

# SparkSession 생성
spark = SparkSession.builder.appName("IrisExample").getOrCreate()

# iris 데이터셋을 Spark DataFrame으로 변환
df_iris = spark.createDataFrame(
    [(float(x[0]), float(x[1]), float(x[2]), float(x[3]), int(y)) 
     for x, y in zip(iris.data, iris.target)],
    ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
)

# species 컬럼에서 versicolor인 행만 선택
df_versicolor = df_iris.filter(col("species") == 1)
# df_versicolor.printSchema()

# versicolor인 행과 나머지 행을 80:20 비율로 분리하여 train_data와 test_data로 생성
train_data, test_data = df_versicolor.randomSplit([0.8, 0.2], seed=1234)

# features 컬럼 생성
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"], outputCol="features")
train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

# Random Forest 모델 학습
rf = RandomForestClassifier(labelCol="species", featuresCol="features")
model = rf.fit(train_data)

# 테스트 데이터에 대한 예측
predictions = model.transform(test_data)

# # 다중 분류 평가자 생성 및 AUC 계산
evaluator = MulticlassClassificationEvaluator(labelCol="species", metricName="f1")
f1_score = evaluator.evaluate(predictions)
print("F1-Score: {}".format(f1_score))

evaluator = MulticlassClassificationEvaluator(labelCol="species", metricName="weightedPrecision")
precision = evaluator.evaluate(predictions)
print("Precision: {}".format(precision))

evaluator = MulticlassClassificationEvaluator(labelCol="species", metricName="weightedRecall")
recall = evaluator.evaluate(predictions)
print("Recall: {}".format(recall))


F1-Score: 1.0
Precision: 1.0
Recall: 1.0


### 이진분류

In [99]:

from sklearn.datasets import load_breast_cancer
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# breast_cancer 데이터셋 로드
breast_cancer = load_breast_cancer()

# SparkSession 생성
spark = SparkSession.builder.appName("BreastCancerExample").getOrCreate()

# breast_cancer 데이터셋을 Spark DataFrame으로 변환
df_breast_cancer = spark.createDataFrame(
    [(float(x[0]), float(x[1]), float(x[2]), float(x[3]), float(x[4]), 
      float(x[5]), float(x[6]), float(x[7]), float(x[8]), float(x[9]), 
      float(x[10]), float(x[11]), float(x[12]), float(x[13]), float(x[14]), 
      float(x[15]), float(x[16]), float(x[17]), float(x[18]), float(x[19]), 
      float(x[20]), float(x[21]), float(x[22]), float(x[23]), float(x[24]), 
      float(x[25]), float(x[26]), float(x[27]), float(x[28]), float(x[29]), int(y)) 
     for x, y in zip(breast_cancer.data, breast_cancer.target)],
    ["mean_radius", "mean_texture", "mean_perimeter", "mean_area", "mean_smoothness", 
     "mean_compactness", "mean_concavity", "mean_concave_points", "mean_symmetry", "mean_fractal_dimension", 
     "radius_error", "texture_error", "perimeter_error", "area_error", "smoothness_error", 
     "compactness_error", "concavity_error", "concave_points_error", "symmetry_error", "fractal_dimension_error", 
     "worst_radius", "worst_texture", "worst_perimeter", "worst_area", "worst_smoothness", 
     "worst_compactness", "worst_concavity", "worst_concave_points", "worst_symmetry", "worst_fractal_dimension", 
     "label"]
)

# features 컬럼 생성
assembler = VectorAssembler(inputCols=df_breast_cancer.columns[:-1], outputCol="features")
df_breast_cancer = assembler.transform(df_breast_cancer)

# train_data, test_data 생성
train_data, test_data = df_breast_cancer.randomSplit([0.8, 0.2], seed=1234)

# Random Forest 모델 학습
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
model = rf.fit(train_data)

# 테스트 데이터에 대한 예측
predictions = model.transform(test_data)

# 이진 분류 평가자 생성 및 AUC 계산
evaluator = BinaryClassificationEvaluator(labelCol="label")
auc = evaluator.evaluate(predictions)

print("AUC: {}".format(auc))


# 모델 학습 및 예측

# 이진 분류 평가자 생성
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

# areaUnderROC 값 계산
auc_roc = evaluator.evaluate(predictions)
print("areaUnderROC: {}".format(auc_roc))

# 이진 분류 평가자 생성
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderPR")

# areaUnderPR 값 계산
auc_pr = evaluator.evaluate(predictions)
print("areaUnderPR: {}".format(auc_pr))


AUC: 0.9523891966759004
areaUnderROC: 0.9523891966759004
areaUnderPR: 0.9686614406657555
