In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Start (or reuse) the Spark session used by the Titanic classification example.
spark = (
    SparkSession.builder
    .appName("classificationExample1")
    .getOrCreate()
)
# Leave the session as the cell's last expression so the notebook displays it.
spark

In [5]:
# Load the Titanic CSV: the first line is the header and column types are inferred.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("learning_spark_data/titanic.csv")
)

# Quick look at the first 10 rows.
data.show(n=10)

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|Gender| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| NULL|       S|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| NULL|       S|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| NULL|       S|
|          6|       0|     3|    Moran, Mr. James|  male|NULL|    0|    0|      

In [6]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, isnan, when
from pyspark.sql.types import DoubleType, FloatType

# Missing-value audit: count NULL (and, for floating-point columns, NaN) per column.
#
# Spark's `sum` is reached through the `F` namespace instead of being imported by
# name, so the Python builtin `sum` is not shadowed for the rest of the notebook.

# NaN only exists for float/double data. Applying isnan() to string columns forces
# an implicit string-to-double cast (NOTE(review): this may be rejected when ANSI
# SQL mode is enabled - confirm for the Spark version in use).
float_columns = {
    field.name
    for field in data.schema.fields
    if isinstance(field.dataType, (DoubleType, FloatType))
}


def _is_missing(name):
    """Return a boolean Column that is true where column `name` is NULL or NaN.

    The NaN test is only added for float/double columns; every other type is
    checked for NULL alone.
    """
    missing = col(name).isNull()
    if name in float_columns:
        missing = missing | isnan(col(name))
    return missing


# One output row; each output column keeps the name of the input column it summarises.
null_counts = data.select(
    [F.sum(when(_is_missing(c), 1).otherwise(0)).alias(c) for c in data.columns]
)
null_counts.show()

+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Gender|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|     0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+------+---+-----+-----+------+----+-----+--------+



In [9]:
# Feature selection: keep the label ('Survived') plus the candidate predictor columns.
selected_cols = ['Survived', 'Pclass', 'Gender', 'Age', 'SibSp', 'Parch']
data_1 = data.select(*selected_cols)
data_1.show()

+--------+------+------+----+-----+-----+
|Survived|Pclass|Gender| Age|SibSp|Parch|
+--------+------+------+----+-----+-----+
|       0|     3|  male|22.0|    1|    0|
|       1|     1|female|38.0|    1|    0|
|       1|     3|female|26.0|    0|    0|
|       1|     1|female|35.0|    1|    0|
|       0|     3|  male|35.0|    0|    0|
|       0|     3|  male|NULL|    0|    0|
|       0|     1|  male|54.0|    0|    0|
|       0|     3|  male| 2.0|    3|    1|
|       1|     3|female|27.0|    0|    2|
|       1|     2|female|14.0|    1|    0|
|       1|     3|female| 4.0|    1|    1|
|       1|     1|female|58.0|    0|    0|
|       0|     3|  male|20.0|    0|    0|
|       0|     3|  male|39.0|    1|    5|
|       0|     3|female|14.0|    0|    0|
|       1|     2|female|55.0|    0|    0|
|       0|     3|  male| 2.0|    4|    1|
|       1|     2|  male|NULL|    0|    0|
|       0|     3|female|31.0|    1|    0|
|       1|     3|female|NULL|    0|    0|
+--------+------+------+----+-----

In [10]:
# Mean of the Age column; the aggregate yields a single row with a single value.
age_summary = data_1.agg({"Age": "mean"})
mean_age = age_summary.first()[0]
mean_age

29.69911764705882

In [12]:
# Impute missing Age values with the previously computed mean age.
age_fill = {'Age': mean_age}
data_1 = data_1.na.fill(age_fill)
data_1.show(10)

+--------+------+------+-----------------+-----+-----+
|Survived|Pclass|Gender|              Age|SibSp|Parch|
+--------+------+------+-----------------+-----+-----+
|       0|     3|  male|             22.0|    1|    0|
|       1|     1|female|             38.0|    1|    0|
|       1|     3|female|             26.0|    0|    0|
|       1|     1|female|             35.0|    1|    0|
|       0|     3|  male|             35.0|    0|    0|
|       0|     3|  male|29.69911764705882|    0|    0|
|       0|     1|  male|             54.0|    0|    0|
|       0|     3|  male|              2.0|    3|    1|
|       1|     3|female|             27.0|    0|    2|
|       1|     2|female|             14.0|    1|    0|
+--------+------+------+-----------------+-----+-----+
only showing top 10 rows



In [14]:
from pyspark.sql.functions import col, when
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [15]:
# Encode the categorical 'Gender' column as a numeric index column 'SexIndexer'.
indexer = StringIndexer(inputCol='Gender', outputCol='SexIndexer')

# Fit the label-to-index mapping, then append the indexed column to the data.
indexer_model = indexer.fit(data_1)
data_1 = indexer_model.transform(data_1)
data_1.show(5)

+--------+------+------+----+-----+-----+----------+
|Survived|Pclass|Gender| Age|SibSp|Parch|SexIndexer|
+--------+------+------+----+-----+-----+----------+
|       0|     3|  male|22.0|    1|    0|       0.0|
|       1|     1|female|38.0|    1|    0|       1.0|
|       1|     3|female|26.0|    0|    0|       1.0|
|       1|     1|female|35.0|    1|    0|       1.0|
|       0|     3|  male|35.0|    0|    0|       0.0|
+--------+------+------+----+-----+-----+----------+
only showing top 5 rows



In [18]:
# Build the feature vector: combine the predictor columns into one 'features' column.
feature_cols = ['Pclass', 'SexIndexer', 'Age', 'SibSp', 'Parch']

# Note: the parameter name is the plural `inputCols`.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

data_1 = assembler.transform(data_1)
data_1.select('features', 'Survived').show(5)

+--------------------+--------+
|            features|Survived|
+--------------------+--------+
|[3.0,0.0,22.0,1.0...|       0|
|[1.0,1.0,38.0,1.0...|       1|
|[3.0,1.0,26.0,0.0...|       1|
|[1.0,1.0,35.0,1.0...|       1|
|(5,[0,2],[3.0,35.0])|       0|
+--------------------+--------+
only showing top 5 rows



In [19]:
# Split the dataset 80/20 into train and test sets; the fixed seed keeps the
# split repeatable between runs.
train_data, test_data = data_1.randomSplit([0.8, 0.2], seed=42)

# show() prints the rows and returns None. Calling each one as its own statement
# (instead of as a tuple expression) keeps the cell from echoing `(None, None)`.
train_data.show(5)
test_data.show(5)

+--------+------+------+----+-----+-----+----------+--------------------+
|Survived|Pclass|Gender| Age|SibSp|Parch|SexIndexer|            features|
+--------+------+------+----+-----+-----+----------+--------------------+
|       0|     1|female| 2.0|    1|    2|       1.0|[1.0,1.0,2.0,1.0,...|
|       0|     1|female|25.0|    1|    2|       1.0|[1.0,1.0,25.0,1.0...|
|       0|     1|  male|18.0|    1|    0|       0.0|[1.0,0.0,18.0,1.0...|
|       0|     1|  male|19.0|    1|    0|       0.0|[1.0,0.0,19.0,1.0...|
|       0|     1|  male|19.0|    3|    2|       0.0|[1.0,0.0,19.0,3.0...|
+--------+------+------+----+-----+-----+----------+--------------------+
only showing top 5 rows

+--------+------+------+-----------------+-----+-----+----------+--------------------+
|Survived|Pclass|Gender|              Age|SibSp|Parch|SexIndexer|            features|
+--------+------+------+-----------------+-----+-----+----------+--------------------+
|       0|     1|female|             50.0|    0|

(None, None)

In [21]:
# Build the model: logistic regression predicting 'Survived' from 'features'.
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(
    labelCol='Survived',
    featuresCol='features',
)

# Train on the training split, then score the held-out test split.
lr_model = lr.fit(train_data)
predic = lr_model.transform(test_data)

preview_cols = ['features', 'Survived', 'prediction']
predic.select(*preview_cols).show(5)

+--------------------+--------+----------+
|            features|Survived|prediction|
+--------------------+--------+----------+
|[1.0,1.0,50.0,0.0...|       0|       1.0|
|[1.0,0.0,21.0,0.0...|       0|       1.0|
|(5,[0,2],[1.0,24.0])|       0|       1.0|
|(5,[0,2],[1.0,29.0])|       0|       1.0|
|(5,[0,2],[1.0,29....|       0|       1.0|
+--------------------+--------+----------+
only showing top 5 rows



In [22]:
# Confusion-matrix style summary: row count per (true label, predicted label) pair.
confusion_counts = predic.groupBy('Survived', 'prediction').count()
confusion_counts.show()

+--------+----------+-----+
|Survived|prediction|count|
+--------+----------+-----+
|       1|       0.0|   16|
|       0|       0.0|   72|
|       1|       1.0|   45|
|       0|       1.0|   12|
+--------+----------+-----+



In [25]:
from pyspark.sql.functions import expr

# Flag each scored row: 1 when the prediction equals the true label, otherwise 0.
is_correct = expr('case when Survived = prediction then 1 else 0 end')
comp = predic.withColumn('correct', is_correct)

# Number of misclassified test rows.
comp.filter('correct=0').count()

28

In [26]:
# Show only the rows where the prediction differs from the true label.
is_wrong = col('Survived') != col('prediction')
predic.where(is_wrong).show()

+--------+------+------+-----------------+-----+-----+----------+--------------------+--------------------+--------------------+----------+
|Survived|Pclass|Gender|              Age|SibSp|Parch|SexIndexer|            features|       rawPrediction|         probability|prediction|
+--------+------+------+-----------------+-----+-----+----------+--------------------+--------------------+--------------------+----------+
|       0|     1|female|             50.0|    0|    0|       1.0|[1.0,1.0,50.0,0.0...|[-2.0385122150888...|[0.11521831457915...|       1.0|
|       0|     1|  male|             21.0|    0|    1|       0.0|[1.0,0.0,21.0,0.0...|[-0.5036398784530...|[0.37668566590019...|       1.0|
|       0|     1|  male|             24.0|    0|    0|       0.0|(5,[0,2],[1.0,24.0])|[-0.4596486544477...|[0.38706917610046...|       1.0|
|       0|     1|  male|             29.0|    0|    0|       0.0|(5,[0,2],[1.0,29.0])|[-0.2478203388236...|[0.43836006052287...|       1.0|
|       0|     1|  m

In [27]:
# Accuracy: the mean of the 0/1 'correct' flag over the scored test rows.
accuracy_row = comp.selectExpr('avg(correct) as accuracy').first()
accuracy_row['accuracy']

0.8068965517241379

In [30]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Area under the ROC curve for the Titanic predictions.
# The evaluator is bound to `evaluator` rather than `eval` so that the Python
# builtin eval() is not shadowed for the rest of the notebook session.
evaluator = BinaryClassificationEvaluator(
    labelCol='Survived',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderROC',
)

auc = evaluator.evaluate(predic)
auc

0.8634855581576892

In [31]:
# Stop the Spark session created for the first (Titanic) example.
spark.stop()

In [None]:
AUROC: x축이 FPR(거짓 양성 비율), y축이 TPR(참 양성 비율)인 ROC 곡선의 아래 면적. 1에 가까울수록 좋은 모델이며, 0.5는 무작위 추측 수준이다.

# libsvm 파일 형식의 처리
- 텍스트 파일 형식으로, 희소(sparse) 데이터에서 0이 아닌 값만 기록해 간결하게 표현한다 (메모리 절약, 처리 속도 개선 - 머신러닝에서 널리 활용되는 형식)
- 한 줄의 형식: `레이블 인덱스:값 인덱스:값 ...` (인덱스는 행 번호가 아니라 피처(열) 번호이며, 값이 0인 피처는 생략한다)

In [32]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by the LIBSVM classification example.
spark = (
    SparkSession.builder
    .appName("classificationExample2")
    .getOrCreate()
)
# Leave the session as the cell's last expression so the notebook displays it.
spark

In [33]:
# Load the sample dataset through Spark's 'libsvm' data source.
libsvm_reader = spark.read.format('libsvm')
data2 = libsvm_reader.load('learning_spark_data/sample_libsvm_data.txt')

# Row count of the loaded dataset.
data2.count()

100

In [35]:
# Preview the first 3 rows; the printed output shows a 'label' and a 'features' column.
data2.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 3 rows



In [36]:
# Split 70/30 into train and test sets; the fixed seed keeps the split repeatable.
train_data, test_data = data2.randomSplit(
    weights=[0.7, 0.3],
    seed=12,
)

In [37]:
# Logistic regression with elastic-net regularisation, capped at 10 iterations.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Train on the training split, then score the held-out test split.
lrModel = lr.fit(train_data)
pred = lrModel.transform(test_data)
pred.show(10)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[0.95414441393043...|[0.72194788886091...|       0.0|
|  0.0|(692,[100,101,102...|[0.48568919283978...|[0.61909039185191...|       0.0|
|  0.0|(692,[123,124,125...|[1.00961478127974...|[0.73294475454830...|       0.0|
|  0.0|(692,[126,127,128...|[0.90293696106823...|[0.71155267489794...|       0.0|
|  0.0|(692,[126,127,128...|[0.75830339388632...|[0.68098526895593...|       0.0|
|  0.0|(692,[127,128,129...|[0.97194839391749...|[0.72550768363690...|       0.0|
|  0.0|(692,[127,128,129...|[0.88414239930651...|[0.70767989197633...|       0.0|
|  0.0|(692,[127,128,129...|[0.96585465524705...|[0.72429247205628...|       0.0|
|  0.0|(692,[128,129,130...|[0.91063904393044...|[0.71313091310478...|       0.0|
|  0.0|(692,[152

In [38]:
# Area under the ROC curve for the LIBSVM predictions, using the evaluator's
# default label and raw-prediction column names.
# The evaluator is bound to `evaluator` rather than `eval` so that the Python
# builtin eval() is not shadowed for the rest of the notebook session.
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')
auc = evaluator.evaluate(pred)
auc

1.0

In [39]:
# Stop the Spark session created for the second (LIBSVM) example.
spark.stop()