In [14]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('IBM_HR_parc').getOrCreate()
spark

In [15]:
import os 
cwd = os.getcwd()

# CSV 파일 읽기
df = spark.read.format('csv')\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load(os.path.join(cwd, "learning_spark_data/HR-Employee-Attrition.csv"))

df.show(5)

+---+---------+-----------------+---------+--------------------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+--------------------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|   BusinessTravel|DailyRate|          Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|             JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalanc

In [16]:
# df 컬럼 이름, 타입 확인하기 
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Attrition: string (nullable = true)
 |-- BusinessTravel: string (nullable = true)
 |-- DailyRate: integer (nullable = true)
 |-- Department: string (nullable = true)
 |-- DistanceFromHome: integer (nullable = true)
 |-- Education: integer (nullable = true)
 |-- EducationField: string (nullable = true)
 |-- EmployeeCount: integer (nullable = true)
 |-- EmployeeNumber: integer (nullable = true)
 |-- EnvironmentSatisfaction: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- HourlyRate: integer (nullable = true)
 |-- JobInvolvement: integer (nullable = true)
 |-- JobLevel: integer (nullable = true)
 |-- JobRole: string (nullable = true)
 |-- JobSatisfaction: integer (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- MonthlyIncome: integer (nullable = true)
 |-- MonthlyRate: integer (nullable = true)
 |-- NumCompaniesWorked: integer (nullable = true)
 |-- Over18: string (nullable = true)
 |-- OverTime: string 

In [17]:
# 결측치 확인 
from pyspark.sql.functions import col, sum as _sum

null_counts = df.select([
    _sum(col(c).isNull().cast('int')).alias(c) for c in df.columns
])

null_counts.show()

+---+---------+--------------+---------+----------+----------------+---------+--------------+-------------+--------------+-----------------------+------+----------+--------------+--------+-------+---------------+-------------+-------------+-----------+------------------+------+--------+-----------------+-----------------+------------------------+-------------+----------------+-----------------+---------------------+---------------+--------------+------------------+-----------------------+--------------------+
|Age|Attrition|BusinessTravel|DailyRate|Department|DistanceFromHome|Education|EducationField|EmployeeCount|EmployeeNumber|EnvironmentSatisfaction|Gender|HourlyRate|JobInvolvement|JobLevel|JobRole|JobSatisfaction|MaritalStatus|MonthlyIncome|MonthlyRate|NumCompaniesWorked|Over18|OverTime|PercentSalaryHike|PerformanceRating|RelationshipSatisfaction|StandardHours|StockOptionLevel|TotalWorkingYears|TrainingTimesLastYear|WorkLifeBalance|YearsAtCompany|YearsInCurrentRole|YearsSinceLastPr

In [18]:
# DistanceFromHome, EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance, OverTime
# 

In [19]:
# 퇴사 여부별 인원 수 
df.groupBy("Attrition").count().show()


+---------+-----+
|Attrition|count|
+---------+-----+
|       No| 1233|
|      Yes|  237|
+---------+-----+



In [20]:
# 퇴사 여부별 평균 급여
df.groupBy("Attrition").avg("MonthlyIncome").show()

+---------+------------------+
|Attrition|avg(MonthlyIncome)|
+---------+------------------+
|       No| 6832.739659367397|
|      Yes|4787.0928270042195|
+---------+------------------+



In [21]:
# 퇴사여부 + 직무별 평균 환경 만족도 
df.groupBy("Attrition", "JobRole")\
     .avg("EnvironmentSatisfaction")\
     .orderBy("Attrition", "JobRole")\
     .show()

+---------+--------------------+----------------------------+
|Attrition|             JobRole|avg(EnvironmentSatisfaction)|
+---------+--------------------+----------------------------+
|       No|Healthcare Repres...|           2.819672131147541|
|       No|     Human Resources|                       2.675|
|       No|Laboratory Techni...|          2.8223350253807107|
|       No|             Manager|           2.814432989690722|
|       No|Manufacturing Dir...|          2.9407407407407407|
|       No|   Research Director|          2.4871794871794872|
|       No|  Research Scientist|           2.746938775510204|
|       No|     Sales Executive|          2.7323420074349443|
|       No|Sales Representative|                        2.76|
|      Yes|Healthcare Repres...|           2.111111111111111|
|      Yes|     Human Resources|          2.3333333333333335|
|      Yes|Laboratory Techni...|          2.3870967741935485|
|      Yes|             Manager|                         1.8|
|      Y

In [22]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


In [23]:
# encoding
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [24]:
# 타겟()encoding 0/1 no/yes
label_indexer = StringIndexer(inputCol="Attrition", outputCol="label")

### 로지스틱 회귀 

In [26]:
# 컬럼 정리
# 범주형 변수: 문자형 (One-Hot Encoding)
categorical_cols = [
    "BusinessTravel", "Department", "EducationField", "Gender",
    "JobRole", "MaritalStatus", "OverTime"
]

# 수치형 변수
numeric_cols = [
    "Age", "DistanceFromHome", "Education", "EnvironmentSatisfaction",
    "HourlyRate", "JobInvolvement", "JobLevel", "JobSatisfaction",
    "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
    "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction",
    "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
    "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
    "YearsSinceLastPromotion", "YearsWithCurrManager"
]

# 제거할 컬럼
drop_cols = ["EmployeeNumber", "Over18", "EmployeeCount", "StandardHours"]


In [27]:
# 불필요한 컬럼 제거 
hr_df_cleaned = df.drop(*drop_cols)

In [28]:
# 3. 범주형 인코딩: StringIndexer → OneHotEncoder
indexers = [StringIndexer(inputCol=col, outputCol=col + "_idx", handleInvalid='keep') for col in categorical_cols]
encoders = [OneHotEncoder(inputCols=[col + "_idx"], outputCols=[col + "_ohe"]) for col in categorical_cols]

# 4. 특성 벡터화
assembler_inputs = [col + "_ohe" for col in categorical_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 5. 로지스틱 회귀 모델
lr = LogisticRegression(featuresCol="features", labelCol="label")

# 6. 전체 파이프라인 구성
pipeline = Pipeline(stages=indexers + encoders + [label_indexer, assembler, lr])

# 7. 데이터 분할
train_df, test_df = hr_df_cleaned.randomSplit([0.8, 0.2], seed=42)

# 8. 파이프라인 학습
model = pipeline.fit(train_df)

# 9. 예측
predictions = model.transform(test_df)

# 10. 결과 확인
predictions.select("label", "prediction", "probability").show(5, truncate=False)

+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |0.0       |[0.9406316051411292,0.0593683948588708] |
|1.0  |1.0       |[0.03785200103867787,0.9621479989613221]|
|0.0  |0.0       |[0.6543800334415575,0.3456199665584425] |
|1.0  |0.0       |[0.7058231958589236,0.2941768041410764] |
|0.0  |1.0       |[0.34369991611098155,0.6563000838890185]|
+-----+----------+----------------------------------------+
only showing top 5 rows



In [29]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# === 평가자 설정 ===
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
evaluator_precision = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")
evaluator_recall = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")

# === 결과 출력 ===
print("=== 모델 평가 결과 ===")
print(f"AUC: {evaluator_auc.evaluate(predictions):.4f}")
print(f"F1 Score: {evaluator_f1.evaluate(predictions):.4f}")
print(f"Precision: {evaluator_precision.evaluate(predictions):.4f}")
print(f"Recall: {evaluator_recall.evaluate(predictions):.4f}")


=== 모델 평가 결과 ===
AUC: 0.8046
F1 Score: 0.8657
Precision: 0.8766
Recall: 0.8819


### Using Random Forest 

In [30]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol="features", labelCol="label", numTrees=100)
pipeline = Pipeline(stages=indexers + encoders + [label_indexer, assembler, rf])

model = pipeline.fit(train_df)
predictions = model.transform(test_df)

predictions.select("label", "prediction", "probability").show(5, truncate=False)


+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|0.0  |0.0       |[0.5462181236385757,0.4537818763614243] |
|1.0  |1.0       |[0.22133785436158349,0.7786621456384166]|
|0.0  |1.0       |[0.4613048732895997,0.5386951267104003] |
|1.0  |0.0       |[0.517420729681636,0.48257927031836395] |
|0.0  |0.0       |[0.7673126688201038,0.2326873311798961] |
+-----+----------+----------------------------------------+
only showing top 5 rows



In [31]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"AUC: {auc:.4f}")


AUC: 0.8064


In [32]:
# 7. Feature Importance 출력
rf_model = model.stages[-1]  # 파이프라인의 마지막 stage가 RandomForestClassifier
importances = rf_model.featureImportances

# Feature 이름 추출
feature_names = assembler.getInputCols()

# 중요도 매핑
feature_importance_list = list(zip(feature_names, importances.toArray()))
sorted_importance = sorted(feature_importance_list, key=lambda x: x[1], reverse=True)
print("\nFeature Importances:")
for feature, importance in sorted_importance:
    print(f"{feature:30} → {importance:.4f}")


Feature Importances:
YearsSinceLastPromotion        → 0.0859
YearsInCurrentRole             → 0.0740
YearsWithCurrManager           → 0.0697
WorkLifeBalance                → 0.0184
Department_ohe                 → 0.0173
RelationshipSatisfaction       → 0.0143
JobRole_ohe                    → 0.0096
Gender_ohe                     → 0.0088
YearsAtCompany                 → 0.0085
Education                      → 0.0084
MonthlyRate                    → 0.0081
DistanceFromHome               → 0.0076
MonthlyIncome                  → 0.0066
JobSatisfaction                → 0.0051
OverTime_ohe                   → 0.0041
JobLevel                       → 0.0040
JobInvolvement                 → 0.0040
TrainingTimesLastYear          → 0.0040
TotalWorkingYears              → 0.0039
HourlyRate                     → 0.0035
MaritalStatus_ohe              → 0.0025
Age                            → 0.0024
StockOptionLevel               → 0.0024
EducationField_ohe             → 0.0023
NumCompaniesWorked

###  Improved logistic regression
중요도 매핑을 사용해 컬럼 선택 -> 새로운 모델 생성


In [33]:
from pyspark.sql.functions import when, col
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1. 제거할 컬럼
drop_cols = ["EmployeeNumber", "Over18", "EmployeeCount", "StandardHours"]
df_new = df.drop(*drop_cols)

# 2. Label 생성 (Yes → 1, No → 0)
df_new = df_new.withColumn("label", when(col("Attrition") == "Yes", 1).otherwise(0))

# 3. 중요 Feature 목록
important_features = [
    "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager",
    "WorkLifeBalance", "Department", "MonthlyRate", "RelationshipSatisfaction",
    "MonthlyIncome", "JobRole", "Gender", "Education", "DistanceFromHome"
]

# 4. 범주형 vs 수치형 구분
categorical_cols = ["Department", "JobRole", "Gender"]
numeric_cols = [col for col in important_features if col not in categorical_cols]

# 5. Train/Test Split
train_df, test_df = df_new.randomSplit([0.7, 0.3], seed=42)

# 6. Indexing + OneHotEncoding
indexers = [StringIndexer(inputCol=c, outputCol=c + "_idx", handleInvalid='keep') for c in categorical_cols]
encoders = [OneHotEncoder(inputCols=[c + "_idx"], outputCols=[c + "_ohe"]) for c in categorical_cols]

# 7. Assemble features
assembler_inputs = [c + "_ohe" for c in categorical_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# 8. Logistic Regression
lr = LogisticRegression(featuresCol="features", labelCol="label")

# 9. Pipeline 구성
pipeline = Pipeline(stages=indexers + encoders + [assembler, lr])

# 10. 학습 및 예측
model = pipeline.fit(train_df)
predictions = model.transform(test_df)

# 11. 평가
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"AUC with selected features: {auc:.4f}")


AUC with selected features: 0.7235
