In [8]:
# 필요한 라이브러리 임포트
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [17]:
# Read the training set, the test set, and the sample submission file, in that order.
data, test_data, sub = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', './sample_submission.csv')
)

In [3]:
# Label encoding: replace each categorical column with its integer category codes.
for column_name in ('preferred_difficulty_level', 'subscription_type'):
    data[column_name] = data[column_name].astype('category').cat.codes

In [5]:
# Separate the label column from the model inputs.
# The identifier, the label itself, and the columns listed below are left out of the inputs.
excluded_columns = [
    'user_id', 'target', 'subscription_duration', 'recent_login_time',
    'average_login_time', 'total_completed_courses',
    'abandoned_learning_sessions', 'payment_pattern',
]
target = data['target']
features = data.drop(columns=excluded_columns)

In [9]:
# Apply log1p to the per-session learning time.
# The input is read from `data` (which this cell never modifies) instead of from
# `features`, so re-running the cell cannot apply the transform a second time.
features['average_time_per_learning_session'] = np.log1p(data['average_time_per_learning_session'])

In [11]:
# Split into a training part and a held-out part (20% of the rows); the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Declare the decision tree classifier with a fixed seed, then fit it on the training part.
tree_params = {'random_state': 42}
dt_model = DecisionTreeClassifier(**tree_params)
dt_model.fit(X=X_train, y=y_train)

In [13]:
# Model evaluation on the held-out part: confusion matrix first, then the per-class report.
y_pred = dt_model.predict(X_test)
held_out_matrix = confusion_matrix(y_test, y_pred)
held_out_report = classification_report(y_test, y_pred)
print("Confusion Matrix:", held_out_matrix, "\nClassification Report:", held_out_report, sep="\n")

Confusion Matrix:
[[327 431]
 [479 763]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.43      0.42       758
           1       0.64      0.61      0.63      1242

    accuracy                           0.55      2000
   macro avg       0.52      0.52      0.52      2000
weighted avg       0.55      0.55      0.55      2000



In [18]:
# Label-encode the test set's categorical columns using the TRAINING set's category order.
# Encoding each file on its own assigns codes by that file's sorted unique values, so the
# same label would get a different integer whenever the two files do not contain exactly
# the same set of labels, and the model would then read the test codes incorrectly.
categorical_columns = ['preferred_difficulty_level', 'subscription_type']
# The raw label columns are re-read from disk because `data` is encoded in place by an
# earlier cell, and `test_data` may already hold codes if this cell is run again.
train_labels = pd.read_csv('train.csv', usecols=categorical_columns)
test_labels = pd.read_csv('test.csv', usecols=categorical_columns)
for column_name in categorical_columns:
    # Same category order that `.astype('category')` gives the training column.
    train_levels = train_labels[column_name].astype('category').cat.categories
    # Missing values and labels that never occur in the training file are coded as -1.
    test_data[column_name] = pd.Categorical(test_labels[column_name], categories=train_levels).codes

In [19]:
# Build the prediction inputs from exactly the columns the model was fitted on, in the
# same order. Selecting by `X_train.columns` replaces a second hand-maintained drop list,
# which could drift from the training one and change the set or order of columns.
test_features = test_data[X_train.columns].copy()

# Same log1p transform as applied to the training feature. `test_features` is rebuilt
# from `test_data` above on every run, so the transform is applied exactly once.
test_features['average_time_per_learning_session'] = np.log1p(test_features['average_time_per_learning_session'])

In [20]:
# Predict a label for every test row with the fitted decision tree model.
predictions = dt_model.predict(X=test_features)

In [21]:
# Check the prediction result by printing the array's text form.
predicted_labels_text = str(predictions)
print(predicted_labels_text)

[0 0 1 ... 1 1 0]


In [22]:
# Put the predicted labels into the submission frame's `target` column.
sub = sub.assign(target=predictions)

In [23]:
# Save the submission file; the row index is not written.
submission_path = 'bestmodel_log1p.csv'
sub.to_csv(submission_path, index=False)