In [7]:
# Import library yang diperlukan
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import lightgbm as lgb
import xgboost as xgb
import warnings
import os
warnings.filterwarnings('ignore')
DATA_DIR = os.path.join(os.getcwd(), 'dataset')
OUTPUT_DIR = os.path.join(os.getcwd(), 'output')
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Muat dataset
print("Memuat dataset...")
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
target_df = pd.read_csv(os.path.join(DATA_DIR, 'target.csv'))
submission_format = pd.read_csv(os.path.join(DATA_DIR, 'submission_format.csv'))

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"Target shape: {target_df.shape}")
print(f"Submission shape: {submission_format.shape}")

# Melihat distribusi target
print("\nDistribusi target:")
print(target_df['coppaRisk'].value_counts())
print(target_df['coppaRisk'].value_counts(normalize=True).round(3) * 100)

# Simpan ID test
test_ids = test_df['ID'].values

# Konversi target ke numerik
y = target_df['coppaRisk'].map({True: 1, False: 0})


Memuat dataset...
Train shape: (7000, 16)
Test shape: (3000, 17)
Target shape: (7000, 1)
Submission shape: (3000, 2)

Distribusi target:
coppaRisk
False    6304
True      696
Name: count, dtype: int64
coppaRisk
False    90.1
True      9.9
Name: proportion, dtype: float64


In [8]:
# Fungsi untuk preprocessing data
def preprocess_data(df, is_test=False):
    # Buat salinan df
    df_processed = df.copy()
    
    # Hapus kolom ID jika ada
    if 'ID' in df_processed.columns:
        df_processed = df_processed.drop('ID', axis=1)
    
    # Identifikasi tipe kolom
    cat_cols = df_processed.select_dtypes(include=['object']).columns
    num_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
    
    # One-hot encoding untuk kolom kategorikal
    df_encoded = pd.get_dummies(df_processed, columns=cat_cols, dummy_na=True)
    
    # Isi nilai yang hilang pada kolom numerik
    for col in num_cols:
        df_encoded[col] = df_encoded[col].fillna(df_encoded[col].median())
    
    return df_encoded

# Preprocessing
print("\nPreprocessing data...")
X_processed = preprocess_data(train_df)
X_test_processed = preprocess_data(test_df, is_test=True)


# Selaraskan kolom-kolom
train_cols = set(X_processed.columns)
test_cols = set(X_test_processed.columns)

# Tambahkan kolom yang ada di test tapi tidak di train
for col in test_cols - train_cols:
    X_processed[col] = 0

# Tambahkan kolom yang ada di train tapi tidak di test
for col in train_cols - test_cols:
    X_test_processed[col] = 0

# Pastikan urutan kolom sama
common_cols = sorted(list(set(X_processed.columns) & set(X_test_processed.columns)))
X_processed = X_processed[common_cols]
X_test_processed = X_test_processed[common_cols]
X_processed.columns = X_processed.columns.str.replace(r'[{}"\':,]', '_', regex=True)
X_test_processed.columns = X_test_processed.columns.str.replace(r'[{}"\':,]', '_', regex=True)
print(f"Processed train shape: {X_processed.shape}")
print(f"Processed test shape: {X_test_processed.shape}")



Preprocessing data...
Processed train shape: (7000, 364)
Processed test shape: (3000, 364)


In [9]:
# Split data untuk validasi
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Parameter LightGBM yang optimal
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Parameter XGBoost yang optimal
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'learning_rate': 0.05,
    'max_depth': 5,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

# Training LightGBM
print("\nTraining LightGBM model...")
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_val],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)  # mirip verbose_eval=100
    ]
)

# Training XGBoost
print("\nTraining XGBoost model...")
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
watchlist = [(dval, 'val')]

xgb_model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=100,
    verbose_eval=100
)



Training LightGBM model...


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[32]	valid_0's auc: 0.903582

Training XGBoost model...
[0]	val-auc:0.89475
[100]	val-auc:0.90046
[123]	val-auc:0.90067


In [10]:
# Evaluasi LightGBM
lgb_val_preds = (lgb_model.predict(X_val) > 0.5).astype(int)
lgb_val_proba = lgb_model.predict(X_val)

print(f"\nLightGBM validation accuracy: {accuracy_score(y_val, lgb_val_preds):.4f}")
print(f"LightGBM validation ROC AUC: {roc_auc_score(y_val, lgb_val_proba):.4f}")
print("\nLightGBM Classification Report:")
print(classification_report(y_val, lgb_val_preds))

# Evaluasi XGBoost
xgb_val_proba = xgb_model.predict(dval)
xgb_val_preds = (xgb_val_proba > 0.5).astype(int)

print(f"\nXGBoost validation accuracy: {accuracy_score(y_val, xgb_val_preds):.4f}")
print(f"XGBoost validation ROC AUC: {roc_auc_score(y_val, xgb_val_proba):.4f}")
print("\nXGBoost Classification Report:")
print(classification_report(y_val, xgb_val_preds))

# Evaluasi Ensemble
ensemble_val_proba = (lgb_val_proba + xgb_val_proba) / 2
ensemble_val_preds = (ensemble_val_proba > 0.5).astype(int)

print(f"\nEnsemble validation accuracy: {accuracy_score(y_val, ensemble_val_preds):.4f}")
print(f"Ensemble validation ROC AUC: {roc_auc_score(y_val, ensemble_val_proba):.4f}")
print("\nEnsemble Classification Report:")
print(classification_report(y_val, ensemble_val_preds))


LightGBM validation accuracy: 0.9014
LightGBM validation ROC AUC: 0.9036

LightGBM Classification Report:
              precision    recall  f1-score   support

           0       0.90      1.00      0.95      1261
           1       0.57      0.03      0.05       139

    accuracy                           0.90      1400
   macro avg       0.74      0.51      0.50      1400
weighted avg       0.87      0.90      0.86      1400


XGBoost validation accuracy: 0.8993
XGBoost validation ROC AUC: 0.9007

XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1261
           1       0.47      0.11      0.18       139

    accuracy                           0.90      1400
   macro avg       0.69      0.55      0.56      1400
weighted avg       0.87      0.90      0.87      1400


Ensemble validation accuracy: 0.9014
Ensemble validation ROC AUC: 0.9048

Ensemble Classification Report:
              precision    r

In [11]:
# Train model final pada seluruh data
print("\nTraining final models on all data...")

# LightGBM final
lgb_train_full = lgb.Dataset(X_processed, y)
final_lgb = lgb.train(
    lgb_params,
    lgb_train_full,
    num_boost_round=lgb_model.best_iteration
)

# XGBoost final
dtrain_full = xgb.DMatrix(X_processed, label=y)
final_xgb = xgb.train(
    xgb_params,
    dtrain_full,
    num_boost_round=xgb_model.best_iteration
)

# Analisis fitur penting
feature_importance = pd.DataFrame({
    'Feature': X_processed.columns,
    'Importance': final_lgb.feature_importance()
}).sort_values('Importance', ascending=False)

print("\nTop 10 Fitur Terpenting:")
print(feature_importance.head(10))


Training final models on all data...

Top 10 Fitur Terpenting:
                                      Feature  Importance
1                                      appAge         205
363                           userRatingCount         174
0                                     adSpent          97
7         appDescriptionBrandSafetyRating_low          48
311                     isCorporateEmailScore          35
10                          averageUserRating          32
331                    primaryGenreName_Games          30
326                primaryGenreName_Education          29
327            primaryGenreName_Entertainment          29
161  developerCountry_CANNOT IDENTIFY COUNTRY          27


In [12]:
# Buat prediksi
print("\nMembuat prediksi...")
dtest = xgb.DMatrix(X_test_processed)

# Prediksi LightGBM
lgb_preds_proba = final_lgb.predict(X_test_processed)
lgb_preds = (lgb_preds_proba > 0.09).astype(int)

# Prediksi XGBoost
xgb_preds_proba = final_xgb.predict(dtest)
xgb_preds = (xgb_preds_proba > 0.085).astype(int)

# Prediksi Ensemble
ensemble_proba = (lgb_preds_proba + xgb_preds_proba) / 2
ensemble_preds = (ensemble_proba > 0.09).astype(int)

# Buat file submission
def create_submission(preds, filename):
    submission = submission_format.copy()
    submission['coppaRisk'] = [True if p == 1 else False for p in preds]
    submission.to_csv(filename, index=False)
    return submission

# Buat semua file submission
lgb_sub = create_submission(lgb_preds, 'lgb_submission.csv')
xgb_sub = create_submission(xgb_preds, 'xgb_submission.csv')
ensemble_sub = create_submission(ensemble_preds, 'ensemble_submission.csv')

print("\nFile submission berhasil dibuat:")
print("1. lgb_submission.csv")
print("2. xgb_submission.csv")
print("3. ensemble_submission.csv (Hasil Terbaik)")

print("\nDistribusi prediksi:")
print(f"LightGBM: True={sum(lgb_preds)}, False={len(lgb_preds)-sum(lgb_preds)}")
print(f"XGBoost: True={sum(xgb_preds)}, False={len(xgb_preds)-sum(xgb_preds)}")
print(f"Ensemble: True={sum(ensemble_preds)}, False={len(ensemble_preds)-sum(ensemble_preds)}")

print("\nSelesai! Gunakan file ensemble_submission.csv untuk hasil prediksi terbaik.")



Membuat prediksi...

File submission berhasil dibuat:
1. lgb_submission.csv
2. xgb_submission.csv
3. ensemble_submission.csv (Hasil Terbaik)

Distribusi prediksi:
LightGBM: True=970, False=2030
XGBoost: True=1126, False=1874
Ensemble: True=996, False=2004

Selesai! Gunakan file ensemble_submission.csv untuk hasil prediksi terbaik.


In [13]:
hasil = pd.read_csv('/kaggle/working/xgb_submission.csv')
hasil['coppaRisk'].value_counts()

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/xgb_submission.csv'