In [None]:
import numpy as np
import pandas as pd
# Data Preprocessing (Train)
# Load the train data
df = pd.read_csv("train.csv")

# Drop 'id' and 'shares' so neither is used as a feature.
# NOTE(review): presumably 'shares' is dropped because the target is derived
# from it -- confirm against the dataset description.
df = df.drop(columns=["id", "shares"])

# Columns whose missing values are filled with the column mean
mean_impute_cols = [
    'n_tokens_title', 'average_token_length',
    'global_subjectivity', 'global_sentiment_polarity'
]

# Columns whose missing values are filled with the column median
median_impute_cols = [
    'n_tokens_content', 'n_unique_tokens', 'num_hrefs', 'num_self_hrefs',
    'num_imgs', 'num_videos', 'self_reference_min_shares'
]

# The filled Series is assigned back to the frame on purpose: calling
# fillna(inplace=True) on df[col] acts on an intermediate object, which pandas
# warns will no longer update df starting with pandas 3.0.

# Filling missing values (mean)
for col in mean_impute_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].mean())

# Filling missing values (median)
for col in median_impute_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

# Filling every other numerical column's missing values (median).
# Columns handled by the two loops above have no NaNs left, so they are skipped.
numeric_cols = df.select_dtypes(include='number').columns
for col in numeric_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())

# Filling categorical columns' missing values (mode), then encoding each label
# as the integer code pandas assigns to its category.
for col in ('data_channel', 'weekday'):
    df[col] = df[col].fillna(df[col].mode()[0])
    df[col] = df[col].astype('category').cat.codes

# Checking remaining missing values
print("총 남은 결측치 수:", df.isnull().sum().sum())

# Setting numerical columns ('id' was already dropped above, so no filtering
# is needed). This list includes the integer codes created just above.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Clipping every numeric column to its own 1st-99th percentile range
for col in numeric_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

총 남은 결측치 수: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values a

In [None]:
import numpy as np
import pandas as pd
# Data Preprocessing (Test)
# Load the test data
X_submit = pd.read_csv("test.csv")

# Setting Select_col (the identifier column, removed from the feature frame)
Select_col = 'id'
X_submit = X_submit.drop(columns=[Select_col])

# NOTE(review): the fill values and clip bounds in this cell are computed from
# test.csv itself rather than reused from train.csv.

# Columns whose missing values are filled with the column mean
mean_impute_cols = [
    'n_tokens_title', 'average_token_length',
    'global_subjectivity', 'global_sentiment_polarity'
]

# Columns whose missing values are filled with the column median
median_impute_cols = [
    'n_tokens_content', 'n_unique_tokens', 'num_hrefs', 'num_self_hrefs',
    'num_imgs', 'num_videos', 'self_reference_min_shares'
]

# The filled Series is assigned back to the frame on purpose: calling
# fillna(inplace=True) on X_submit[col] acts on an intermediate object, which
# pandas warns will no longer update the frame starting with pandas 3.0.

# Filling missing values (mean)
for col in mean_impute_cols:
    if X_submit[col].isna().any():
        X_submit[col] = X_submit[col].fillna(X_submit[col].mean())

# Filling missing values (median)
for col in median_impute_cols:
    if X_submit[col].isna().any():
        X_submit[col] = X_submit[col].fillna(X_submit[col].median())

# Filling every other numerical column's missing values (median).
# Columns handled by the two loops above have no NaNs left, so they are skipped.
numeric_cols = X_submit.select_dtypes(include='number').columns
for col in numeric_cols:
    if X_submit[col].isna().any():
        X_submit[col] = X_submit[col].fillna(X_submit[col].median())

# Filling categorical columns' missing values (mode) and encoding.
# The integer codes are taken from the category order of train.csv so that a
# given label maps to the same code in the training and submission frames even
# when the two files do not contain exactly the same set of labels. A label
# that never occurs in train.csv is encoded as -1.
train_categoricals = pd.read_csv("train.csv", usecols=['data_channel', 'weekday'])
for col in ('data_channel', 'weekday'):
    X_submit[col] = X_submit[col].fillna(X_submit[col].mode()[0])
    train_levels = train_categoricals[col].astype('category').cat.categories
    X_submit[col] = pd.Categorical(X_submit[col], categories=train_levels).codes

# Checking remaining missing values
print("총 남은 결측치 수:", X_submit.isnull().sum().sum())

# Setting numerical columns (includes the integer codes created just above)
numeric_cols = X_submit.select_dtypes(include=[np.number]).columns.tolist()

# Clipping every numeric column to its own 1st-99th percentile range
for col in numeric_cols:
    lower = X_submit[col].quantile(0.01)
    upper = X_submit[col].quantile(0.99)
    X_submit[col] = X_submit[col].clip(lower, upper)

총 남은 결측치 수: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_submit[col].fillna(X_submit[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_submit[col].fillna(X_submit[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which

In [None]:
# Install Optuna; in the visible code it is only referenced by the disabled
# hyper-parameter search (optuna.create_study / study.optimize).
!pip install optuna



In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Separate the label column from the feature matrix
target_col = 'y'
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows, keeping the class ratio the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize using statistics learned from the training part only, then wrap
# the resulting arrays back into DataFrames so columns can be picked by name
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
feature_names = X.columns
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=feature_names)

# Throw-away model that is fitted only to rank the features
ranking_model_kwargs = {
    'n_estimators': 100,
    'use_label_encoder': False,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'random_state': 42,
    'verbosity': 0,
}
temp_model = XGBClassifier(**ranking_model_kwargs)
temp_model.fit(X_train_scaled_df, y_train)

# Keep the 25 features with the highest importance reported by that model
importances = temp_model.feature_importances_
importance_by_feature = pd.Series(importances, index=feature_names)
ranked_importances = importance_by_feature.sort_values(ascending=False)
top_features = ranked_importances.head(25).index.tolist()

X_train_selected = X_train_scaled_df[top_features]
X_test_selected = X_test_scaled_df[top_features]

# Optimization (opt-in hyper-parameter search)
# The search is off by default, so this cell keeps using the hard-coded
# best_params. Set RUN_HYPERPARAMETER_SEARCH to True to run it.
RUN_HYPERPARAMETER_SEARCH = False


def objective(trial):
    """Return the mean 5-fold F1 score of a booster trained with the trial's parameters.

    Reads the module-level ``X_train_selected`` and ``y_train``. For every
    stratified fold a booster is trained for up to 1000 rounds with early
    stopping (patience 50) on the fold's validation part. The average best
    iteration over the folds is stored on the trial as the user attribute
    ``"best_iteration"``.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean F1 over the folds (plain F1 for two classes, weighted F1 otherwise).
    """
    # Imported locally because the file's top-level imports provide neither
    # the `xgb` alias nor `StratifiedKFold`.
    import xgboost as xgb
    from sklearn.model_selection import StratifiedKFold

    n_classes = len(np.unique(y_train))
    is_multiclass = n_classes > 2

    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'objective': 'multi:softprob' if is_multiclass else 'binary:logistic',
        'eval_metric': 'mlogloss' if is_multiclass else 'logloss',
        'verbosity': 0,
        'seed': 42,
    }
    if is_multiclass:
        # The multi-class objective needs to be told how many classes exist.
        params['num_class'] = n_classes

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    f1_scores = []
    best_iterations = []

    for train_idx, val_idx in skf.split(X_train_selected, y_train):
        X_tr, X_val = X_train_selected.iloc[train_idx], X_train_selected.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        dtrain = xgb.DMatrix(X_tr.values, label=y_tr.values)
        dval = xgb.DMatrix(X_val.values, label=y_val.values)
        evals = [(dval, 'eval')]

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=evals,
            early_stopping_rounds=50,
            verbose_eval=False
        )

        best_iterations.append(model.best_iteration)

        y_pred = model.predict(dval)
        if is_multiclass:
            # One probability per class: take the most likely class.
            y_pred_label = np.argmax(y_pred, axis=1)
            f1 = f1_score(y_val, y_pred_label, average='weighted')
        else:
            # A single probability of the positive class: threshold at 0.5.
            y_pred_label = (y_pred > 0.5).astype(int)
            f1 = f1_score(y_val, y_pred_label)

        f1_scores.append(f1)

    # Store the average best_iteration
    avg_best_iter = int(np.mean(best_iterations))
    trial.set_user_attr("best_iteration", avg_best_iter)

    return np.mean(f1_scores)


# Optuna
if RUN_HYPERPARAMETER_SEARCH:
    import optuna

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=300)

# Fixed hyper-parameters for the final classifier
best_params = dict(
    max_depth=8,
    learning_rate=0.03219427128777494,
    subsample=0.7896505212438165,
    colsample_bytree=0.52488039775435,
    gamma=4.235580500440136,
    reg_alpha=4.741313486695497,
    reg_lambda=1.7263925260081991,
    n_estimators=1000,
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    verbosity=0,
)

# Train on the selected features of the training part
final_model = XGBClassifier(**best_params)
final_model.fit(X_train_selected, y_train)

# Hard labels and positive-class probabilities for the held-out part
y_pred = final_model.predict(X_test_selected)
y_prob = final_model.predict_proba(X_test_selected)[:, 1]

# Individual metrics and their plain average
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)
main_score = (accuracy + f1 + auc) / 3

print("\n===== Validation set test results =====")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("F1 Score:", f1),
    ("AUC Score:", auc),
    ("Main Evaluation Score:", main_score),
):
    print(metric_label, metric_value)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\n" + "="*50 + "\n")

# Reload the real identifiers of the test rows. Select_col only holds the
# column NAME ('id'); using it directly would write that literal string into
# every row of the submission. The preprocessing of X_submit never drops or
# reorders rows, so the ids line up with the predictions by position.
test_ids = pd.read_csv("test.csv", usecols=[Select_col])[Select_col]

# Preprocessed test features, put in the column order the scaler was fitted on
X_submit_features = X_submit[X.columns]

# Scaling and feature selection (top 25)
X_submit_scaled = scaler.transform(X_submit_features)
X_submit_scaled_df = pd.DataFrame(X_submit_scaled, columns=X_submit_features.columns)
X_submit_selected = X_submit_scaled_df[top_features]

# Prediction: hard labels and positive-class probabilities
y_submit_pred = final_model.predict(X_submit_selected)
y_submit_prob = final_model.predict_proba(X_submit_selected)[:, 1]

# Write the results to prediction.csv (one row per test id)
submission = pd.DataFrame({
    'id': test_ids,
    'y_predict': y_submit_pred,
    'y_prob': y_submit_prob
})
submission.to_csv("prediction.csv", index=False)


===== Validation set test results =====
Accuracy: 0.6686936936936937
F1 Score: 0.6674203029617907
AUC Score: 0.7332306014623667
Main Evaluation Score: 0.6897815327059504

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.67      0.67      2239
           1       0.66      0.67      0.67      2201

    accuracy                           0.67      4440
   macro avg       0.67      0.67      0.67      4440
weighted avg       0.67      0.67      0.67      4440



