Gave this task to GPT and it produced the solution below, using XGBoost.

In [20]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

# The dataset itself is loaded at the top of the next cell



In [None]:
# Load the raw train/test data, then one-hot encode the categorical columns on
# the combined frame so both splits end up with the same dummy columns.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# List of categorical columns to encode
categorical_columns = ['Drug', 'Sex', 'Ascites', 'Hepatomegaly', 'Spiders']

# Missing categorical values become their own 'Unknown' category
for col in categorical_columns:
    train_df[col] = train_df[col].fillna('Unknown')
    test_df[col] = test_df[col].fillna('Unknown')

# Set the target aside so it is not part of the combined feature frame
status_train = train_df['Status']

# Remember where train ends before train_df is rebound below
n_train = len(train_df)

# Stack train (without the target) on top of test; ignore_index gives a fresh
# 0..N-1 index, so the first n_train rows are exactly the train rows
combined_df = pd.concat(
    [train_df.drop(columns=['Status']), test_df],
    ignore_index=True,
)

# One-hot encode all categorical columns
combined_df = pd.get_dummies(combined_df, columns=categorical_columns)

# Split back into train and test. The explicit .copy() makes each frame
# independent of combined_df, so the column assignments that follow do not
# trigger SettingWithCopyWarning or write into a temporary slice.
train_df = combined_df.iloc[:n_train].copy()
test_df = combined_df.iloc[n_train:].copy()

# Re-attach the target and expand it into Status_* indicator columns
train_df['Status'] = status_train
train_df = pd.get_dummies(train_df, columns=['Status'])

# Check the result (first 10 rows of the train dataframe)
train_df.head(10)


In [None]:
# 1. Fix data types
# Map the Edema codes to small integers, using one shared mapping for both splits
edema_codes = {'N': 0, 'S': 1, 'Y': 2}
train_df['Edema'] = train_df['Edema'].map(edema_codes).astype('int8')
test_df['Edema'] = test_df['Edema'].map(edema_codes).astype('int8')

# Impute missing numeric values; the fill value always comes from the train split
numeric_cols = [
    'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
    'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin', 'Stage',
]
for name in numeric_cols:
    # Filling a column with its own median leaves the median unchanged, so
    # computing it once up front yields the same fill value for both frames
    train_median = train_df[name].median()
    train_df[name] = train_df[name].fillna(train_median)
    test_df[name] = test_df[name].fillna(train_median)

# 2. Prepare features and target
# Everything except the row id and the one-hot target columns is a feature
non_feature_cols = {'id', 'Status_C', 'Status_CL', 'Status_D'}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]
X = train_df[feature_cols]

# Collapse the one-hot target into integer labels:
# 0 when Status_C is set, 1 when Status_CL is set, 2 otherwise
is_status_c = (train_df['Status_C'] == 1).to_numpy()
is_status_cl = (train_df['Status_CL'] == 1).to_numpy()
y = pd.Series(np.select([is_status_c, is_status_cl], [0, 1], default=2))

# 3. Split data
# Hold out 20% of the rows for validation; fixed seed for reproducibility
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Train XGBoost model
# Hyperparameters for the three-class probability model, gathered in one place
xgb_params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 200,
    'random_state': 42,
}
model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split, passing the validation split as eval_set with
# verbose output so it can be monitored during training
validation_sets = [(X_val, y_val)]
model.fit(X_train, y_train, eval_set=validation_sets, verbose=True)

# 5. Evaluate model
# Class probabilities for the held-out rows, one column per class
val_pred = model.predict_proba(X_val)

# Pass the label set explicitly: without it log_loss infers the classes from
# y_val alone and raises if the validation split happens to miss a class,
# because the number of inferred labels no longer matches val_pred's columns.
val_loss = log_loss(y_val, val_pred, labels=model.classes_)
print(f"\nValidation Log Loss: {val_loss:.4f}")

# Print feature importance
# feature_cols is the column order the model was trained on, so it lines up
# positionally with model.feature_importances_
importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_
})
print("\nTop 10 Important Features:")
print(importance.sort_values('importance', ascending=False).head(10))

# 6. Generate predictions
# Score the test rows using the same feature columns the model was trained on
X_test = test_df[feature_cols]
test_pred = model.predict_proba(X_test)

# Probability column k of test_pred belongs to the k-th name in this list
class_columns = ['Status_C', 'Status_CL', 'Status_D']
submission = pd.DataFrame({
    'id': test_df['id'],
    **{label: test_pred[:, k] for k, label in enumerate(class_columns)},
})

# Write the file without the dataframe index so only id + probabilities remain
submission.to_csv('submission.csv', index=False)
print("\nSubmission created!")