## Creating a pipeline with XGBoost


In [None]:
from xgboost import XGBRegressor

In [None]:
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

# ---------------------------------------------------------------------------
# End-to-end example: read a CSV, split it, preprocess numeric and
# categorical columns separately, fit an XGBoost model, and print its score.
# ---------------------------------------------------------------------------

# Read the raw table; the path is resolved relative to the working directory.
data = pd.read_csv('../data/acled/darfur.csv')

# Pull the label out first, then keep every remaining column as a predictor.
y = data['target_column']
X = data.drop(columns='target_column')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns routed to each preprocessing branch.
# Replace these placeholders with your actual feature names.
numeric_features = ['num_feature1', 'num_feature2']
categorical_features = ['cat_feature1', 'cat_feature2']

# One transformer per column family: standardise the numeric columns and
# one-hot encode the categorical ones. handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()

# Route each column list through its transformer. Columns that appear in
# neither list are dropped, which is ColumnTransformer's default behaviour.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Chain preprocessing and the estimator so both are fitted together and the
# same transformations are replayed at prediction time.
# NOTE(review): an earlier cell imports XGBRegressor, while this pipeline
# uses XGBClassifier -- confirm which one matches the target variable.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier()),
])

# Fit the preprocessing steps and the model on the training split only.
model.fit(X_train, y_train)

# Report the pipeline's default score on the held-out split (swap in a
# metric appropriate to your problem if needed).
test_score = model.score(X_test, y_test)
print("Model score: %.3f" % test_score)


## visualization tools
- **yellowbrick** extends scikit-learn's model selection and evaluation capabilities with visualizations.
- **dtreeviz** is specifically designed for decision trees (and by extension, tree-based models like XGBoost). It provides detailed visualizations of how decisions are made within the trees.