## creating a pipeline
with xgboost

notes
- this code segment is likely to be much more powerful than random forest (even absent XGB capabilities) since we can engage all columns through one shot encoding and standard scaling


In [2]:
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd

data = pd.read_csv('../data/acled/darfur.csv')
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9316 entries, 0 to 9315
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EVENT_ID_CNTY       9316 non-null   object 
 1   EVENT_DATE          9316 non-null   object 
 2   YEAR                9316 non-null   int64  
 3   TIME_PRECISION      9316 non-null   int64  
 4   DISORDER_TYPE       9316 non-null   object 
 5   EVENT_TYPE          9316 non-null   object 
 6   SUB_EVENT_TYPE      9316 non-null   object 
 7   ACTOR1              9316 non-null   object 
 8   ASSOC_ACTOR_1       1998 non-null   object 
 9   INTER1              9316 non-null   int64  
 10  ACTOR2              8029 non-null   object 
 11  ASSOC_ACTOR_2       2805 non-null   object 
 12  INTER2              9316 non-null   int64  
 13  INTERACTION         9316 non-null   int64  
 14  CIVILIAN_TARGETING  5001 non-null   object 
 15  ISO                 9316 non-null   int64  
 16  REGION

In [10]:
## target: raw fatablities or clipped past a number 
X = data.drop('FATALITIES', axis=1)
y = data['FATALITIES']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define categorical and numerical features
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Create transformers for numeric and categorical features
## could use Imputer() from sklearn.preprocessing
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different preprocessing to different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline with the preprocessor and an XGBoost classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier())
])

# Train the model
model.fit(X_train, y_train)

# Evaluate the model (you can use appropriate metrics for your problem)
print("Model score: %.3f" % model.score(X_test, y_test))


ValueError: A given column is not a column of the dataframe

## visualization tools
- **yellowbrick** extends scikit-learn's model selection and evaluation capabilities with visualizations
- **dtreeviz** is specifically designed for decision trees (and by extension, tree-based models like XGBoost). It provides detailed visualizations of how decisions are made within the trees.