Import libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

#### *Import Data*
- Import the cleaned data set

In [3]:
# Read data
data = pd.read_parquet('../data/processed/pakistan_processed.parquet')

# Get training data (known gname)
full_train = data[data['gname'] != 'Unknown'].reset_index(drop=True)

#### *Remove terrorist groups that have less than 10 attacks*
- The idea here is that there are many groups that make up a very small portion of the attacks

In [4]:
# count the number of occurrences of each gname in the training subset
value_counts = full_train['gname'].value_counts()

# filter the training subset to only include rows where the value in column 'gname' appears at least 10 times
full_train = full_train.loc[full_train['gname'].isin(value_counts.index[value_counts >= 10])].reset_index(drop=True)

#### *Split X and y*

In [5]:
X = full_train.drop(columns=['gname'])
y = full_train['gname']

#### *Encode the target variable*
- The target variable in the data is a categorical string field. In order to make predictions on this data, the target must be encoded into a numerical variable

In [6]:
le = LabelEncoder()
y = le.fit_transform(y)

#### *Split data into training and testing subsets (75/25 split)*
- The full training data is split into two subsets as a way to evaluate the models performance. Due to class imbalance, the split will use a stratified sample

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

#### *Initialize a Preprocessor*
- Encode categorical features
- Impute missing numeric features
- Normalize numeric features

In [8]:
# Create lists of numerical and categorical columns in X data
numeric_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(exclude=np.number).columns

# Create a preprocessor for tree-based models
preprocessor = ColumnTransformer([
    ('cat', Pipeline([
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), categorical_cols),
    ('num', Pipeline([
        ('imputer', SimpleImputer(fill_value=0)),
        ('normalizer', Normalizer('max'))
        ]), numeric_cols)
    ])

#### *Initialize ML pipeline*
- Pipeline = preprocessor + classifier

In [9]:
# Stacking Model
estimators = [('xgb', XGBClassifier()), ('lgbm', LGBMClassifier()), ('rf', RandomForestClassifier())]
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=3)

# ML Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', clf)])

#### *Train Pipeline*
- Entire pipeline is trained on the training subset of data to avoid data leakage
    - This means not only is the model trained on the training data, but so are all of the preprocessing steps in the pipeline

In [10]:
pipeline.fit(X_train, y_train)

#### *Evaluate model with testing subset*
- Get Accuracy

In [11]:
y_preds = pipeline.predict(X_test)
print(f"Test Data Accuracy: {metrics.accuracy_score(y_test, y_preds)}")

Test Data Accuracy: 0.7367706919945726


- Visualize Predictions on Map

In [12]:
test_subset = X_test.copy()
test_subset['actual_gname'] = le.inverse_transform(y_test)
test_subset['predicted_gname'] = le.inverse_transform(y_preds)
test_subset['is_correct'] = test_subset['actual_gname'] == test_subset['predicted_gname']

fig = px.scatter_geo(
    test_subset, lon='longitude', lat='latitude', color='is_correct',
    opacity=0.6, color_discrete_sequence=['green', 'red'],
    hover_data=['actual_gname', 'predicted_gname'])

fig.update_layout(
    autosize=True,
    width=1400,
    height=650,
    geo=dict(
        center=dict(
            lat=30.3753,
            lon=69.3451
        ),
        scope='asia',
        projection_scale=6
    )
)

fig.show()

### Full Training and Prediction on Unknown Data

Fit and Predict

In [13]:
pipeline.fit(X, y)

# Make predictions on unknown data
unknown_df = data[data['gname'] == 'Unknown'].drop(columns='gname').reset_index(drop=True)
pred_labels = pipeline.predict(unknown_df)
pred_gname = le.inverse_transform(pred_labels)
unknown_df['predicted_gname'] = pred_gname

# Save Prediction
unknown_df.to_csv('../models/final_predictions.csv')

Plot Results

In [14]:
import plotly.express as px
fig = px.scatter_geo(
    unknown_df,
    lon='longitude',
    lat='latitude',
    color='predicted_gname',
    opacity=0.75,
    color_discrete_sequence=px.colors.qualitative.Dark24,
    projection='natural earth'
)

fig.update_layout(
    autosize=True,
    width=1400,
    height=650,
    geo=dict(
        center=dict(
            lat=30.3753,
            lon=69.3451
        ),
        scope='asia',
        projection_scale=6
    )
)
fig.show()