In [27]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV, cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, Normalizer, OrdinalEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

Import Data

In [15]:
data = pd.read_parquet('../data/interim/pakistan_processed.parquet')
train = data[data['gname'] != 'Unknown'].reset_index(drop=True)
test = data[data['gname'] == 'Unknown'].drop(columns='gname').reset_index(drop=True)

Prepare data for modeling

In [16]:
# count the number of occurrences of each value in column 'B'
value_counts = train['gname'].value_counts()

# filter the DataFrame to only include rows where the value in column 'B' appears more than once
train = train.loc[train['gname'].isin(value_counts.index[value_counts > 1])]

# Split into X and y sets
X = train.drop(columns='gname')
y = train['gname']

# Encode the categorical target into numbers
le = LabelEncoder()
y = le.fit_transform(y)

Initialize a Preprocessor

In [29]:
# Create lists of numerical and categorical columns in X data
numeric_cols = X.select_dtypes(include=np.number).columns
categorical_cols = X.select_dtypes(exclude=np.number).columns

# Create a preprocessor for tree-based models
preprocessor = ColumnTransformer([
    ('cat', Pipeline([
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), categorical_cols),
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('normalizer', Normalizer('l1'))
        ]), numeric_cols)
    ])

Initialize pipeline

In [34]:
# Model
clf = LGBMClassifier(num_leaves=20, max_depth=4, learning_rate=0.1)

# Create a ML Pipeline Instance with the Tuned Classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', clf)])

Tune pipeline Parameters

In [31]:
# Create parameter grid for pipline
pipeline_param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median', 'most_frequent'],
    'preprocessor__num__normalizer__norm': ['l1', 'l2', 'max'],
    'model__num_leaves': [10, 20, 30],
    'model__max_depth': [3, 4, 5],
    'model__learning_rate': [0.1, 0.01, 0.001]
}

# Perform search
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=pipeline_param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

search.fit(X, y)



In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
metrics.accuracy_score(y_test, preds)

0.7068403908794788

In [85]:
pipeline.fit(X, y)

In [89]:
preds = pipeline.predict(test)
pred_gname = le.inverse_transform(preds)

In [90]:
test['predicted_gname'] = pred_gname

In [91]:
test

Unnamed: 0,iyear,imonth,iday,provstate,city,latitude,longitude,multiple,success,suicide,...,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,ishostkid,predicted_gname
0,2007,12,3,Balochistan,Qilla Abdullah district,30.803630,66.711752,0,1,0,...,Unknown Explosive Type,6.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0,Tehrik-i-Taliban Pakistan (TTP)
1,2007,12,4,Khyber Pakhtunkhwa,Peshawar,34.006004,71.537430,0,1,1,...,Suicide (carried bodily by human being),1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Tehrik-i-Taliban Pakistan (TTP)
2,2007,12,8,Balochistan,Khuzdar,27.809921,66.620956,0,0,0,...,Unknown Gun Type,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Khorasan Chapter of the Islamic State
3,2007,12,9,Balochistan,Nasirabad,28.458421,68.133223,0,1,0,...,Unknown Gun Type,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Baloch Liberation Army (BLA)
4,2007,12,9,Khyber Pakhtunkhwa,Peshawar,34.006004,71.537430,0,1,0,...,"Projectile (rockets, mortars, RPGs, etc.)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,Tehrik-i-Taliban Pakistan (TTP)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9143,2018,12,26,Balochistan,Quetta,30.200820,66.994352,0,1,0,...,Unknown Gun Type,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Khorasan Chapter of the Islamic State
9144,2018,12,26,Balochistan,Pishin district,30.647345,67.142436,0,1,0,...,Unknown Gun Type,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Tehrik-i-Taliban Pakistan (TTP)
9145,2018,12,26,Sindh,Karachi,24.891116,67.143312,0,1,0,...,Handgun,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Hizb-ul-Ahrar (HuA)
9146,2018,12,28,Khyber Pakhtunkhwa,Abbottabad,34.173319,73.227866,0,1,0,...,Unknown Gun Type,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Hizb-ul-Ahrar (HuA)


In [38]:
for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

TypeError: Singleton array array(<bound method NDFrame.copy of 0                Baloch Republican Army (BRA)
1             Tehrik-i-Taliban Pakistan (TTP)
2                                    Al-Qaida
3             Tehrik-i-Taliban Pakistan (TTP)
4             Tehrik-i-Taliban Pakistan (TTP)
                        ...                  
3084    Khorasan Chapter of the Islamic State
3085             Baloch Liberation Army (BLA)
3086            Muttahida Qami Movement (MQM)
3087                      Hizb-ul-Ahrar (HuA)
3088    Khorasan Chapter of the Islamic State
Name: gname, Length: 3089, dtype: object>, dtype=object) cannot be considered a valid collection.

In [None]:
for train_index, test_index in skf.split(X_train, y_train):
    x_train_fold, x_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
    pipeline.fit(x_train_fold, y_train_fold)
    lst_accu_stratified.append(pipeline.score(x_test_fold, y_test_fold))
  
# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      np.mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', np.stdev(lst_accu_stratified))

In [4]:
train

Unnamed: 0,iyear,imonth,iday,provstate,city,latitude,longitude,multiple,success,suicide,...,weaptype1_txt,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,property,ishostkid
0,2007,12,3,Balochistan,Qilla Abdullah district,30.803630,66.711752,0,1,0,...,Explosives,Unknown Explosive Type,6.0,0.0,0.0,5.0,0.0,0.0,1.0,0.0
1,2007,12,4,Khyber Pakhtunkhwa,Peshawar,34.006004,71.537430,0,1,1,...,Explosives,Suicide (carried bodily by human being),1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2007,12,6,Balochistan,Dera Bugti,29.034412,69.158661,0,1,0,...,Explosives,Remote Trigger,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
3,2007,12,8,Balochistan,Khuzdar,27.809921,66.620956,0,0,0,...,Firearms,Unknown Gun Type,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2007,12,9,Balochistan,Nasirabad,28.458421,68.133223,0,1,0,...,Firearms,Unknown Gun Type,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12232,2018,12,26,Sindh,Karachi,24.891116,67.143312,0,1,0,...,Firearms,Handgun,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
12233,2018,12,26,Khyber Pakhtunkhwa,Jarobi Darra,34.556336,71.079617,0,1,0,...,Explosives,Unknown Explosive Type,2.0,0.0,0.0,3.0,0.0,0.0,1.0,0.0
12234,2018,12,28,Khyber Pakhtunkhwa,Abbottabad,34.173319,73.227866,0,1,0,...,Firearms,Unknown Gun Type,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12235,2018,12,31,Sindh,Karachi,24.891116,67.143312,0,1,0,...,Firearms,Handgun,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
