# Imports

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the notebook's data and model
# paths below all live under this mount point.
mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
# Enable IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell execution, so edits to
# local .py files are picked up without restarting the kernel.
%autoreload 2

In [None]:
import pandas as pd

# Read the project's data subset from the mounted Drive into `df`,
# the frame every modelling cell below starts from.
file_path = '/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/data/data_subset.csv'
df = pd.read_csv(file_path)

# Models

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Target: the 'Recent Attack' label encoded as 1 ('Yes') / 0 ('No')
y = df['Recent Attack'].map({'Yes': 1, 'No': 0})
# Features: every other column, one-hot encoded
X = pd.get_dummies(df.drop(columns='Recent Attack'))

# Hold out 20% of the rows for evaluation, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Baseline model: 100-tree random forest with balanced class weights
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
clf.fit(X_train, y_train)

# Score the held-out split and print the per-class report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.73      0.73      0.73     23776
           1       0.60      0.61      0.60     16224

    accuracy                           0.68     40000
   macro avg       0.67      0.67      0.67     40000
weighted avg       0.68      0.68      0.68     40000



In [13]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Target: the 'Recent Attack' label encoded as 1 ('Yes') / 0 ('No')
y = df['Recent Attack'].map({'Yes': 1, 'No': 0})
# Features: every other column, one-hot encoded
X = pd.get_dummies(df.drop(columns='Recent Attack'))

# Hold out 20% of the rows for evaluation, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Two-step pipeline: SMOTE oversampling, then an XGBoost classifier
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    (
        'classifier',
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    ),
])

# Hyperparameter grid: only n_estimators varies (2 candidates in total).
# NOTE(review): the saved output under this cell reports 72 candidates, so it
# appears to come from an earlier, larger grid -- re-run to refresh it.
param_grid = {
    'classifier__max_depth': [5],
    'classifier__n_estimators': [250, 300],
    'classifier__learning_rate': [0.1],
    'classifier__min_child_weight': [1],
    'classifier__gamma': [0.15]
}

# 3-fold cross-validated grid search, scored on accuracy
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    verbose=2,
)
grid.fit(X_train, y_train)

# Keep the winning pipeline for evaluation on the held-out split
best_model = grid.best_estimator_

# Report the search result, then the held-out classification report
print("Best Params: ", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 72 candidates, totalling 216 fits
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=150; total time=  13.2s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=150; total time=  13.2s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=150; total time=  13.2s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=200; total time=  13.6s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=200; total time=  13.7s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.08, classifier__max_de

In [17]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

# Target: the 'Recent Attack' label encoded as 1 ('Yes') / 0 ('No')
y = df['Recent Attack'].map({'Yes': 1, 'No': 0})
# Features: every other column, one-hot encoded
X = pd.get_dummies(df.drop(columns='Recent Attack'))

# Hold out 20% of the rows for evaluation, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Two-step pipeline: SMOTE oversampling, then an XGBoost classifier
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    (
        'classifier',
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    ),
])

# Single-candidate grid: the search below only cross-validates this one
# configuration (the saved output reports "1 candidates, totalling 5 fits").
param_grid = {
    'classifier__max_depth': [5],
    'classifier__n_estimators': [250],
    'classifier__learning_rate': [0.1],
    'classifier__min_child_weight': [1],
    'classifier__gamma': [0.15]
}

# 5-fold cross-validated grid search, scored on accuracy
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)
grid.fit(X_train, y_train)

# Keep the fitted pipeline; a later cell saves it with joblib
best_model = grid.best_estimator_

# Report the search result, then the held-out classification report
print("Best Params: ", grid.best_params_)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.1, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=250; total time=  25.3s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.1, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=250; total time=  18.9s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.1, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=250; total time=  21.3s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.1, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=250; total time=  18.6s
[CV] END classifier__gamma=0.15, classifier__learning_rate=0.1, classifier__max_depth=5, classifier__min_child_weight=1, classifier__n_estimators=250; total time=  18.5s
Best Params:  {'classifier__gamma': 0.15, 'classifier__learning_rate': 0.1, 'classifier__ma

In [19]:
# Re-display the grid-search result held in `grid` (set by an earlier cell)
best_params = grid.best_params_
best_cv_score = grid.best_score_
print("Best Params: ", best_params)
print("Best cross-validation score: {:.2f}".format(best_cv_score))

Best Params:  {'classifier__gamma': 0.15, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 250}
Best cross-validation score: 0.76


In [18]:
import joblib

# Save the fitted `best_model` pipeline to Drive. joblib.dump stays the last
# expression so the cell still shows the list of written file names.
model_path = '/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/best_model.pkl'
joblib.dump(best_model, model_path)

['/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/best_model.pkl']

# Dataset Info

In [15]:
# Load the dataset
file_path = '/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/data/data_subset.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataframe
print(df.head())

# Get a summary of the dataframe (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appended a stray "None" line.
df.info()

# Statistical summary of numeric columns
print(df.describe())

# Check the balance of classes in the 'Recent Attack' column
print(df['Recent Attack'].value_counts(normalize=True))

  Region    Age  Gender      Race         AQI Recent Attack  \
0     NW  20-24  Female  Hispanic   13.478413            No   
1     NW  20-24  Female     Black   57.048036            No   
2     SW   5-14    Male     AI/AN   60.537909           Yes   
3     SW  20-24  Female     White  150.925547           Yes   
4     SE  15-19    Male   Mexican   42.979544           Yes   

  Rescue Inhaler Use Preventative Medication Use  
0                 No                         Yes  
1                 No                         Yes  
2                 No                         Yes  
3                Yes                         Yes  
4                 No                         Yes  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Region                       200000 non-null  object 
 1   Age                    