In [6]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.utils import class_weight

from xgboost import XGBClassifier

# 1. Loading in and (hopefully not) cleaning 

In [7]:
df = pd.read_csv('../../notebooks/data4model/FL_31_data.csv')
df.head()

Unnamed: 0,YEAR,SERIAL,NCHILD,SEX,AGE,HISPAN,RACE,EDUC,EMPSTAT,OWNERSHP,...,age_mapped,income_mapped_k,owns_home,has_children,sex_b,risk_perception,hurricane_experience,preparedness_level,conf_level,knn_avg_distance
0,2017,265776,1,0,47,0,1,11,1.0,2.0,...,47,98.0,0,1,0,1.8,0.8,0.4,3.2,0.716361
1,2017,265801,0,0,63,0,1,6,1.0,1.0,...,63,67.7,1,0,0,1.4,1.0,1.0,3.0,0.541057
2,2017,265801,1,1,59,0,1,6,3.0,1.0,...,59,67.7,1,1,1,1.6,0.8,0.8,3.0,0.840465
3,2017,265801,1,0,54,0,1,6,1.0,1.0,...,54,67.7,1,1,0,1.6,0.8,0.4,3.0,0.67026
4,2017,265801,0,1,27,0,1,6,1.0,1.0,...,27,67.7,1,0,1,1.8,1.0,1.0,3.4,0.611395


In [8]:
cat_cols = ['SEX', 'HISPAN', 'RACE', 'EDUC',
            'EMPSTAT', 'OWNERSHP', 'is_owner', 'is_renter',
            'MARST', 'MIGRATE1',
            'owns_home', 'has_children']

for col in cat_cols:
    df[col] = df[col].astype('category')

df.dtypes

YEAR                        int64
SERIAL                      int64
NCHILD                      int64
SEX                      category
AGE                         int64
HISPAN                   category
RACE                     category
EDUC                     category
EMPSTAT                  category
OWNERSHP                 category
RENT                        int64
is_owner                 category
is_renter                category
HHINCOME                  float64
INCTOT                    float64
VALUEH                    float64
MARST                    category
MIGRATE1                 category
fips                        int64
num_declarations          float64
outflow_returns           float64
inflow_returns            float64
net_migration_returns     float64
outflow_people            float64
inflow_people             float64
net_migration_people      float64
outflow_agi               float64
inflow_agi                float64
max_wind_speed            float64
storms        

In [9]:
df.isnull().sum()

YEAR                     0
SERIAL                   0
NCHILD                   0
SEX                      0
AGE                      0
HISPAN                   0
RACE                     0
EDUC                     0
EMPSTAT                  0
OWNERSHP                 0
RENT                     0
is_owner                 0
is_renter                0
HHINCOME                 0
INCTOT                   0
VALUEH                   0
MARST                    0
MIGRATE1                 0
fips                     0
num_declarations         0
outflow_returns          0
inflow_returns           0
net_migration_returns    0
outflow_people           0
inflow_people            0
net_migration_people     0
outflow_agi              0
inflow_agi               0
max_wind_speed           0
storms                   0
hurricane_exp_%          0
high_risk_%              0
well_prepared_%          0
n                        0
age_mapped               0
income_mapped_k          0
owns_home                0
h

# 2. 

In [10]:
df['MIGRATE1'].value_counts()

MIGRATE1
1    32004
2     4078
3     1045
4      188
Name: count, dtype: int64

In [11]:
df = df[df['MIGRATE1'] != 4].copy()  # Removing the abroad people

In [12]:
# 2 = Moved within the same state
# 3 = Moved to different state
# Can merge (maybe 1+2 as well? Discuss with groupmates)

df['MIGRATE1'] = df['MIGRATE1'].replace({3: 2})

  df['MIGRATE1'] = df['MIGRATE1'].replace({3: 2})


In [13]:
df['MIGRATE1'] = df['MIGRATE1'].astype('category')

In [14]:
df['MIGRATE1'].value_counts()

MIGRATE1
1    32004
2     5123
4        0
Name: count, dtype: int64

In [15]:
df['MIGRATE1'] = df['MIGRATE1'].cat.remove_unused_categories() 
df['MIGRATE1'] = df['MIGRATE1'].astype(int)
df['MIGRATE1'] = df['MIGRATE1'] - 1

In [16]:
df['MIGRATE1'].value_counts()

MIGRATE1
0    32004
1     5123
Name: count, dtype: int64

In [17]:
cat_cols = df.select_dtypes(include='category').columns

for col in cat_cols:
    df[col] = df[col].cat.codes

In [18]:
# Feature and target split

features = ['AGE', 'SEX', 'EDUC', 'HHINCOME', 'OWNERSHP',
            'VALUEH', 'RENT', 'NCHILD',
            'risk_perception', 'hurricane_experience', 
            'preparedness_level', 'conf_level',
            'knn_avg_distance']


X = df[features]
y = df['MIGRATE1']

# 3. Train/Test Split

In [19]:
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    random_state=42,
    stratify=y_temp
)

# 4. Get training.....................

In [None]:
smote = SMOTE(sampling_strategy=0.5, random_state=42)  # Oversample to 50% ratio
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [285]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

In [None]:
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

In [None]:
model = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=10,
    enable_categorical=True)

In [1]:
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [4, 6, 8],
#     'learning_rate': [0.01, 0.05, 0.1, 0.3],
#     'subsample': [0.7, 0.8, 0.9],
#     'colsample_bytree': [0.7, 0.8, 0.9],
#     'min_child_weight': [1, 2, 3],
#     'gamma': [0, 0.05, 0.1],
#     'max_delta_step': [0, 1, 2, 3],
#     'scale_pos_weight': [scale_pos_weight * 0.8,
#                          scale_pos_weight,
#                          scale_pos_weight * 1.2]
# }

# grid_search = GridSearchCV(
#     XGBClassifier(random_state=42),
#     param_grid,
#     cv=5,
#     scoring='roc_auc',
#     n_jobs=-1,
#     verbose=1
# )

# grid_search.fit(X_train_smote, y_train_smote)

# print("\a")
# print("✅ Grid search completed!")
# os.system('say "Grid search completed"')

In [288]:
print(grid_search.best_params_)
best_model = grid_search.best_estimator_

{'colsample_bytree': 0.7, 'gamma': 0.1, 'learning_rate': 0.05, 'max_delta_step': 0, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 0.8}


In [276]:
best_model.fit(X_train, y_train,
          eval_set=[(X_val, y_val)],
        verbose=True,
        sample_weight=classes_weights)

[0]	validation_0-logloss:0.68490
[1]	validation_0-logloss:0.67929
[2]	validation_0-logloss:0.67398
[3]	validation_0-logloss:0.66854
[4]	validation_0-logloss:0.66227
[5]	validation_0-logloss:0.65645
[6]	validation_0-logloss:0.65280
[7]	validation_0-logloss:0.64832
[8]	validation_0-logloss:0.64413
[9]	validation_0-logloss:0.64020
[10]	validation_0-logloss:0.63685
[11]	validation_0-logloss:0.63385
[12]	validation_0-logloss:0.63166
[13]	validation_0-logloss:0.62951
[14]	validation_0-logloss:0.62768
[15]	validation_0-logloss:0.62496
[16]	validation_0-logloss:0.62261
[17]	validation_0-logloss:0.61995
[18]	validation_0-logloss:0.61772
[19]	validation_0-logloss:0.61625
[20]	validation_0-logloss:0.61452
[21]	validation_0-logloss:0.61306
[22]	validation_0-logloss:0.61173
[23]	validation_0-logloss:0.61026
[24]	validation_0-logloss:0.60926
[25]	validation_0-logloss:0.60826
[26]	validation_0-logloss:0.60730
[27]	validation_0-logloss:0.60610
[28]	validation_0-logloss:0.60511
[29]	validation_0-loglos

In [37]:
best_model = joblib.load('cache/sample_weight_model.pkl')

In [38]:
y_pred_train = best_model.predict(X_train)
y_pred_val = best_model.predict(X_val)
y_pred_proba_val = best_model.predict_proba(X_val)[:, 1]

In [39]:
f1_moved = f1_score(y_val, y_pred_val, pos_label=1)
f1_stayed = f1_score(y_val, y_pred_val, pos_label=0)
f1_macro = f1_score(y_val, y_pred_val, average='macro')
print(f"F1-Score (Moved):  {f1_moved:.4f}  ← Most important")
print(f"F1-Score (Stayed): {f1_stayed:.4f}")
print(f"F1-Score (Macro):  {f1_macro:.4f}")

F1-Score (Moved):  0.3821  ← Most important
F1-Score (Stayed): 0.7894
F1-Score (Macro):  0.5858


In [40]:
roc_auc = roc_auc_score(y_val, y_pred_proba_val)
print(f"ROC-AUC:           {roc_auc:.4f}")

ROC-AUC:           0.7526


In [None]:
# Training accuracy
train_accuracy = (y_pred_train == y_train).mean()
print(f"\nTrain Accuracy: {train_accuracy:.4f}")
e
# Test accuracy
val_accuracy = (y_pred_val == y_val).mean()
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Classification report
print("\nClassification Report (Validation Set):")
print(classification_report(y_val, y_pred_val))


Train Accuracy: 0.6927
Validation Accuracy: 0.6859

Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      5121
           1       0.26      0.70      0.38       820

    accuracy                           0.69      5941
   macro avg       0.60      0.69      0.59      5941
weighted avg       0.84      0.69      0.73      5941



In [42]:
y_proba_test = best_model.predict_proba(X_test)[:, 1]
y_pred_test = best_model.predict(X_test)
# y_pred_test = (y_proba_test >= best_threshold).astype(int)

print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_test)}")

print("\nClassification Report (Test Set):")
print(classification_report(y_test, y_pred_test))

ROC AUC Score: 0.7453283595169962

Classification Report (Test Set):
              precision    recall  f1-score   support

           0       0.93      0.68      0.78      6401
           1       0.25      0.69      0.37      1025

    accuracy                           0.68      7426
   macro avg       0.59      0.68      0.58      7426
weighted avg       0.84      0.68      0.73      7426



In [43]:
#joblib.dump(best_model, 'cache/sample_weight_model.pkl')