Install Libraries

In [1]:
pip install pandas seaborn matplotlib scikit-learn numpy xgboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline

In [3]:
# Load the datasets
df = pd.read_csv('./data/treated/biggerAugmentedData.csv')

In [4]:
# Check for missing values
print(df.isnull().sum())

Unnamed: 0                 0
AGE                        0
GENDER                     0
VETERAN                    0
INCOME                     0
NIGHTS                     0
substanceabuse             0
completed                  0
probation                  0
assistancetype             0
required                   0
AT_RISK_OF_HOMELESSNESS    0
INCOME_PER_NIGHT           0
AGE_GROUP                  0
dtype: int64


In [5]:
# Replace 'inf' values with NaN to handle them as missing values
df.replace([np.inf, -np.inf], np.nan, inplace=True)

df['VETERAN'] = df['VETERAN'].apply(lambda x: 1 if x == 'Yes' else 0)
df['GENDER'] = df['GENDER'].apply(lambda x: 1 if x == 'Female' else 0)

assistancetype_mapping = {
    'noassistance': 0,
    'tempassistance': 1,
    'permassistance': 2,
}

# Use the map function to apply the mapping to the AGE_GROUP column
df['assistancetype'] = df['assistancetype'].map(assistancetype_mapping)

age_group_mapping = {
    '18-25': 0,
    '26-40': 1,
    '41-55': 2,
    '56-70': 3
}

# Use the map function to apply the mapping to the AGE_GROUP column
df['AGE_GROUP'] = df['AGE_GROUP'].map(age_group_mapping)

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,AGE,GENDER,VETERAN,INCOME,NIGHTS,substanceabuse,completed,probation,assistancetype,required,AT_RISK_OF_HOMELESSNESS,INCOME_PER_NIGHT,AGE_GROUP
0,0,56,1,0,47500,208,1,0,0,0,1,0,228.37,3
1,1,69,0,0,12500,254,0,1,1,1,0,0,49.21,3
2,2,46,1,0,40000,221,0,1,1,2,0,0,181.0,2
3,3,32,0,0,0,262,1,0,0,0,0,0,0.0,1
4,4,60,1,0,30000,331,0,1,0,1,1,0,90.63,3


In [7]:
# Randomize the dataset
df = df.sample(frac=1).reset_index(drop=True)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Assuming 'df' is your DataFrame
X = df.drop(['AT_RISK_OF_HOMELESSNESS'], axis=1)
y = df['AT_RISK_OF_HOMELESSNESS']

# Identifying categorical and numeric features
categorical_features = []
numeric_features = [col for col in X.columns if col not in categorical_features]

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [8]:
X.head()

NameError: name 'X' is not defined

Simple Logistic Regression

In [60]:
# Combine preprocessor and model into a single pipeline
model_pipeline = make_pipeline(preprocessor, LogisticRegression(max_iter=1000, random_state=42))

# Set up k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation scores
cv_scores = cross_val_score(model_pipeline, X, y, cv=kf, scoring='accuracy')

print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average CV Score: {np.mean(cv_scores):.4f}")

Cross-Validation Scores: [0.865 0.861 0.849 0.857 0.86 ]
Average CV Score: 0.8584


In [12]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.base import clone
import numpy as np

def evaluate_model(model_pipeline, X, y, cv=5):
    """
    Evaluates a model pipeline using cross-validation and prints out average ROC-AUC, F1, Precision, and Recall scores.

    Parameters:
    - model_pipeline: The modeling pipeline that ends with a classifier.
    - X: Feature matrix.
    - y: Target vector.
    - cv: Number of cross-validation folds.

    Returns:
    - A dictionary with average scores for ROC-AUC, F1, Precision, and Recall.
    """
    
    # Initialize KFold and lists to store metrics
    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    auc_scores = []
    f1_scores = []
    precision_scores = []
    recall_scores = []

    # Perform cross-validation
    for train_index, test_index in kf.split(X):
        # Clone the model pipeline for a fresh model each fold
        clone_model = clone(model_pipeline)
        
        # Split the data
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        
        # Fit the model
        clone_model.fit(X_train_fold, y_train_fold)
        
        # Make predictions
        y_pred_fold = clone_model.predict(X_test_fold)
        y_pred_proba_fold = clone_model.predict_proba(X_test_fold)[:, 1]
        
        # Calculate and store the metrics
        auc_scores.append(roc_auc_score(y_test_fold, y_pred_proba_fold))
        f1_scores.append(f1_score(y_test_fold, y_pred_fold))
        precision_scores.append(precision_score(y_test_fold, y_pred_fold))
        recall_scores.append(recall_score(y_test_fold, y_pred_fold))

    # Calculate average scores
    avg_scores = {
        'ROC-AUC': np.mean(auc_scores),
        'F1': np.mean(f1_scores),
        'Precision': np.mean(precision_scores),
        'Recall': np.mean(recall_scores)
    }
    
    # Optionally, print the average scores
    for score_name, score_value in avg_scores.items():
        print(f"Average {score_name} Score: {score_value:.4f}")
    
    return avg_scores

# # Example of using the function with a logistic regression pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler

# # Sample pipeline for demonstration
# sample_pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# # Invoke the evaluation function
# # Ensure X and y are defined and hold your features and target variable, respectively
# scores = evaluate_model(sample_pipeline, X, y, cv=5)


Optimized Logistic Regression:

In [13]:
# Create a complete pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LogisticRegression(max_iter=100))])

# Define a grid of parameters to search over
param_grid = {
    'classifier__C': [40, 100, 500],
    'classifier__solver': ['liblinear', 'lbfgs']
}

# Set up the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

best_lr_model = grid_search.best_estimator_
evaluate_model(best_lr_model, X, y, cv=5)

# Make predictions with the best found parameters
y_pred = grid_search.predict(X_test)

Best parameters: {'classifier__C': 100, 'classifier__solver': 'lbfgs'}
Best CV score: 0.85775
Average ROC-AUC Score: 0.6532
Average F1 Score: 0.3187
Average Precision Score: 0.5253
Average Recall Score: 0.2322


In [21]:
# Assuming 'best_pipeline' is your fitted pipeline object that includes the logistic regression model
classifier = best_lr_model.named_steps['classifier']  # Access the logistic regression step

# Assuming 'X' is your feature matrix DataFrame used in model fitting
# Make sure 'X' reflects the preprocessed data structure that the model was trained on
feature_names = X.columns  # Feature names as they appear after preprocessing, before fitting the model

# Coefficients from the logistic regression model within the pipeline
coefficients = classifier.coef_[0]

# Create DataFrame for better visualization
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
}).sort_values(by='Coefficient', key=abs, ascending=False)  # Sorting by absolute value

print(feature_importances)

             Feature  Coefficient
3            VETERAN     0.662204
5             NIGHTS    -0.102441
11  INCOME_PER_NIGHT    -0.071547
8          probation     0.045928
7          completed     0.039479
6     substanceabuse    -0.033489
9     assistancetype     0.016226
4             INCOME    -0.011176
2             GENDER     0.009954
12         AGE_GROUP    -0.008306
1                AGE     0.005221
0         Unnamed: 0     0.004083
10          required    -0.001873


Support Vector Machine (SVM):

In [23]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Adjusting the pipeline to enable probability estimation in SVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(probability=True))  # Enable probability estimation
])

param_grid = {
    'svc__C': [0.007, 0.01, 0.1],  # Regularization strength
    'svc__kernel': ['linear', 'rbf']  # Kernel type
}

# Set up the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Splitting the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

best_svm_model = grid_search.best_estimator_
evaluate_model(best_svm_model, X, y, cv=5)

Best parameters: {'svc__C': 0.007, 'svc__kernel': 'linear'}
Best CV score: 0.86625


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Average ROC-AUC Score: 0.6474
Average F1 Score: 0.3319
Average Precision Score: 0.3979
Average Recall Score: 0.2849


{'ROC-AUC': 0.6473618312401597,
 'F1': 0.3319292350645452,
 'Precision': 0.3979488765203051,
 'Recall': 0.28486555532010077}

In [25]:
# Assuming linear kernel and you want to extract coefficients
best_svm_model = grid_search.best_estimator_['svc']
coefs = best_svm_model.coef_.flatten()  # Flatten if it's multi-class

# Feature names (assuming all numeric or you've manually encoded categories)
feature_names = X.columns

# Creating a DataFrame for easier interpretation
svm_feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefs
}).sort_values(by='Coefficient', key=abs, ascending=False)

print(svm_feature_importance)

             Feature  Coefficient
12         AGE_GROUP    -0.000333
1                AGE     0.000312
3            VETERAN     0.000123
5             NIGHTS    -0.000039
8          probation    -0.000031
11  INCOME_PER_NIGHT     0.000026
2             GENDER    -0.000019
10          required     0.000017
7          completed    -0.000017
0         Unnamed: 0    -0.000013
6     substanceabuse     0.000012
9     assistancetype     0.000009
4             INCOME    -0.000003


Random Forests:

In [96]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30]  # Maximum depth of the trees
}

grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

best_rf_model = grid_search.best_estimator_
evaluate_model(best_rf_model, X, y, cv=5)


Best parameters: {'max_depth': 30, 'n_estimators': 100}
Best CV score: 0.8638
Average ROC-AUC Score: 0.6501
Average F1 Score: 0.2456
Average Precision Score: 0.4963
Average Recall Score: 0.1633


{'ROC-AUC': 0.6500533730861002,
 'F1': 0.2456443636832888,
 'Precision': 0.49632194033356825,
 'Recall': 0.16327194171520978}

Gradient Boosting Machines (e.g., XGBoost):

In [95]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],  # Number of gradient boosted trees
    'max_depth': [3, 6, 9],  # Maximum tree depth
    'learning_rate': [0.01, 0.1]  # Learning rate
}

grid_search = GridSearchCV(xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
                           param_grid, cv=5, scoring='accuracy')
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best CV score:", grid_search.best_score_)

best_xgb_model = grid_search.best_estimator_
evaluate_model(best_xgb_model, X, y, cv=5)

Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200}
Best CV score: 0.8642
Average ROC-AUC Score: 0.6655
Average F1 Score: 0.2141
Average Precision Score: 0.5298
Average Recall Score: 0.1381


{'ROC-AUC': 0.6655078691059779,
 'F1': 0.2141326368554933,
 'Precision': 0.5298465423465424,
 'Recall': 0.1380865236292445}

In [None]:
# Assuming you have a fitted Random Forest or XGBoost model
best_tree_model = grid_search.best_estimator_.named_steps['classifier']

# Get feature names - For tree models, this might directly correspond to columns if no one-hot encoding transformation is needed
feature_names = X.columns  # Adjust if you have transformed features

# Feature importances
importances = best_tree_model.feature_importances_

# Map importances to features
feature_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Sort by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(feature_importance)