In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, roc_curve, precision_recall_curve, auc,
    classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns

# Load datasets
ontario_fsa = pd.read_csv('ontario_fsa.csv')
farmboy_stores = pd.read_csv('farmboy_stores.csv')

# Extract FSAs from store addresses
farmboy_stores['FSA'] = farmboy_stores['Address'].str.extract(r'([A-Z]\d[A-Z])')
store_fsas = farmboy_stores['FSA'].dropna().unique()

# Create target variable (1 = has store, 0 = no store)
ontario_fsa['has_store'] = ontario_fsa['FSA'].isin(store_fsas).astype(int)

print(f"Total FSAs: {len(ontario_fsa)}")
print(f"Stores: {ontario_fsa['has_store'].sum()}")
print(f"Non-stores: {(ontario_fsa['has_store'] == 0).sum()}")

Total FSAs: 520
Stores: 50
Non-stores: 470


In [2]:
# Check for missing values
print("\nMissing values per column:")
print(ontario_fsa.isnull().sum())

# Check class balance
print(f"\nClass imbalance ratio: {(ontario_fsa['has_store'] == 0).sum() / ontario_fsa['has_store'].sum():.1f}:1")


Missing values per column:
FSA                                             0
LANDAREA                                        0
Bachelor's degree or higher                     3
Car, truck or van - as a driver                 3
Car, truck or van - as a passenger              3
Couple-family households                        3
Employed                                        3
Employee                                        3
Median age of the population                    3
Median total income of household in 2020 ($)    4
Permanent position                              3
Population, 2021                                0
Temporary position                              3
With children                                   3
population_density                              0
has_store                                       0
dtype: int64

Class imbalance ratio: 9.4:1


In [4]:
# Convert raw counts to rates (percentages)
ontario_fsa['education_rate'] = (
    ontario_fsa["Bachelor's degree or higher"] / 
    ontario_fsa['Population, 2021']
) * 100

ontario_fsa['employment_rate'] = (
    ontario_fsa['Employed'] / 
    ontario_fsa['Population, 2021']
) * 100

ontario_fsa['driver_rate'] = (
    ontario_fsa['Car, truck or van - as a driver'] / 
    ontario_fsa['Population, 2021']
) * 100

ontario_fsa['family_rate'] = (
    ontario_fsa['Couple-family households'] / 
    ontario_fsa['Population, 2021']
) * 100

ontario_fsa['children_rate'] = (
    ontario_fsa['With children'] / 
    ontario_fsa['Population, 2021']
) * 100

print("Demographic rates created")

Demographic rates created


In [7]:
# Assign regions based on FSA prefix
def assign_region(fsa):
    """
    M = Toronto
    K1, K2 = Ottawa
    L4-L9 = GTA
    Others = Other Ontario
    """
    if pd.isna(fsa):
        return 'Other Ontario'
    
    first = fsa[0]
    second = fsa[1] if len(fsa) > 1 else ''
    
    if first == 'M':
        return 'Toronto'
    elif first == 'K' and second in ['1', '2']:
        return 'Ottawa'
    elif first == 'L' and second in ['4', '5', '6', '7', '9']:
        return 'GTA'
    else:
        return 'Other Ontario'

ontario_fsa['region'] = ontario_fsa['FSA'].apply(assign_region)

# Create dummy variables (one-hot encoding)
# drop_first=True avoids multicollinearity (reference category = GTA)
region_dummies = pd.get_dummies(ontario_fsa['region'], prefix='region', drop_first=True)
ontario_fsa = pd.concat([ontario_fsa, region_dummies], axis=1)

print("Geographic features created")
print(f"  Regions: {ontario_fsa['region'].unique()}")

Geographic features created
  Regions: <StringArray>
['Other Ontario', 'Ottawa', 'GTA', 'Toronto']
Length: 4, dtype: str


In [8]:
# Select features for model
demographic_features = [
    'education_rate',
    'employment_rate',
    'Median age of the population',
    'Median total income of household in 2020 ($)',
    'population_density',
    'Population, 2021',
    'driver_rate',
    'family_rate',
    'children_rate'
]

geographic_features = [col for col in ontario_fsa.columns if col.startswith('region_')]

# Combine all features
all_features = demographic_features + geographic_features

print(f"\nTotal features: {len(all_features)}")
print(f"  Demographic: {len(demographic_features)}")
print(f"  Geographic: {len(geographic_features)}")


Total features: 15
  Demographic: 9
  Geographic: 6


In [10]:
# Remove rows with missing values
ontario_fsa_clean = ontario_fsa.dropna(subset=all_features + ['has_store'])

print(f"FSAs after removing missing data: {len(ontario_fsa_clean)}")
print(f"  Lost {len(ontario_fsa) - len(ontario_fsa_clean)} FSAs")

# Separate features (X) and target (y)
X = ontario_fsa_clean[all_features]
y = ontario_fsa_clean['has_store']

FSAs after removing missing data: 516
  Lost 4 FSAs


In [12]:
# Split data: 80% training, 20% testing
# stratify=y ensures both sets have same proportion of stores
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,      # 20% for testing
    stratify=y          # Maintain class balance
)

print(f"\nTraining set: {len(X_train)} FSAs")
print(f"  Stores: {y_train.sum()}")
print(f"  Non-stores: {(y_train == 0).sum()}")

print(f"\nTest set: {len(X_test)} FSAs")
print(f"  Stores: {y_test.sum()}")
print(f"  Non-stores: {(y_test == 0).sum()}")


Training set: 412 FSAs
  Stores: 40
  Non-stores: 372

Test set: 104 FSAs
  Stores: 10
  Non-stores: 94


In [14]:
# Standardize features (mean=0, std=1)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures scaled")


Features scaled


In [15]:
# Initialize Logistic Regression
model = LogisticRegression(
    class_weight='balanced',    # Handle class imbalance
    max_iter=1000,              # Ensure convergence
    random_state=42,            # Reproducible results
    solver='lbfgs'              # Good for small-medium datasets
)

# Train (fit) the model
model.fit(X_train_scaled, y_train)

print("Model trained successfully")

Model trained successfully


In [16]:
# 5-fold cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_scores = cross_val_score(
    model, 
    X_train_scaled, 
    y_train, 
    cv=cv,
    scoring='roc_auc'
)

print("\n5-Fold Cross-Validation ROC-AUC Scores:")
for fold, score in enumerate(cv_scores, 1):
    print(f"  Fold {fold}: {score:.3f}")

print(f"\nMean: {cv_scores.mean():.3f}")
print(f"Std:  {cv_scores.std():.3f}")
print(f"95% CI: [{cv_scores.mean() - 2*cv_scores.std():.3f}, {cv_scores.mean() + 2*cv_scores.std():.3f}]")


5-Fold Cross-Validation ROC-AUC Scores:
  Fold 1: 0.850
  Fold 2: 0.820
  Fold 3: 0.792
  Fold 4: 0.828
  Fold 5: 0.799

Mean: 0.818
Std:  0.021
95% CI: [0.776, 0.859]


In [18]:
# Predict probabilities (values between 0 and 1)
y_train_proba = model.predict_proba(X_train_scaled)[:, 1]
y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

# Predict classes (0 or 1, using threshold of 0.5)
y_train_pred = model.predict(X_train_scaled)
y_test_pred = model.predict(X_test_scaled)

print("Predictions generated")

Predictions generated


In [19]:
# Calculate ROC-AUC scores
roc_auc_train = roc_auc_score(y_train, y_train_proba)
roc_auc_test = roc_auc_score(y_test, y_test_proba)

print("\nROC-AUC Scores:")
print(f"  Training:   {roc_auc_train:.3f}")
print(f"  Test:       {roc_auc_test:.3f}")
print(f"  Difference: {abs(roc_auc_train - roc_auc_test):.3f}")

if abs(roc_auc_train - roc_auc_test) > 0.1:
    print("  Large gap suggests overfitting")
else:
    print("  Good generalization")


ROC-AUC Scores:
  Training:   0.853
  Test:       0.683
  Difference: 0.170
  Large gap suggests overfitting


In [23]:
# Create confusion matrix
cm = confusion_matrix(y_test, y_test_pred)

print("\nConfusion Matrix:")
print("                 Predicted No Store    Predicted Store")
print(f"Actual No Store       {cm[0,0]:<18} {cm[0,1]:<15}")
print(f"Actual Store          {cm[1,0]:<18} {cm[1,1]:<15}")

# Calculate metrics
true_neg = cm[0,0]
false_pos = cm[0,1]
false_neg = cm[1,0]
true_pos = cm[1,1]

print(f"\nMetrics:")
print(f"  True Positives (correctly predicted stores): {true_pos}")
print(f"  False Negatives (missed stores): {false_neg}")
print(f"  True Negatives (correctly predicted non-stores): {true_neg}")
print(f"  False Positives (incorrectly predicted stores): {false_pos}")

# Precision and Recall
if (true_pos + false_pos) > 0:
    precision = true_pos / (true_pos + false_pos)
    print(f"\n  Precision: {precision:.3f} (When we predict store, we're right {precision*100:.1f}% of time)")

if (true_pos + false_neg) > 0:
    recall = true_pos / (true_pos + false_neg)
    print(f"  Recall: {recall:.3f} (We correctly identify {recall*100:.1f}% of all stores)")


Confusion Matrix:
                 Predicted No Store    Predicted Store
Actual No Store       76                 18             
Actual Store          5                  5              

Metrics:
  True Positives (correctly predicted stores): 5
  False Negatives (missed stores): 5
  True Negatives (correctly predicted non-stores): 76
  False Positives (incorrectly predicted stores): 18

  Precision: 0.217 (When we predict store, we're right 21.7% of time)
  Recall: 0.500 (We correctly identify 50.0% of all stores)


In [25]:
# Scale all FSAs
all_fsas_scaled = scaler.transform(ontario_fsa_clean[all_features])

# Predict probabilities for all FSAs
ontario_fsa_clean['store_probability'] = model.predict_proba(all_fsas_scaled)[:, 1]

print("All FSAs scored")
print(f"\nProbability distribution:")
print(ontario_fsa_clean['store_probability'].describe())

All FSAs scored

Probability distribution:
count    516.000000
mean       0.346445
std        0.256805
min        0.008261
25%        0.154103
50%        0.245526
75%        0.526308
max        0.987595
Name: store_probability, dtype: float64


In [26]:
# Find FSAs without stores but high probability
high_potential = ontario_fsa_clean[
    (ontario_fsa_clean['has_store'] == 0) &  # No store currently
    (ontario_fsa_clean['store_probability'] > 0.5)  # High probability
].copy()

# Sort by probability
high_potential = high_potential.sort_values('store_probability', ascending=False)

print(f"\nHigh-potential FSAs (probability > 0.5): {len(high_potential)}")

if len(high_potential) > 0:
    print("\nTop 15 Expansion Opportunities:")
    display_cols = [
        'FSA', 'region', 'Population, 2021', 'education_rate',
        'Median age of the population', 'store_probability'
    ]
    print(high_potential[display_cols].head(15).to_string(index=False))
    
    # Business metrics
    total_pop = high_potential['Population, 2021'].sum()
    avg_prob = high_potential['store_probability'].mean()
    
    print(f"\nAggregate Metrics:")
    print(f"  Total population: {total_pop:,.0f}")
    print(f"  Average probability: {avg_prob:.3f}")
    print(f"  Estimated revenue: ${total_pop * 1500 / 1_000_000:.0f}M - ${total_pop * 2500 / 1_000_000:.0f}M")


High-potential FSAs (probability > 0.5): 99

Top 15 Expansion Opportunities:
FSA        region  Population, 2021  education_rate  Median age of the population  store_probability
K1P        Ottawa             645.0       58.139535                          32.4           0.987595
K1S        Ottawa           31257.0       40.087021                          39.2           0.980855
K1Y        Ottawa           20712.0       41.473542                          40.0           0.980180
K1R        Ottawa           20343.0       40.013764                          36.0           0.972960
K1M        Ottawa            6764.0       37.403903                          50.0           0.926243
K1V        Ottawa           57157.0       23.234250                          38.0           0.916323
K2A        Ottawa           17156.0       31.796456                          47.2           0.913998
L5M           GTA          106468.0       31.910997                          39.6           0.903521
L6M          

In [27]:
# Categorize FSAs by probability
ontario_fsa_clean['priority'] = pd.cut(
    ontario_fsa_clean['store_probability'],
    bins=[0, 0.3, 0.5, 0.7, 1.0],
    labels=['LOW', 'MEDIUM', 'HIGH', 'VERY HIGH']
)

# Summary by priority
print("\nFSAs by Priority (excluding current stores):")
priority_summary = ontario_fsa_clean[
    ontario_fsa_clean['has_store'] == 0
].groupby('priority').agg({
    'FSA': 'count',
    'Population, 2021': 'sum'
})

print(priority_summary)


FSAs by Priority (excluding current stores):
           FSA  Population, 2021
priority                        
LOW        288         6230260.0
MEDIUM      79         2386481.0
HIGH        45         1769780.0
VERY HIGH   54         2077647.0


In [28]:
# Save scored FSAs
ontario_fsa_clean.to_csv('fsa_scores_logistic_regression.csv', index=False)
print("\nSaved: fsa_scores_logistic_regression.csv")

# Save high-potential FSAs
if len(high_potential) > 0:
    high_potential.to_csv('high_potential_fsas.csv', index=False)
    print("Saved: high_potential_fsas.csv")

# Save model
import pickle
with open('logistic_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("Saved: logistic_model.pkl and scaler.pkl")


Saved: fsa_scores_logistic_regression.csv
Saved: high_potential_fsas.csv
Saved: logistic_model.pkl and scaler.pkl
